Commit c7c9bcea authored by Hashim Sharif

CPU version of visc-rt - DSOC

parent 6f5ea74d
add_custom_target(visc-cpu-rt ALL)
add_custom_command(
TARGET visc-cpu-rt PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/deviceStatusSwitchIntervals.txt
${CMAKE_CURRENT_BINARY_DIR}/deviceStatusSwitchIntervals.txt
DEPENDS deviceStatusSwitchIntervals.txt
COMMENT "Copying deviceStatusSwitchIntervals.txt")
add_custom_command(
TARGET visc-cpu-rt PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/device_abstraction.h
${CMAKE_CURRENT_BINARY_DIR}/device_abstraction.h
DEPENDS device_abstraction.h
COMMENT "Copying device_abstraction.h")
add_custom_command(
TARGET visc-cpu-rt PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/policy.h
${CMAKE_CURRENT_BINARY_DIR}/policy.h
DEPENDS policy.h
COMMENT "Copying policy.h")
add_custom_command(
TARGET visc-cpu-rt PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.h
${CMAKE_CURRENT_BINARY_DIR}/visc-rt.h
DEPENDS visc-rt.h
COMMENT "Copying visc-rt.h")
add_custom_command(
TARGET visc-cpu-rt PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.cpp
${CMAKE_CURRENT_BINARY_DIR}/visc-rt.cpp
DEPENDS visc-rt.cpp
COMMENT "Copying visc-rt.cpp")
add_custom_command(
TARGET visc-cpu-rt PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/makefile
${CMAKE_CURRENT_BINARY_DIR}/makefile
DEPENDS makefile
COMMENT "Copying makefile")
10
10 15 10 16 15 30 15 25 20 15
#ifndef __DEVICE_ABSTRACTION__
#define __DEVICE_ABSTRACTION__
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <chrono>
#include <thread>
#include <vector>
#include <iostream>
#include <fstream>
#define MIN_INTERVAL 2
#define MAX_INTERVAL 8
#define NUM_INTERVALS 10
// Device status variable: true if the device is available for use
volatile bool deviceStatus = true;
// Intervals at which to change the device status
std::vector<unsigned> Intervals;
// Set to true when program execution ends, so that the device simulation
// thread can exit
volatile bool executionEnd = false;
void initializeDeviceStatusIntervals() {
unsigned sz = 0;
unsigned tmp = 0;
const char *fn =
"/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/deviceStatusSwitchIntervals.txt";
std::ifstream infile;
infile.open(fn);
if (!infile.is_open()) {
std::cout << "Failed to open " << fn << " for reading\n";
return;
}
infile >> sz;
if (sz) {
// We have data. Read them into the vector
for (unsigned i = 0; i < sz; i++) {
infile >> tmp;
Intervals.push_back(tmp);
}
infile.close();
} else {
// No data in the file. Generate random intervals and write them to the file
infile.close();
std::ofstream outfile;
outfile.open(fn);
if (!outfile.is_open()) {
std::cout << "Failed to open " << fn << " for writing\n";
return;
}
sz = 1 + rand()%NUM_INTERVALS;
// Write whitespace-separated values so initializeDeviceStatusIntervals can
// read them back
outfile << sz << "\n";
for (unsigned i = 0; i < sz; i++) {
Intervals.push_back(MIN_INTERVAL + rand()%(MAX_INTERVAL - MIN_INTERVAL));
outfile << Intervals[i] << " ";
}
outfile.close();
}
return;
}
void updateDeviceStatus() {
unsigned i = 0;
// Guard against an empty interval list, e.g. if the intervals file could
// not be read
if (Intervals.empty())
return;
while (!executionEnd) {
std::this_thread::sleep_for (std::chrono::seconds(Intervals[i]));
deviceStatus = !deviceStatus;
std::cout << "Changed device status to " << deviceStatus << "\n";
i = (i+1) % Intervals.size();
}
}
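// A minimal usage sketch (assumed, not part of the original header): run the
// simulation on its own thread and signal executionEnd before joining.
//
//   initializeDeviceStatusIntervals();
//   std::thread statusThread(updateDeviceStatus);
//   /* ... run the application ... */
//   executionEnd = true;
//   statusThread.join();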
#endif // __DEVICE_ABSTRACTION__
LLVM_SRC_ROOT = ../../../llvm
LLVM_BUILD_ROOT = ../..
OPENCL_INC_PATH = /opt/intel/opencl-sdk/include
ifeq ($(NUM_CORES),)
NUM_CORES=8
endif
CPP_FLAGS = -I $(LLVM_SRC_ROOT)/include -I $(LLVM_BUILD_ROOT)/include -I $(OPENCL_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
TARGET:=visc-rt
#LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang
#LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++
LLVM_CC=clang-4.0
LLVM_CXX=clang++-4.0
OPTS =
ifeq ($(DEBUG),1)
OPTS+=-DDEBUG_BUILD
endif
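# Example invocation (assumed usage): build visc-rt.ll with 4 cores and debug
# output enabled:
#   make NUM_CORES=4 DEBUG=1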
all: $(TARGET:%=%.ll)
$(TARGET:%=%.ll):%.ll:%.cpp %.h
$(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@
clean :
rm -f $(TARGET).ll
#ifndef __POLICY__
#define __POLICY__
#include <string>
#include "device_abstraction.h"
/************************* Policies *************************************/
class Policy {
public:
virtual int getVersion(const char *, int64_t) = 0;
virtual ~Policy() {};
};
class NodePolicy : public Policy {
virtual int getVersion(const char *name, int64_t it) override {
std::string s(name);
//std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
std::string NodeNames[] = {
"WrapperGaussianSmoothing_cloned",
"WrapperlaplacianEstimate_cloned",
"WrapperComputeZeroCrossings_cloned",
"WrapperComputeGradient_cloned",
"WrapperComputeMaxGradient_cloned",
"WrapperRejectZeroCrossings_cloned",
};
//if (!s.compare(NodeNames[4])) {
// std::cout << s << ": CPU" << "\n";
// return 0;
//}
return 2;
}
};
class IterationPolicy : public Policy {
virtual int getVersion(const char *name, int64_t it) override {
if ((it % 10 == 0) || (it % 10 == 1))
return 0;
else
return 2;
}
};
class DeviceStatusPolicy : public Policy {
virtual int getVersion(const char *name, int64_t it) override {
if (deviceStatus) {
//std::cout << "Returning GPU\n";
return 2;
}
else {
//std::cout << "Returning CPU\n";
return 0;
}
}
};
/* ------------------------------------------------------------------------- */
// Added for the CFAR interactive policy demo.
class InteractivePolicy : public Policy {
private:
// 0 for CPU, 1 for GPU, 2 for Vector
unsigned int userTargetDeviceChoice;
// Used to end thread execution
bool end;
// Thread that will update userTargetDeviceChoice
//std::thread userTargetDeviceChoiceThread;
// Thread function
void updateUserTargetChoice() {
while (!end) {
std::cout << "Select target device (0 for CPU, 1 fpr GPU): ";
std::cin >> userTargetDeviceChoice;
if (userTargetDeviceChoice > 1) {
std::cout << "Invalid target device. Selecting GPU instead.\n";
userTargetDeviceChoice = 1;
}
}
}
public:
// Inherited method, required for every policy object
virtual int getVersion(const char *name, int64_t it) override {
return userTargetDeviceChoice;
}
InteractivePolicy() {
userTargetDeviceChoice = 1;
end = false;
//userTargetDeviceChoiceThread =
// std::thread(&InteractivePolicy::updateUserTargetChoice, this);
}
~InteractivePolicy() {
end = true;
//userTargetDeviceChoiceThread.join();
}
};
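// A minimal sketch (assumed) of how the runtime consumes a policy (see
// llvm_visc_policy_init/llvm_visc_policy_getVersion in visc-rt.cpp); the
// node name and iteration count are hypothetical.
//
//   Policy *p = new IterationPolicy();
//   // The returned index selects the node version to execute (e.g. 0 = CPU).
//   int version = p->getVersion("WrapperComputeGradient_cloned", 42);
//   delete p;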
#endif // __POLICY__
#include <iostream>
#include <string>
#include <pthread.h>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cassert>
#include <map>
#include <CL/cl.h>
#include <unistd.h>
#if _POSIX_VERSION >= 200112L
# include <sys/time.h>
#endif
#include "visc-rt.h"
#ifndef DEBUG_BUILD
#define DEBUG(s) {}
#else
#define DEBUG(s) s
#endif
#define BILLION 1000000000LL
using namespace std;
typedef struct {
pthread_t threadID;
std::vector<pthread_t>* threads;
// Map from InputPort to Size
std::map<unsigned, uint64_t>* ArgInPortSizeMap;
//std::vector<uint64_t>* BindInSizes;
std::vector<unsigned>* BindInSourcePort;
std::vector<uint64_t>* BindOutSizes;
std::vector<uint64_t>* EdgeSizes;
std::vector<CircularBuffer<uint64_t>*>* BindInputBuffers;
std::vector<CircularBuffer<uint64_t>*>* BindOutputBuffers;
std::vector<CircularBuffer<uint64_t>*>* EdgeBuffers;
std::vector<CircularBuffer<uint64_t>*>* isLastInputBuffers;
} DFNodeContext_X86;
typedef struct {
cl_context clOCLContext;
cl_command_queue clCommandQue;
cl_program clProgram;
cl_kernel clKernel;
} DFNodeContext_OCL;
cl_context globalOCLContext;
cl_device_id* clDevices;
cl_command_queue globalCommandQue;
Policy *policy = NULL;
MemTracker MTracker;
vector<DFGDepth> DStack;
// Mutex to prevent concurrent access by multiple threads in the pipeline
pthread_mutex_t ocl_mtx;
#define NUM_TESTS 1
visc_TimerSet kernel_timer;
static inline void checkErr(cl_int err, cl_int success, const char * name) {
if (err != success) {
cout << "ERROR: " << name << flush << "\n";
cout << "ErrorCode: " << err << flush << "\n";
exit(EXIT_FAILURE);
}
}
/************************* Policies *************************************/
void llvm_visc_policy_init() {
cout << "Initializing policy object ...\n";
// policy = new NodePolicy();
// policy = new IterationPolicy();
// policy = new DeviceStatusPolicy();
policy = new InteractivePolicy();
cout << "DONE: Initializing policy object.\n";
}
void llvm_visc_policy_clear() {
if (policy) {
delete policy; // allocated with new, so delete (not free) is required
policy = NULL;
}
}
int llvm_visc_policy_getVersion(const char *name, int64_t i) {
return policy->getVersion(name, i);
}
/******************** Device Abstraction ********************************/
//std::thread deviceStatusThread;
/*
void llvm_visc_deviceAbstraction_start() {
cout << "Starting device status simulation ...\n";
// Initialize vector with points where to switch device status
initializeDeviceStatusIntervals();
// Create a thread that performs the changes
deviceStatusThread = std::thread(updateDeviceStatus);
cout << "Started device status simulation thread ...\n";
return;
}
*/
void llvm_visc_deviceAbstraction_end() {
cout << "Ending device status simulation thread ...\n";
// Set the variable that allows the thread to know that execution has ended
executionEnd = true;
// Wait for the thread that manages device status to terminate
//deviceStatusThread.join();
cout << "Ended device status simulation.\n";
return;
}
void llvm_visc_deviceAbstraction_waitOnDeviceStatus() {
while (!deviceStatus) { };
return;
}
/************************* Depth Stack Routines ***************************/
void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY,
uint64_t iY, uint64_t limitZ, uint64_t iZ) {
//DEBUG(cout << "Pushing node information on stack:\n");
//DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY << ", "<< limitZ <<")\n");
//DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", "<< iZ <<")\n");
//DFGDepth nodeInfo (n, limitX, iX, limitY, iY, limitZ, iZ);
//DStack.push_back(nodeInfo);
//DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
}
void llvm_visc_x86_dstack_pop() {
//DEBUG(cout << "Popping from depth stack\n");
//DStack.pop_back();
//DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
}
uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
//DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n");
//unsigned size = DStack.size();
//DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n");
//return DStack[size-level-1].getDimLimit(dim);
return 0;
}
uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
//DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n");
//unsigned size = DStack.size();
//DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n");
//return DStack[size-level-1].getDimInstance(dim);
return 0;
}
/********************** Memory Tracking Routines **************************/
void llvm_visc_track_mem(void* ptr, size_t size) {
DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
MemTrackerEntry* MTE = MTracker.lookup(ptr);
if(MTE != NULL) {
DEBUG(cout << "ID " << ptr << " already present in the MemTracker Table\n");
return;
}
DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n");
MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr);
DEBUG(MTracker.print());
}
void llvm_visc_untrack_mem(void* ptr) {
DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
MemTrackerEntry* MTE = MTracker.lookup(ptr);
if(MTE == NULL) {
cout << "WARNING: Trying to remove ID " << ptr << " not present in the MemTracker Table\n";
return;
}
DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n");
if(MTE->getLocation() == MemTrackerEntry::DEVICE)
clReleaseMemObject((cl_mem) MTE->getAddress());
MTracker.remove(ptr);
DEBUG(MTracker.print());
}
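// A minimal usage sketch (assumed): the host registers a buffer before
// handing it to the runtime, requests it back before reading, and
// unregisters it when done. `data` and `N` are hypothetical.
//
//   float *data = (float *) malloc(N * sizeof(float));
//   llvm_visc_track_mem(data, N * sizeof(float));
//   /* ... kernels may migrate the buffer to the device ... */
//   float *result = (float *) llvm_visc_request_mem(data, N * sizeof(float));
//   llvm_visc_untrack_mem(data);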
static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clOCLContext << flush << "\n");
MemTrackerEntry* MTE = MTracker.lookup(ptr);
if (MTE == NULL) {
MTracker.print();
cout << "ERROR: Requesting memory not present in Table\n";
exit(EXIT_FAILURE);
}
// If already on device
if (MTE->getLocation() == MemTrackerEntry::DEVICE &&
((DFNodeContext_OCL*)MTE->getContext())->clOCLContext == Context->clOCLContext) {
DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n");
pthread_mutex_unlock(&ocl_mtx);
return MTE->getAddress();
}
DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n");
DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n");
// Else copy and update the latest copy
cl_mem_flags clFlags;
cl_int errcode;
if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE;
else if(isInput) clFlags = CL_MEM_READ_ONLY;
else if(isOutput) clFlags = CL_MEM_WRITE_ONLY;
else clFlags = CL_MEM_READ_ONLY;
visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
//pthread_mutex_lock(&ocl_mtx);
cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
DEBUG(cout<< "\nMemory allocated on device: " << d_input << flush << "\n");
if(isInput) {
DEBUG(cout << "\tCopying ...");
//pthread_mutex_lock(&ocl_mtx);
errcode = clEnqueueWriteBuffer(Context->clCommandQue,
d_input,
CL_TRUE,
0,
size,MTE->getAddress(),
0,NULL,NULL);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device");
}
visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
DEBUG(cout << " done\n");
MTE->update(MemTrackerEntry::DEVICE, (void*) d_input, Context);
DEBUG(cout << "Updated Table\n");
DEBUG(MTracker.print());
pthread_mutex_unlock(&ocl_mtx);
return d_input;
}
void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) {
return llvm_visc_request_mem(ptr, size);
}
void* llvm_visc_request_mem(void* ptr, size_t size) {
// Ignore objects whose size is 0 - no memory is requested.
if (size == 0) {
DEBUG(cout << "[X86] Request memory (ignored): " << ptr << flush << "\n");
return ptr;
}
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n");
MemTrackerEntry* MTE = MTracker.lookup(ptr);
if(MTE == NULL) {
cout << "ERROR: Requesting memory not present in Table\n";
pthread_mutex_unlock(&ocl_mtx);
exit(EXIT_FAILURE);
}
// If already on host
if(MTE->getLocation() == MemTrackerEntry::HOST) {
DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n");
pthread_mutex_unlock(&ocl_mtx);
return MTE->getAddress();
}
// Else copy from device and update table
DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n");
DEBUG(cout << "\tCopying ...");
visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
//pthread_mutex_lock(&ocl_mtx);
cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue,
(cl_mem) MTE->getAddress(),
CL_TRUE,
0,
size,
ptr,
0, NULL, NULL);
//pthread_mutex_unlock(&ocl_mtx);
visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
DEBUG(cout << " done\n");
checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output");
DEBUG(cout << "Free mem object on device\n");
clReleaseMemObject((cl_mem) MTE->getAddress());
DEBUG(cout << "Updated Table\n");
MTE->update(MemTrackerEntry::HOST, ptr);
DEBUG(MTracker.print());
pthread_mutex_unlock(&ocl_mtx);
return ptr;
}
/*************************** Timer Routines **********************************/
static int is_async(enum visc_TimerID timer)
{
return (timer == visc_TimerID_KERNEL) ||
(timer == visc_TimerID_COPY_ASYNC);
}
static int is_blocking(enum visc_TimerID timer)
{
return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE);
}
#define INVALID_TIMERID visc_TimerID_LAST
static int asyncs_outstanding(struct visc_TimerSet* timers)
{
return (timers->async_markers != NULL) &&
(timers->async_markers->timerID != INVALID_TIMERID);
}
static struct visc_async_time_marker_list *
get_last_async(struct visc_TimerSet* timers)
{
/* Find the last event recorded thus far */
struct visc_async_time_marker_list * last_event = timers->async_markers;
if(last_event != NULL && last_event->timerID != INVALID_TIMERID) {
while(last_event->next != NULL &&
last_event->next->timerID != INVALID_TIMERID)
last_event = last_event->next;
return last_event;
} else
return NULL;
}
static void insert_marker(struct visc_TimerSet* tset, enum visc_TimerID timer)
{
cl_int ciErrNum = CL_SUCCESS;
struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
new_event = &((*new_event)->next);
}
if(*new_event == NULL) {
*new_event = (struct visc_async_time_marker_list *)
malloc(sizeof(struct visc_async_time_marker_list));
(*new_event)->marker = calloc(1, sizeof(cl_event));
/*
// I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event
#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
fprintf(stderr, "Creating Marker [%d]\n", timer);
*((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Creating User Event Object!\n");
}
ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Setting User Event Status!\n");
}
#endif
*/
(*new_event)->next = NULL;
}
/* valid event handle now acquired: insert the event record */
(*new_event)->label = NULL;
(*new_event)->timerID = timer;
//pthread_mutex_lock(&ocl_mtx);
ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
//pthread_mutex_unlock(&ocl_mtx);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Enqueueing Marker!\n");
}
}
static void insert_submarker(struct visc_TimerSet* tset, char *label, enum visc_TimerID timer)
{
cl_int ciErrNum = CL_SUCCESS;
struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
new_event = &((*new_event)->next);
}
if(*new_event == NULL) {
*new_event = (struct visc_async_time_marker_list *)
malloc(sizeof(struct visc_async_time_marker_list));
(*new_event)->marker = calloc(1, sizeof(cl_event));
/*
#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
*((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Creating User Event Object!\n");
}
ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Setting User Event Status!\n");
}
#endif
*/
(*new_event)->next = NULL;
}
/* valid event handle now acquired: insert the event record */
(*new_event)->label = label;
(*new_event)->timerID = timer;
//pthread_mutex_lock(&ocl_mtx);
ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
//pthread_mutex_unlock(&ocl_mtx);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Enqueueing Marker!\n");
}
}
/* Assumes that all recorded events have completed */
static visc_Timestamp record_async_times(struct visc_TimerSet* tset)
{
struct visc_async_time_marker_list * next_interval = NULL;
struct visc_async_time_marker_list * last_marker = get_last_async(tset);
visc_Timestamp total_async_time = 0;
for(next_interval = tset->async_markers; next_interval != last_marker;
next_interval = next_interval->next) {
cl_ulong command_start=0, command_end=0;
cl_int ciErrNum = CL_SUCCESS;
ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum);
}
ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum);
}
visc_Timestamp interval = (visc_Timestamp) (((double)(command_end - command_start)));
tset->timers[next_interval->timerID].elapsed += interval;
if (next_interval->label != NULL) {
struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list;
while (subtimer != NULL) {
if ( strcmp(subtimer->label, next_interval->label) == 0) {
subtimer->timer.elapsed += interval;
break;
}
subtimer = subtimer->next;
}
}
total_async_time += interval;
next_interval->timerID = INVALID_TIMERID;
}
if(next_interval != NULL)
next_interval->timerID = INVALID_TIMERID;
return total_async_time;
}
static void
accumulate_time(visc_Timestamp *accum,
visc_Timestamp start,
visc_Timestamp end)
{
#if _POSIX_VERSION >= 200112L
*accum += end - start;
#else
# error "Timestamps not implemented for this system"
#endif
}
#if _POSIX_VERSION >= 200112L
static visc_Timestamp get_time()
{
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC, &tv);
return (visc_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec);
}
#else
# error "no supported time libraries are available on this platform"
#endif
void
visc_ResetTimer(struct visc_Timer *timer)
{
timer->state = visc_Timer_STOPPED;
#if _POSIX_VERSION >= 200112L
timer->elapsed = 0;
#else
# error "visc_ResetTimer: not implemented for this system"
#endif
}
void
visc_StartTimer(struct visc_Timer *timer)
{
if (timer->state != visc_Timer_STOPPED) {
// FIXME: Removing warning statement to avoid printing this error
// fputs("Ignoring attempt to start a running timer\n", stderr);
return;
}
timer->state = visc_Timer_RUNNING;
#if _POSIX_VERSION >= 200112L
{
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC, &tv);
timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
}
#else
# error "visc_StartTimer: not implemented for this system"
#endif
}
void
visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer)
{
unsigned int numNotStopped = 0x3; // 11
if (timer->state != visc_Timer_STOPPED) {
fputs("Warning: Timer was not stopped\n", stderr);
numNotStopped &= 0x1; // Zero out 2^1
}
if (subtimer->state != visc_Timer_STOPPED) {
fputs("Warning: Subtimer was not stopped\n", stderr);
numNotStopped &= 0x2; // Zero out 2^0
}
if (numNotStopped == 0x0) {
//fputs("Ignoring attempt to start running timer and subtimer\n", stderr);
return;
}
timer->state = visc_Timer_RUNNING;
subtimer->state = visc_Timer_RUNNING;
#if _POSIX_VERSION >= 200112L
{
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC, &tv);
if (numNotStopped & 0x2) {
timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
}
if (numNotStopped & 0x1) {
subtimer->init = tv.tv_sec * BILLION + tv.tv_nsec;
}
}
#else
# error "visc_StartTimer: not implemented for this system"
#endif
}
void
visc_StopTimer(struct visc_Timer *timer)
{
visc_Timestamp fini;
if (timer->state != visc_Timer_RUNNING) {
//fputs("Ignoring attempt to stop a stopped timer\n", stderr);
return;
}
timer->state = visc_Timer_STOPPED;
#if _POSIX_VERSION >= 200112L
{
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC, &tv);
fini = tv.tv_sec * BILLION + tv.tv_nsec;
}
#else
# error "visc_StopTimer: not implemented for this system"
#endif
accumulate_time(&timer->elapsed, timer->init, fini);
timer->init = fini;
}
void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) {
visc_Timestamp fini;
unsigned int numNotRunning = 0x3; // 11
if (timer->state != visc_Timer_RUNNING) {
fputs("Warning: Timer was not running\n", stderr);
numNotRunning &= 0x1; // Zero out 2^1
}
if (subtimer->state != visc_Timer_RUNNING) {
fputs("Warning: Subtimer was not running\n", stderr);
numNotRunning &= 0x2; // Zero out 2^0
}
if (numNotRunning == 0x0) {
//fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr);
return;
}
timer->state = visc_Timer_STOPPED;
subtimer->state = visc_Timer_STOPPED;
#if _POSIX_VERSION >= 200112L
{
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC, &tv);
fini = tv.tv_sec * BILLION + tv.tv_nsec;
}
#else
# error "visc_StopTimer: not implemented for this system"
#endif
if (numNotRunning & 0x2) {
accumulate_time(&timer->elapsed, timer->init, fini);
timer->init = fini;
}
if (numNotRunning & 0x1) {
accumulate_time(&subtimer->elapsed, subtimer->init, fini);
subtimer->init = fini;
}
}
/* Get the elapsed time in seconds. */
double
visc_GetElapsedTime(struct visc_Timer *timer)
{
double ret;
if (timer->state != visc_Timer_STOPPED) {
fputs("Elapsed time from a running timer is inaccurate\n", stderr);
}
#if _POSIX_VERSION >= 200112L
ret = timer->elapsed / 1e9;
#else
# error "visc_GetElapsedTime: not implemented for this system"
#endif
return ret;
}
void
visc_InitializeTimerSet(struct visc_TimerSet *timers)
{
int n;
timers->wall_begin = get_time();
timers->current = visc_TimerID_NONE;
timers->async_markers = NULL;
for (n = 0; n < visc_TimerID_LAST; n++) {
visc_ResetTimer(&timers->timers[n]);
timers->sub_timer_list[n] = NULL;
}
}
void
visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category) {
struct visc_SubTimer *subtimer = (struct visc_SubTimer *) malloc
(sizeof(struct visc_SubTimer));
int len = strlen(label);
subtimer->label = (char *) malloc (sizeof(char)*(len+1));
sprintf(subtimer->label, "%s", label);
visc_ResetTimer(&subtimer->timer);
subtimer->next = NULL;
struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[visc_Category];
if (subtimerlist == NULL) {
subtimerlist = (struct visc_SubTimerList *) calloc
(1, sizeof(struct visc_SubTimerList));
subtimerlist->subtimer_list = subtimer;
timers->sub_timer_list[visc_Category] = subtimerlist;
} else {
// Append to list
struct visc_SubTimer *element = subtimerlist->subtimer_list;
while (element->next != NULL) {
element = element->next;
}
element->next = subtimer;
}
}
void
visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
{
//cerr << "Switch to timer: " << timer << flush << "\n";
/* Stop the currently running timer */
if (timers->current != visc_TimerID_NONE) {
struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
if (!is_async(timers->current) ) {
if (timers->current != timer) {
if (currSubTimer != NULL) {
visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
} else {
visc_StopTimer(&timers->timers[timers->current]);
}
} else {
if (currSubTimer != NULL) {
visc_StopTimer(&currSubTimer->timer);
}
}
} else {
insert_marker(timers, timer);
if (!is_async(timer)) { // if switching to async too, keep driver going
visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
}
}
}
visc_Timestamp currentTime = get_time();
/* The only cases we check for asynchronous task completion is
* when an overlapping CPU operation completes, or the next
* segment blocks on completion of previous async operations */
if( asyncs_outstanding(timers) &&
(!is_async(timers->current) || is_blocking(timer) ) ) {
struct visc_async_time_marker_list * last_event = get_last_async(timers);
/* CL_COMPLETE if completed */
cl_int ciErrNum = CL_SUCCESS;
cl_int async_done = CL_COMPLETE;
ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
if (ciErrNum != CL_SUCCESS) {
fprintf(stdout, "Error Querying EventInfo1!\n");
}
if(is_blocking(timer)) {
/* Async operations completed after previous CPU operations:
* overlapped time is the total CPU time since this set of async
* operations were first issued */
// timer to switch to is COPY or NONE
if(async_done != CL_COMPLETE) {
accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
timers->async_begin,currentTime);
}
/* Wait on async operation completion */
ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Waiting for Events!\n");
}
visc_Timestamp total_async_time = record_async_times(timers);
/* Async operations completed before previous CPU operations:
* overlapped time is the total async time */
if(async_done == CL_COMPLETE) {
//fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time);
timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
}
} else
/* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
// i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
// so something is deeper in stack
if(async_done == CL_COMPLETE ) {
/* Async operations completed before previous CPU operations:
* overlapped time is the total async time */
timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
}
}
/* Start the new timer */
if (timer != visc_TimerID_NONE) {
if(!is_async(timer)) {
visc_StartTimer(&timers->timers[timer]);
} else {
// toSwitchTo Is Async (KERNEL/COPY_ASYNC)
if (!asyncs_outstanding(timers)) {
/* No asyncs outstanding, insert a fresh async marker */
insert_marker(timers, timer);
timers->async_begin = currentTime;
} else if(!is_async(timers->current)) {
/* Previous asyncs still in flight, but a previous SwitchTo
* already marked the end of the most recent async operation,
* so we can rename that marker as the beginning of this async
* operation */
struct visc_async_time_marker_list * last_event = get_last_async(timers);
last_event->label = NULL;
last_event->timerID = timer;
}
if (!is_async(timers->current)) {
visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
}
}
}
timers->current = timer;
}
void
visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category)
{
struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
struct visc_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
if (timers->current != visc_TimerID_NONE) {
if (!is_async(timers->current) ) {
if (timers->current != category) {
if (curr != NULL) {
visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
} else {
visc_StopTimer(&timers->timers[timers->current]);
}
} else {
if (curr != NULL) {
visc_StopTimer(&curr->timer);
}
}
} else {
insert_submarker(timers, label, category);
if (!is_async(category)) { // if switching to async too, keep driver going
visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
}
}
}
visc_Timestamp currentTime = get_time();
/* The only cases we check for asynchronous task completion is
* when an overlapping CPU operation completes, or the next
* segment blocks on completion of previous async operations */
if( asyncs_outstanding(timers) &&
(!is_async(timers->current) || is_blocking(category) ) ) {
struct visc_async_time_marker_list * last_event = get_last_async(timers);
/* CL_COMPLETE if completed */
cl_int ciErrNum = CL_SUCCESS;
cl_int async_done = CL_COMPLETE;
ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
if (ciErrNum != CL_SUCCESS) {
fprintf(stdout, "Error Querying EventInfo2!\n");
}
if(is_blocking(category)) {
/* Async operations completed after previous CPU operations:
* overlapped time is the total CPU time since this set of async
* operations were first issued */
// timer to switch to is COPY or NONE
// if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
// anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
// it seems that the extra sync wall time isn't being recorded anywhere
if(async_done != CL_COMPLETE)
accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
timers->async_begin,currentTime);
/* Wait on async operation completion */
ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Waiting for Events!\n");
}
visc_Timestamp total_async_time = record_async_times(timers);
/* Async operations completed before previous CPU operations:
* overlapped time is the total async time */
// If it did finish, then accumulate all the async time that did happen into OVERLAP
// the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
if(async_done == CL_COMPLETE /*cudaSuccess*/)
timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
} else
/* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
// i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
// so something is deeper in stack
if(async_done == CL_COMPLETE /*cudaSuccess*/) {
/* Async operations completed before previous CPU operations:
* overlapped time is the total async time */
timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
}
// else, this isn't blocking, so just check the next time around
}
subtimerlist = timers->sub_timer_list[category];
struct visc_SubTimer *subtimer = NULL;
if (label != NULL) {
subtimer = subtimerlist->subtimer_list;
while (subtimer != NULL) {
if (strcmp(subtimer->label, label) == 0) {
break;
} else {
subtimer = subtimer->next;
}
}
}
/* Start the new timer */
if (category != visc_TimerID_NONE) {
if(!is_async(category)) {
if (subtimerlist != NULL) {
subtimerlist->current = subtimer;
}
if (category != timers->current && subtimer != NULL) {
visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
} else if (subtimer != NULL) {
visc_StartTimer(&subtimer->timer);
} else {
visc_StartTimer(&timers->timers[category]);
}
} else {
if (subtimerlist != NULL) {
subtimerlist->current = subtimer;
}
// toSwitchTo Is Async (KERNEL/COPY_ASYNC)
if (!asyncs_outstanding(timers)) {
/* No asyncs outstanding, insert a fresh async marker */
insert_submarker(timers, label, category);
timers->async_begin = currentTime;
} else if(!is_async(timers->current)) {
/* Previous asyncs still in flight, but a previous SwitchTo
* already marked the end of the most recent async operation,
* so we can rename that marker as the beginning of this async
* operation */
struct visc_async_time_marker_list * last_event = get_last_async(timers);
last_event->timerID = category;
last_event->label = label;
} // else, marker for switchToThis was already inserted
//toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
if (!is_async(timers->current)) {
visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
}
}
}
timers->current = category;
}
void
visc_PrintTimerSet(struct visc_TimerSet *timers)
{
visc_Timestamp wall_end = get_time();
struct visc_Timer *t = timers->timers;
struct visc_SubTimer* sub = NULL;
int maxSubLength;
const char *categories[] = {
"IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap",
"Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free",
"Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc",
"Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack"
};
const int maxCategoryLength = 20;
int i;
for(i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVERLAP from this format
if(visc_GetElapsedTime(&t[i]) != 0 || true) { // "|| true" forces printing every category
// Print Category Timer
printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], visc_GetElapsedTime(&t[i]));
if (timers->sub_timer_list[i] != NULL) {
sub = timers->sub_timer_list[i]->subtimer_list;
maxSubLength = 0;
while (sub != NULL) {
// Find longest SubTimer label
if (strlen(sub->label) > (unsigned long) maxSubLength) {
maxSubLength = strlen(sub->label);
}
sub = sub->next;
}
// Fit to Categories
if (maxSubLength <= maxCategoryLength) {
maxSubLength = maxCategoryLength;
}
sub = timers->sub_timer_list[i]->subtimer_list;
// Print SubTimers
while (sub != NULL) {
printf(" -%-*s: %.9f\n", maxSubLength, sub->label, visc_GetElapsedTime(&sub->timer));
sub = sub->next;
}
}
}
}
if(visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0)
printf("CPU/Kernel Overlap: %.9f\n", visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]));
double walltime = (wall_end - timers->wall_begin) / 1e9;
printf("Timer Wall Time: %.9f\n", walltime);
}
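// A minimal usage sketch (assumed) of the timer API used throughout this
// file: create a set, bracket phases with visc_SwitchToTimer, and print.
//
//   struct visc_TimerSet ts;
//   visc_InitializeTimerSet(&ts);
//   visc_SwitchToTimer(&ts, visc_TimerID_COPY);
//   /* ... host-to-device copies ... */
//   visc_SwitchToTimer(&ts, visc_TimerID_NONE);
//   visc_PrintTimerSet(&ts);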
void visc_DestroyTimerSet(struct visc_TimerSet * timers)
{
/* clean up all of the async event markers */
struct visc_async_time_marker_list* event = timers->async_markers;
while(event != NULL) {
cl_int ciErrNum = CL_SUCCESS;
ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker);
if (ciErrNum != CL_SUCCESS) {
//fprintf(stderr, "Error Waiting for Events!\n");
}
ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) );
if (ciErrNum != CL_SUCCESS) {
fprintf(stderr, "Error Release Events!\n");
}
free((event)->marker);
struct visc_async_time_marker_list* next = ((event)->next);
free(event);
// (*event) = NULL;
event = next;
}
int i = 0;
for(i = 0; i < visc_TimerID_LAST; ++i) {
if (timers->sub_timer_list[i] != NULL) {
struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
struct visc_SubTimer *prev = NULL;
while (subtimer != NULL) {
free(subtimer->label);
prev = subtimer;
subtimer = subtimer->next;
free(prev);
}
free(timers->sub_timer_list[i]);
}
}
}
/**************************** Pipeline API ************************************/
#define BUFFER_SIZE 1
// Launch API for a streaming dataflow graph
void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void* args) {
DFNodeContext_X86* Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
Context->threads = new std::vector<pthread_t>();
Context->ArgInPortSizeMap = new std::map<unsigned, uint64_t>();
//Context->BindInSizes = new std::vector<uint64_t>();
Context->BindInSourcePort = new std::vector<unsigned>();
Context->BindOutSizes = new std::vector<uint64_t>();
Context->EdgeSizes = new std::vector<uint64_t>();
Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t>*>();
Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args << flush << "\n");
LaunchFunc(args, Context);
return Context;
}
// Push API for a streaming dataflow graph
void llvm_visc_streamPush(void* graphID, void* args) {
DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n");
DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
unsigned offset = 0;
for (unsigned i=0; i< Ctx->ArgInPortSizeMap->size(); i++) {
uint64_t element;
memcpy(&element, (char*)args+offset, Ctx->ArgInPortSizeMap->at(i));
offset += Ctx->ArgInPortSizeMap->at(i);
for(unsigned j=0; j<Ctx->BindInputBuffers->size();j++) {
if(Ctx->BindInSourcePort->at(j) == i) {
// Push to all bind buffers connected to parent node at this port
//DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element);
}
}
}
// Push 0 in isLastInput buffers of all child nodes
for (CircularBuffer<uint64_t>* buffer: *(Ctx->isLastInputBuffers))
llvm_visc_bufferPush(buffer, 0);
}
// Pop API for a streaming dataflow graph
void* llvm_visc_streamPop(void* graphID) {
DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n");
DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
unsigned totalBytes = 0;
for(uint64_t size: *(Ctx->BindOutSizes))
totalBytes+= size;
void* output = malloc(totalBytes);
unsigned offset = 0;
for (unsigned i=0; i< Ctx->BindOutputBuffers->size(); i++) {
uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i));
//DEBUG(cout << "\tPopped Value " << element << " from buffer\n");
memcpy((char*)output+offset, &element, Ctx->BindOutSizes->at(i));
offset += Ctx->BindOutSizes->at(i);
}
return output;
}
// Wait API for a streaming dataflow graph
void llvm_visc_streamWait(void* graphID) {
DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n");
DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
// Push garbage to all other input buffers
for (unsigned i=0; i< Ctx->BindInputBuffers->size(); i++) {
uint64_t element = 0;
//DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element);
}
// Push 1 in isLastInput buffers of all child nodes
for (unsigned i=0; i < Ctx->isLastInputBuffers->size(); i++)
llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
llvm_visc_freeThreads(graphID);
}
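// A minimal usage sketch (assumed) of the streaming API: launch the graph,
// push/pop one set of inputs/outputs per iteration, then wait to drain.
// `LaunchFunc` and `args` are hypothetical.
//
//   void *graph = llvm_visc_streamLaunch(LaunchFunc, args);
//   llvm_visc_streamPush(graph, args);
//   void *out = llvm_visc_streamPop(graph);
//   llvm_visc_streamWait(graph);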
// Create a buffer and return the bufferID
void* llvm_visc_createBindInBuffer(void* graphID, uint64_t size, unsigned inArgPort) {
DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn");
DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
Context->BindInputBuffers->push_back(bufferID);
(*(Context->ArgInPortSizeMap))[inArgPort] = size;
Context->BindInSourcePort->push_back(inArgPort);
//Context->BindInSizes->push_back(size);
return bufferID;
}
void* llvm_visc_createBindOutBuffer(void* graphID, uint64_t size) {
DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
//Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size());
CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut");
DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
Context->BindOutputBuffers->push_back(bufferID);
Context->BindOutSizes->push_back(size);
return bufferID;
}
void* llvm_visc_createEdgeBuffer(void* graphID, uint64_t size) {
DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
//Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size());
CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge");
DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
Context->EdgeBuffers->push_back(bufferID);
Context->EdgeSizes->push_back(size);
return bufferID;
}
void* llvm_visc_createLastInputBuffer(void* graphID, uint64_t size) {
DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
//Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size());
CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput");
DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
Context->isLastInputBuffers->push_back(bufferID);
return bufferID;
}
// Free buffers
void llvm_visc_freeBuffers(void* graphID) {
DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n");
DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
for(CircularBuffer<uint64_t>* bufferID: *(Context->BindInputBuffers))
delete bufferID;
for(CircularBuffer<uint64_t>* bufferID: *(Context->BindOutputBuffers))
delete bufferID;
for(CircularBuffer<uint64_t>* bufferID: *(Context->EdgeBuffers))
delete bufferID;
for(CircularBuffer<uint64_t>* bufferID: *(Context->isLastInputBuffers))
delete bufferID;
}
// Pop an element from the buffer
uint64_t llvm_visc_bufferPop(void* bufferID) {
CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
return buffer->pop();
}
// Push an element into the buffer
void llvm_visc_bufferPush(void* bufferID, uint64_t element) {
CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
buffer->push(element);
}
// Create a thread
void llvm_visc_createThread(void* graphID, void* (*Func)(void*), void* arguments) {
DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n");
DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
int err;
pthread_t threadID;
if((err = pthread_create(&threadID, NULL, Func, arguments)) != 0)
cout << "Failed to create thread. Error code = " << err << flush << "\n";
Ctx->threads->push_back(threadID);
}
// Wait for thread to finish
void llvm_visc_freeThreads(void* graphID) {
DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n");
DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
for(pthread_t thread: *(Ctx->threads))
pthread_join(thread, NULL);
}
/************************ OPENCL & PTHREAD API ********************************/
void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) {
DFNodeContext_X86 *Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
//int err;
//if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0)
//cout << "Failed to create pthread. Error code = " << err << flush << "\n";
rootFunc(arguments);
return Context;
}
void llvm_visc_x86_wait(void* graphID) {
DEBUG(cout << "Waiting for pthread to finish ...\n");
//DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
//pthread_join(Context->threadID, NULL);
free(graphID);
DEBUG(cout << "\t... pthread Done!\n");
}
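// A minimal usage sketch (assumed): run the root node function and wait for
// completion. `root` and `packedArgs` are hypothetical.
//
//   void *graph = llvm_visc_x86_launch(root, packedArgs);
//   llvm_visc_x86_wait(graph);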
void* llvm_visc_ocl_initContext(enum visc::Target T) {
pthread_mutex_lock(&ocl_mtx);
//DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR");
DEBUG(const char* Target = T == visc::GPU_TARGET? "GPU" : "SPIR");
DEBUG(cout << "Initializing Context for " << Target << " device\n");
cl_uint numPlatforms;
cl_int errcode;
errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
// now get all the platform IDs
cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
for(unsigned i=0; i < numPlatforms; i++) {
char buffer[10240];
DEBUG(cout << "Device " << i << " Info -->\n");
clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n");
clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
DEBUG(cout << "\tVERSION = "<< buffer << flush << "\n");
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
DEBUG(cout << "\tNAME = " << buffer << flush << "\n");
clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n");
clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n");
}
// set platform property - just pick the first one
//cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
//(long) platforms[0],
//0};
//globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
//NULL, NULL, &errcode);
assert(numPlatforms >= 2 && "Expecting at least two OpenCL platforms");
// Choose the second platform, which is the x86 AVX device
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
(long) platforms[T == visc::GPU_TARGET? 0 : 1],
0};
globalOCLContext = clCreateContextFromType(properties,
T == visc::GPU_TARGET?
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU,
NULL, NULL, &errcode);
// get the list of OCL devices associated with context
size_t dataBytes;
errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0,
NULL, &dataBytes);
checkErr(errcode, CL_SUCCESS, "Failure to get context info length");
clDevices = (cl_device_id *) malloc(dataBytes);
errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
checkErr(errcode, CL_SUCCESS, "Failure to get context info");
if(false && T == visc::SPIR_TARGET) {
cl_device_partition_property props[4];
props[0] = CL_DEVICE_PARTITION_BY_COUNTS;
props[1] = NUM_CORES;
props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;
props[3] = 0;
cl_device_id subdevice_id[8];
cl_uint num_entries = 8;
cl_uint numDevices;
clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, &numDevices);
//printf("Num of devices = %d\n", numDevices);
//for(unsigned i =0 ; i< numDevices; i++)
//printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
clDevices[0] = subdevice_id[0];
globalOCLContext = clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode);
checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
}
free(platforms);
DEBUG(cout << "\tContext " << globalOCLContext << flush << "\n");
checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
DEBUG(cout << "Initialize Kernel Timer\n");
visc_InitializeTimerSet(&kernel_timer);
pthread_mutex_unlock(&ocl_mtx);
return globalOCLContext;
}
void llvm_visc_ocl_clearContext(void* graphID) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Clear Context\n");
DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
// FIXME: Have separate function to release command queue and clear context.
// Would be useful when a context has multiple command queues
clReleaseKernel(Context->clKernel);
//clReleaseProgram(Context->clProgram);
//clReleaseCommandQueue(Context->clCommandQue);
//clReleaseContext(globalOCLContext);
//DEBUG(cout << "Released context at: " << globalOCLContext);
free(Context);
DEBUG(cout << "Done with OCL kernel\n");
cout << "Printing VISC Timer: KernelTimer\n";
visc_PrintTimerSet(&kernel_timer);
pthread_mutex_unlock(&ocl_mtx);
}
void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Set Shared Memory Input:");
DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
DEBUG(cout << "Using Context: " << Context << flush << "\n");
DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
//pthread_mutex_lock(&ocl_mtx);
cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument");
pthread_mutex_unlock(&ocl_mtx);
}
void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Set Scalar Input:");
DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
DEBUG(cout << "Using Context: " << Context << flush << "\n");
DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
//pthread_mutex_lock(&ocl_mtx);
cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
pthread_mutex_unlock(&ocl_mtx);
}
void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Set Pointer Input:");
DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << flush << "\n");
// Size should be non-zero
assert(size != 0 && "Size of data pointed to has to be non-zero!");
DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << flush << "\n");
DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
pthread_mutex_unlock(&ocl_mtx);
// Check with runtime the location of this memory
cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput);
pthread_mutex_lock(&ocl_mtx);
// Set Kernel Argument
//pthread_mutex_lock(&ocl_mtx);
cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n");
pthread_mutex_unlock(&ocl_mtx);
return d_input;
}
void* llvm_visc_ocl_output_ptr(void* graphID, int arg_index, size_t size) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Set device memory for Output Struct:");
DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = "<< size << flush << "\n");
DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
cl_int errcode;
//pthread_mutex_lock(&ocl_mtx);
cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, size, NULL, &errcode);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device");
//pthread_mutex_lock(&ocl_mtx);
errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n");
pthread_mutex_unlock(&ocl_mtx);
return d_output;
}
void llvm_visc_ocl_free(void* ptr) {
//DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n");
//cl_mem d_ptr = (cl_mem) ptr;
//clReleaseMemObject(d_ptr);
}
void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Get Output:\n");
DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << flush << "\n");
if(h_output == NULL)
h_output = malloc(size);
DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
//pthread_mutex_lock(&ocl_mtx);
cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
h_output, 0, NULL, NULL);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output");
pthread_mutex_unlock(&ocl_mtx);
return h_output;
}
void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim,
const size_t* localWorkSize, const size_t* globalWorkSize) {
pthread_mutex_lock(&ocl_mtx);
size_t GlobalWG[3];
size_t LocalWG[3];
// OpenCL clEnqueueNDRangeKernel results in a segmentation fault if we
// directly use the local and global work-size arguments. Hence, allocate
// them on the stack and copy.
for(unsigned i=0; i<workDim; i++) {
GlobalWG[i] = globalWorkSize[i];
}
// OpenCL allows local workgroup to be null.
if(localWorkSize != NULL) {
for(unsigned i=0; i<workDim; i++) {
LocalWG[i] = localWorkSize[i];
}
}
DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
// TODO: Would like to use event to ensure better scheduling of kernels.
// Currently passing the event parameter results in a segfault with
// clEnqueueNDRangeKernel.
cl_event* event = NULL;
DEBUG(cout << "Enqueuing kernel:\n");
DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n");
DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n");
DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n");
DEBUG(cout << "\tGlobal Work Group: ( ");
for(unsigned i = 0; i<workDim; i++) {
DEBUG(cout << GlobalWG[i] << " ");
}
DEBUG(cout << ")\n");
if(localWorkSize != NULL) {
DEBUG(cout << "\tLocal Work Group: ( ");
for(unsigned i = 0; i<workDim; i++) {
DEBUG(cout << LocalWG[i] << " ");
}
DEBUG(cout << ")\n");
}
//pthread_mutex_lock(&ocl_mtx);
clFinish(Context->clCommandQue);
//pthread_mutex_unlock(&ocl_mtx);
visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION);
//for(int i=0 ;i < NUM_TESTS; i++) {
//cout << "Iteration = " << i << flush << "\n";
//pthread_mutex_lock(&ocl_mtx);
cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
Context->clKernel, workDim, NULL, GlobalWG, (localWorkSize == NULL)? NULL : LocalWG, 0, NULL, NULL);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
//}
//pthread_mutex_lock(&ocl_mtx);
clFinish(Context->clCommandQue);
//pthread_mutex_unlock(&ocl_mtx);
visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
pthread_mutex_unlock(&ocl_mtx);
return event;
}
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program binary file.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param Filename program filename
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
{
DEBUG(cout << "Load Prog Source\n");
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
pFileStream = fopen(Filename, "rb");
if(pFileStream == 0)
{
return NULL;
}
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + 1);
if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the source string (its length is returned via szFinalLength)
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength;
}
cSourceString[szSourceLength] = '\0';
return cSourceString;
}
void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Launch OCL Kernel\n");
// Initialize OpenCL
// OpenCL specific variables
DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL));
size_t kernelLength;
cl_int errcode;
// For a single context for all kernels
Context->clOCLContext = globalOCLContext;
//Create a command-queue
//pthread_mutex_lock(&ocl_mtx);
Context->clCommandQue = clCreateCommandQueue(Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);
globalCommandQue = Context->clCommandQue;
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
DEBUG(cout << "Loading program binary: " << FileName << flush << "\n");
char *programSource = LoadProgSource(FileName, &kernelLength);
checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
cl_int binaryStatus;
//pthread_mutex_lock(&ocl_mtx);
Context->clProgram = clCreateProgramWithBinary(Context->clOCLContext, 1, &clDevices[0],
&kernelLength,
(const unsigned char **)&programSource,
&binaryStatus, &errcode);
//pthread_mutex_unlock(&ocl_mtx);
checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName << flush << "\n");
errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL);
// If build fails, get build log from device
if(errcode != CL_SUCCESS) {
cout << "ERROR: Failure to build program\n";
size_t len = 0;
errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0,
NULL, &len);
cout << "LOG LENGTH: " << len << flush << "\n";
checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
char *log = (char*) malloc(len*sizeof(char));
errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len,
log, NULL);
checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
cout << "Device Build Log:\n" << log << flush << "\n";
free(log);
pthread_mutex_unlock(&ocl_mtx);
exit(EXIT_FAILURE);
}
Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n");
//free(clDevices);
free(programSource);
pthread_mutex_unlock(&ocl_mtx);
return Context;
}
void llvm_visc_ocl_wait(void* graphID) {
pthread_mutex_lock(&ocl_mtx);
DEBUG(cout << "Wait\n");
DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID;
//pthread_mutex_lock(&ocl_mtx);
clFinish(Context->clCommandQue);
//pthread_mutex_unlock(&ocl_mtx);
pthread_mutex_unlock(&ocl_mtx);
}
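// Illustrative sketch (not part of the runtime, never called): one plausible
// host-side lifecycle for the three entry points above. The binary name
// "kernel.bin" and kernel name "matmul" are hypothetical placeholders.
static void __attribute__((unused)) exampleOCLNodeLifecycle() {
// Build the kernel from a pre-compiled binary and get a node context.
void* graphID = llvm_visc_ocl_launch("kernel.bin", "matmul");
// Enqueue a 2-D launch. Note the argument order: the local work size comes
// before the global one; passing NULL for it lets OpenCL pick the
// work-group shape.
size_t globalWG[2] = {1024, 1024};
llvm_visc_ocl_executeNode(graphID, 2, NULL, globalWG);
// Block until all commands on the node's queue have drained.
llvm_visc_ocl_wait(graphID);
}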
void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID timer) {
//cout << "Switching to timer " << timer << flush << "\n";
pthread_mutex_lock(&ocl_mtx);
//visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer);
pthread_mutex_unlock(&ocl_mtx);
}
void llvm_visc_printTimerSet(void** timerSet, char* timerName) {
pthread_mutex_lock(&ocl_mtx);
cout << "Printing VISC Timer: ";
if(timerName != NULL)
cout << timerName << flush << "\n";
else
cout << "Anonymous\n";
visc_PrintTimerSet((visc_TimerSet*) (*timerSet));
pthread_mutex_unlock(&ocl_mtx);
}
void* llvm_visc_initializeTimerSet() {
pthread_mutex_lock(&ocl_mtx);
visc_TimerSet* TS = (visc_TimerSet*) malloc (sizeof(visc_TimerSet));
visc_InitializeTimerSet(TS);
pthread_mutex_unlock(&ocl_mtx);
return TS;
}
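// Illustrative sketch (not part of the runtime): how a generated host program
// might drive the timer entry points above. The handle is passed by address
// because the API takes a void**. Note that llvm_visc_switchToTimer is
// currently a no-op (its body is commented out above).
static void __attribute__((unused)) exampleTimerUsage() {
void* TS = llvm_visc_initializeTimerSet();
llvm_visc_switchToTimer(&TS, visc_TimerID_COMPUTATION);
// ... timed work would go here ...
llvm_visc_switchToTimer(&TS, visc_TimerID_NONE);
llvm_visc_printTimerSet(&TS, (char*)"example");
}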
/*
*
* (c) 2010 The Board of Trustees of the University of Illinois.
*/
#ifndef VISC_RT_HEADER
#define VISC_RT_HEADER
#include <iostream>
#include <map>
#include <ctime>
#include <vector>
#include <pthread.h>
#include <string>
//#include <condition_variable>
#include "llvm/SupportVISC/VISCHint.h"
#include "llvm/SupportVISC/VISCTimer.h"
#include "device_abstraction.h"
#include "policy.h"
#ifndef DEBUG_BUILD
#define DEBUG(s) {}
#else
#define DEBUG(s) s
#endif
using namespace std;
extern "C" {
/************************* Policies *************************************/
void llvm_visc_policy_init();
void llvm_visc_policy_clear();
int llvm_visc_policy_getVersion(const char *, int64_t);
/******************** Device Abstraction ********************************/
void llvm_visc_deviceAbstraction_start();
void llvm_visc_deviceAbstraction_end();
void llvm_visc_deviceAbstraction_waitOnDeviceStatus();
/********************* DFG Depth Stack **********************************/
class DFGDepth {
private:
unsigned numDim;
unsigned dimLimit[3];
unsigned dimInstance[3];
public:
DFGDepth() {}
DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, unsigned iY = 0,
unsigned dimZ = 0, unsigned iZ = 0) {
assert(n <= 3 && "Error! More than 3 dimensions not supported");
numDim = n;
dimLimit[0] = dimX;
dimLimit[1] = dimY;
dimLimit[2] = dimZ;
dimInstance[0] = iX;
dimInstance[1] = iY;
dimInstance[2] = iZ;
}
unsigned getDimLimit(unsigned dim) {
// Dimensions are 0-indexed, so a valid index is strictly less than numDim.
assert(dim < numDim && "Error! Requested dimension limit is not specified");
return dimLimit[dim];
}
unsigned getDimInstance(unsigned dim) {
assert(dim < numDim && "Error! Requested dimension instance is not specified");
return dimInstance[dim];
}
unsigned getNumDim() {
return numDim;
}
};
void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0);
void llvm_visc_x86_dstack_pop();
uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
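// Illustrative sketch (hypothetical, not part of the API surface): each
// dstack push records the iteration space of one DFG nesting level, and the
// getters query it back. The 2-D bounds below are made up, and the level
// indexing convention (whether 0 is the innermost or outermost level) is
// defined by the implementation in visc-rt.cpp.
static inline void exampleDStackUsage() {
// Enter a 2-D node with limits (8, 4), currently at instance (3, 1).
llvm_visc_x86_dstack_push(2, 8, 3, 4, 1);
uint64_t limX = llvm_visc_x86_getDimLimit(0, 0);     // 8, if level 0 is this push
uint64_t instY = llvm_visc_x86_getDimInstance(0, 1); // 1, under the same assumption
(void)limX; (void)instY;
llvm_visc_x86_dstack_pop();
}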
/********************* Memory Tracker **********************************/
class MemTrackerEntry {
public:
enum Location {HOST, DEVICE};
private:
size_t size;
Location loc;
void* addr;
void* Context;
public:
MemTrackerEntry(size_t _size, Location _loc, void* _addr, void* _Context):
size(_size), loc(_loc), addr(_addr), Context(_Context) {
}
size_t getSize() {
return size;
}
Location getLocation() {
return loc;
}
void* getAddress() {
return addr;
}
void* getContext() {
return Context;
}
void update(Location _loc, void* _addr, void* _Context = NULL) {
loc = _loc;
addr = _addr;
Context = _Context;
}
void print() {
cout << "Size = " << size << "\tLocation = " << loc << "\tAddress = " << addr << "\tContext = " << Context;
}
};
class MemTracker {
private:
std::map<void*, MemTrackerEntry*> Table;
public:
MemTracker() {
}
bool insert(void* ID, size_t size, MemTrackerEntry::Location loc, void* addr, void* Context = NULL) {
MemTrackerEntry* MTE = new MemTrackerEntry(size, loc, addr, Context);
Table.insert(std::pair<void*, MemTrackerEntry*>(ID, MTE));
return MTE != NULL;
}
MemTrackerEntry* lookup(void* ID) {
if(Table.count(ID) == 0)
return NULL;
return Table[ID];
}
void remove(void* ID) {
MemTrackerEntry* MTE = Table[ID];
// Entries are allocated with new, so release them with delete, not free.
delete MTE;
Table.erase(ID);
}
void print() {
cout << "Printing Table ... Size = " << Table.size() << flush << "\n";
for(auto& Entry: Table) {
cout << Entry.first << ":\t" ;
Entry.second->print();
cout << flush << "\n";
}
}
};
void llvm_visc_track_mem(void*, size_t);
void llvm_visc_untrack_mem(void*);
void* llvm_visc_request_mem(void*, size_t);
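// Illustrative sketch (hypothetical usage, never called): the tracker maps a
// host pointer to the location of its most recent copy. A buffer is
// registered once, llvm_visc_request_mem asks for an up-to-date copy
// (presumably triggering a host/device transfer when the valid copy lives on
// the other side), and the buffer is unregistered when no longer needed.
static inline void exampleMemTracking(float* hostBuf, size_t bytes) {
llvm_visc_track_mem(hostBuf, bytes);
void* valid = llvm_visc_request_mem(hostBuf, bytes);
(void)valid;
llvm_visc_untrack_mem(hostBuf);
}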
/*********************** OPENCL & PTHREAD API **************************/
void* llvm_visc_x86_launch(void* (void*), void*);
void llvm_visc_x86_wait(void*);
void* llvm_visc_ocl_initContext(enum visc::Target);
void* llvm_visc_x86_argument_ptr(void*, size_t);
void llvm_visc_ocl_clearContext(void*);
void llvm_visc_ocl_argument_shared(void*, int, size_t);
void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t);
void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool);
void* llvm_visc_ocl_output_ptr(void*, int, size_t);
void llvm_visc_ocl_free(void*);
void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t);
void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*);
void* llvm_visc_ocl_launch(const char*, const char*);
void llvm_visc_ocl_wait(void*);
void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID);
void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL);
void* llvm_visc_initializeTimerSet();
}
/*************************** Pipeline API ******************************/
// Circular Buffer class
// static so that including this header in multiple translation units does
// not violate the one-definition rule.
static unsigned counter = 0;
template <class ElementType>
class CircularBuffer {
private:
int numElements;
int bufferSize;
int Head;
int Tail;
pthread_mutex_t mtx;
pthread_cond_t cv;
vector<ElementType> buffer;
std::string name;
unsigned ID;
public:
CircularBuffer(int maxElements, std::string _name = "ANON") {
ID = counter;
Head = 0;
Tail = 0;
numElements = 0;
name = _name;
bufferSize = maxElements+1;
// resize (not reserve) so that operator[] in push/pop indexes constructed
// elements; writing past size() on a merely reserved vector is UB.
buffer.resize(bufferSize);
pthread_mutex_init(&mtx, NULL);
pthread_cond_init(&cv, NULL);
counter++;
}
bool push(ElementType E);
ElementType pop();
};
template <class ElementType>
bool CircularBuffer<ElementType>::push(ElementType E) {
//DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << "\n");
//unique_lock<mutex> lk(mtx);
pthread_mutex_lock(&mtx);
// Wait in a loop rather than a plain if: pthread_cond_wait can wake up
// spuriously, so the buffer-full condition must be re-checked.
while((Head + 1) % bufferSize == Tail) {
//DEBUG(cout << name << " Buffer[" << ID << "]: Push going to sleep ...\n");
//cv.wait(lk);
pthread_cond_wait(&cv, &mtx);
//DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n");
}
buffer[Head] = E;
Head = (Head+1) % bufferSize;
numElements++;
//DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
//lk.unlock();
pthread_mutex_unlock(&mtx);
//cv.notify_one();
pthread_cond_signal(&cv);
return true;
}
template <class ElementType>
ElementType CircularBuffer<ElementType>::pop() {
//unique_lock<mutex> lk(mtx);
//DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n");
pthread_mutex_lock(&mtx);
// Wait in a loop so that a spurious wakeup re-checks the buffer-empty
// condition.
while(Tail == Head) {
//DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n");
//cv.wait(lk);
pthread_cond_wait(&cv, &mtx);
//DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n");
}
ElementType E = buffer[Tail];
Tail = (Tail + 1) % bufferSize;
numElements--;
//DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
//lk.unlock();
pthread_mutex_unlock(&mtx);
//cv.notify_one();
pthread_cond_signal(&cv);
return E;
}
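// Illustrative sketch (not part of the runtime): a minimal producer/consumer
// pair over the buffer above, with signatures matching what
// llvm_visc_createThread expects. push blocks while the buffer is full and
// pop blocks while it is empty, so the two threads rate-match automatically.
static inline void* exampleProducer(void* B) {
CircularBuffer<uint64_t>* Buf = (CircularBuffer<uint64_t>*)B;
for (uint64_t i = 0; i < 16; i++)
Buf->push(i);
return NULL;
}
static inline void* exampleConsumer(void* B) {
CircularBuffer<uint64_t>* Buf = (CircularBuffer<uint64_t>*)B;
for (unsigned i = 0; i < 16; i++)
(void)Buf->pop();
return NULL;
}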
extern "C" {
// Functions to push and pop values from pipeline buffers
uint64_t llvm_visc_bufferPop(void*);
void llvm_visc_bufferPush(void*, uint64_t);
// Functions to create and destroy buffers
void* llvm_visc_createBindInBuffer(void*, uint64_t, unsigned);
void* llvm_visc_createBindOutBuffer(void*, uint64_t);
void* llvm_visc_createEdgeBuffer(void*, uint64_t);
void* llvm_visc_createLastInputBuffer(void*, uint64_t);
void llvm_visc_freeBuffers(void*);
// Functions to create and destroy threads
void llvm_visc_createThread(void* graphID, void*(*Func)(void*), void*);
void llvm_visc_freeThreads(void*);
// Launch API for a streaming graph.
// Arguments:
// (1) Launch Function: void* (void*, void*)
// (2) Push Function: void (void*, std::vector<uint64_t>**, unsigned)
// (3) Pop Function: void* (std::vector<uint64_t>**, unsigned)
void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void*);
void llvm_visc_streamPush(void* graphID, void* args);
void* llvm_visc_streamPop(void* graphID);
void llvm_visc_streamWait(void* graphID);
}
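// Illustrative sketch (hypothetical, not part of the header's API): a
// plausible driver for the streaming calls above, pairing one push with one
// pop per graph invocation. LaunchFunc and the args payload are placeholders
// supplied by the caller.
static inline void exampleStreamUsage(void (*LaunchFunc)(void*, void*),
void* args) {
void* graphID = llvm_visc_streamLaunch(LaunchFunc, args);
// Feed one input item into the graph's bind-in buffers ...
llvm_visc_streamPush(graphID, args);
// ... and retrieve the corresponding output item.
void* out = llvm_visc_streamPop(graphID);
(void)out;
llvm_visc_streamWait(graphID);
}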
#endif //VISC_RT_HEADER