From c7c9bcea1c004d089fc3a10eeafa249549fd2c99 Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Tue, 26 Nov 2019 16:07:59 -0600
Subject: [PATCH] CPU version of visc-rt - DSOC

---
 llvm/projects/visc-cpu-rt/CMakeLists.txt      |   43 +
 .../deviceStatusSwitchIntervals.txt           |    2 +
 .../projects/visc-cpu-rt/device_abstraction.h |   82 +
 llvm/projects/visc-cpu-rt/makefile            |   31 +
 llvm/projects/visc-cpu-rt/policy.h            |  103 ++
 llvm/projects/visc-cpu-rt/visc-rt.cpp         | 1648 +++++++++++++++++
 llvm/projects/visc-cpu-rt/visc-rt.h           |  302 +++
 7 files changed, 2211 insertions(+)
 create mode 100644 llvm/projects/visc-cpu-rt/CMakeLists.txt
 create mode 100644 llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt
 create mode 100644 llvm/projects/visc-cpu-rt/device_abstraction.h
 create mode 100644 llvm/projects/visc-cpu-rt/makefile
 create mode 100644 llvm/projects/visc-cpu-rt/policy.h
 create mode 100644 llvm/projects/visc-cpu-rt/visc-rt.cpp
 create mode 100644 llvm/projects/visc-cpu-rt/visc-rt.h

diff --git a/llvm/projects/visc-cpu-rt/CMakeLists.txt b/llvm/projects/visc-cpu-rt/CMakeLists.txt
new file mode 100644
index 0000000000..4887abbaf9
--- /dev/null
+++ b/llvm/projects/visc-cpu-rt/CMakeLists.txt
@@ -0,0 +1,43 @@
+add_custom_target(visc-cpu-rt ALL)
+add_custom_command(
+  TARGET visc-cpu-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_CURRENT_SOURCE_DIR}/deviceStatusSwitchIntervals.txt
+    ${CMAKE_CURRENT_BINARY_DIR}/deviceStatusSwitchIntervals.txt
+  DEPENDS deviceStatusSwitchIntervals.txt
+  COMMENT "Copying deviceStatusSwitchIntervals.txt")
+add_custom_command(
+  TARGET visc-cpu-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_CURRENT_SOURCE_DIR}/device_abstraction.h
+    ${CMAKE_CURRENT_BINARY_DIR}/device_abstraction.h
+  DEPENDS device_abstraction.h
+  COMMENT "Copying device_abstraction.h")
+add_custom_command(
+  TARGET visc-cpu-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_CURRENT_SOURCE_DIR}/policy.h
+    ${CMAKE_CURRENT_BINARY_DIR}/policy.h
+  DEPENDS policy.h
+  COMMENT "Copying policy.h")
+add_custom_command(
+  TARGET visc-cpu-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.h
+    ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.h
+  DEPENDS visc-rt.h
+  COMMENT "Copying visc-rt.h")
+add_custom_command(
+  TARGET visc-cpu-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.cpp
+  DEPENDS visc-rt.cpp
+  COMMENT "Copying visc-rt.cpp")
+add_custom_command(
+  TARGET visc-cpu-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+    ${CMAKE_CURRENT_SOURCE_DIR}/makefile
+    ${CMAKE_CURRENT_BINARY_DIR}/makefile
+  DEPENDS makefile
+  COMMENT "Copying makefile")
diff --git a/llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt b/llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt
new file mode 100644
index 0000000000..7069470a1a
--- /dev/null
+++ b/llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt
@@ -0,0 +1,2 @@
+10
+10 15 10 16 15 30 15 25 20 15
diff --git a/llvm/projects/visc-cpu-rt/device_abstraction.h b/llvm/projects/visc-cpu-rt/device_abstraction.h
new file mode 100644
index 0000000000..68748c7ab7
--- /dev/null
+++ b/llvm/projects/visc-cpu-rt/device_abstraction.h
@@ -0,0 +1,82 @@
+#ifndef __DEVICE_ABSTRACTION__
+#define __DEVICE_ABSTRACTION__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <time.h>
+#include <thread>
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+#define MIN_INTERVAL 2
+#define MAX_INTERVAL 8
+#define NUM_INTERVALS 10
+
+// Device status variable: true if the device is available for use
+volatile bool deviceStatus = true;
+// Intervals at which to change the device status
+std::vector<unsigned> Intervals;
+
+// Set to true when program execution ends and so we can end the device
+// simulation
+volatile bool executionEnd = false;
+
+
+void initializeDeviceStatusIntervals() {
+
+  unsigned sz = 0;
+  unsigned tmp = 0;
+
+  const char *fn =
+    "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/deviceStatusSwitchIntervals.txt";
+  std::ifstream infile;
+  infile.open(fn);
+  if (!infile.is_open()) {
+    std::cout << "Failed to open " << fn << " for reading\n";
+    return;
+  }
+  infile >> sz;
+
+  if (sz) {
+    // We have data. Read them into the vector
+    for (unsigned i = 0; i < sz; i++) {
+      infile >> tmp;
+      Intervals.push_back(tmp);
+    }
+    infile.close();
+  } else {
+    // We have no data. Create random data and write them into the file
+    infile.close();
+    std::ofstream outfile;
+    outfile.open(fn);
+    if (!outfile.is_open()) {
+      std::cout << "Failed to open " << fn << " for writing\n";
+      return;
+    }
+    sz = 1 + rand()%NUM_INTERVALS;
+    outfile << sz; 
+    for (unsigned i = 0; i < sz; i++) {
+      Intervals.push_back(MIN_INTERVAL + rand()%(MAX_INTERVAL - MIN_INTERVAL));
+      outfile << Intervals[i];
+    }
+    outfile.close();
+  }
+
+  return;
+}
+
+void updateDeviceStatus() {
+
+  unsigned i = 0;
+  while (!executionEnd) {
+    std::this_thread::sleep_for (std::chrono::seconds(Intervals[i]));
+    deviceStatus = !deviceStatus;
+    std::cout << "Changed device status to " << deviceStatus << "\n";
+    i = (i+1) % Intervals.size();
+  }
+
+}
+
+#endif // __DEVICE_ABSTRACTION__
diff --git a/llvm/projects/visc-cpu-rt/makefile b/llvm/projects/visc-cpu-rt/makefile
new file mode 100644
index 0000000000..a1fd6e1e01
--- /dev/null
+++ b/llvm/projects/visc-cpu-rt/makefile
@@ -0,0 +1,31 @@
+LLVM_SRC_ROOT = ../../../llvm
+LLVM_BUILD_ROOT = ../..
+
+OPENCL_INC_PATH = /opt/intel/opencl-sdk/include
+
+ifeq ($(NUM_CORES),)
+  NUM_CORES=8
+endif
+
+CPP_FLAGS = -I $(LLVM_SRC_ROOT)/include -I $(LLVM_BUILD_ROOT)/include -I $(OPENCL_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+TARGET:=visc-rt
+
+#LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang
+#LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++
+
+LLVM_CC=clang-4.0
+LLVM_CXX=clang++-4.0
+
+OPTS =
+
+ifeq ($(DEBUG),1)
+  OPTS+=-DDEBUG_BUILD
+endif
+
+all: $(TARGET:%=%.ll)
+
+$(TARGET:%=%.ll):%.ll:%.cpp %.h
+	$(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@
+
+clean :
+	rm -f $(TARGET).ll
diff --git a/llvm/projects/visc-cpu-rt/policy.h b/llvm/projects/visc-cpu-rt/policy.h
new file mode 100644
index 0000000000..436ad39295
--- /dev/null
+++ b/llvm/projects/visc-cpu-rt/policy.h
@@ -0,0 +1,103 @@
+#ifndef __POLICY__
+#define __POLICY__
+
+#include <string>
+#include "device_abstraction.h"
+
+ /************************* Policies *************************************/
+class Policy {
+  public:
+    virtual int getVersion(const char *, int64_t) = 0;
+    virtual ~Policy() {};
+};
+
+class NodePolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    std::string s(name);
+    //std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
+    std::string NodeNames[] = {
+      "WrapperGaussianSmoothing_cloned",
+      "WrapperlaplacianEstimate_cloned",
+      "WrapperComputeZeroCrossings_cloned",
+      "WrapperComputeGradient_cloned",
+      "WrapperComputeMaxGradient_cloned",
+      "WrapperRejectZeroCrossings_cloned",
+    };
+    //if (!s.compare(NodeNames[4])) {
+    //  std::cout << s << ": CPU" << "\n";
+    //  return 0;
+    //}
+    return 2;
+  }
+};
+
+class IterationPolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    if ((it % 10 == 0) || (it % 10 == 1))
+      return 0;
+    else
+      return 2;
+  }
+};
+
+class DeviceStatusPolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    if (deviceStatus) {
+      //std::cout << "Returning GPU\n";
+      return 2;
+    }
+    else {
+      //std::cout << "Returning CPU\n";
+      return 0;
+    }
+  }
+};
+
+/* ------------------------------------------------------------------------- */
+// Added for the CFAR interactive policy demo.
+
+class InteractivePolicy : public Policy {
+private:
+  // 0 :for CPU, 1 for GPU, 2 for Vector
+  unsigned int userTargetDeviceChoice;
+  // Used to end thread execution
+  bool end;
+  // Thread that will update userTargetDeviceChoice
+
+  //std::thread userTargetDeviceChoiceThread;
+  
+  // Thread function
+  void updateUserTargetChoice() {
+    while (!end) {
+      std::cout << "Select target device (0 for CPU, 1 fpr GPU): ";
+
+      std::cin >> userTargetDeviceChoice;
+      
+      if (userTargetDeviceChoice > 1) {
+        std::cout << "Invalid target device. Selecting GPU instead.\n";
+        userTargetDeviceChoice = 1;
+      }
+      
+    }
+  }
+
+public:
+  // Inherited method, erquired for every policy object
+  virtual int getVersion(const char *name, int64_t it) {
+    return userTargetDeviceChoice;
+  }
+
+  InteractivePolicy() {
+    userTargetDeviceChoice = 1;
+    end = false;
+    //userTargetDeviceChoiceThread =
+    //  std::thread(&InteractivePolicy::updateUserTargetChoice, this);
+  }
+
+  ~InteractivePolicy() {
+    end = true;
+    //userTargetDeviceChoiceThread.join(); 
+  }
+};
+
+#endif // __POLICY__
diff --git a/llvm/projects/visc-cpu-rt/visc-rt.cpp b/llvm/projects/visc-cpu-rt/visc-rt.cpp
new file mode 100644
index 0000000000..b4e9405efb
--- /dev/null
+++ b/llvm/projects/visc-cpu-rt/visc-rt.cpp
@@ -0,0 +1,1648 @@
+
+#include <iostream>
+#include <string>
+#include <pthread.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cassert>
+#include <map>
+#include <CL/cl.h>
+
+#include <unistd.h>
+
+#if _POSIX_VERSION >= 200112L
+# include <sys/time.h>
+#endif
+#include "visc-rt.h"
+
+#ifndef DEBUG_BUILD
+#define DEBUG(s) {}
+#else
+#define DEBUG(s) s
+#endif
+
+#define BILLION   1000000000LL
+
+using namespace std;
+
+typedef struct {
+  pthread_t threadID;
+  std::vector<pthread_t>* threads;
+  // Map from InputPort to Size 
+  std::map<unsigned, uint64_t>* ArgInPortSizeMap;
+  //std::vector<uint64_t>* BindInSizes;
+  std::vector<unsigned>* BindInSourcePort;
+  std::vector<uint64_t>* BindOutSizes;
+  std::vector<uint64_t>* EdgeSizes;
+  std::vector<CircularBuffer<uint64_t>*>* BindInputBuffers;
+  std::vector<CircularBuffer<uint64_t>*>* BindOutputBuffers;
+  std::vector<CircularBuffer<uint64_t>*>* EdgeBuffers;
+  std::vector<CircularBuffer<uint64_t>*>* isLastInputBuffers;
+} DFNodeContext_X86;
+
+typedef struct {
+  cl_context clOCLContext;
+  cl_command_queue clCommandQue;
+  cl_program clProgram;
+  cl_kernel clKernel;
+} DFNodeContext_OCL;
+
+cl_context globalOCLContext;
+cl_device_id* clDevices;
+cl_command_queue globalCommandQue;
+
+Policy *policy = NULL;
+MemTracker MTracker;
+vector<DFGDepth> DStack;
+// Mutex to prevent concurrent access by multiple thereads in pipeline
+pthread_mutex_t ocl_mtx;
+
+#define NUM_TESTS 1
+visc_TimerSet kernel_timer;
+
+static inline void checkErr(cl_int err, cl_int success, const char * name) {
+  if (err != success) {
+  cout << "ERROR: " << name << flush << "\n";
+  cout << "ErrorCode: " << err << flush << "\n";
+    exit(EXIT_FAILURE);
+  }
+}
+
+/************************* Policies *************************************/
+void llvm_visc_policy_init() {
+  cout << "Initializing policy object ...\n";
+//  policy = new NodePolicy();
+//  policy = new IterationPolicy();
+//  policy = new DeviceStatusPolicy();
+  policy = new InteractivePolicy();
+  cout << "DONE: Initializing policy object.\n";
+}
+
+void llvm_visc_policy_clear() {
+  if (policy) free(policy);
+}
+
+int llvm_visc_policy_getVersion(const char *name, int64_t i) {
+  return policy->getVersion(name, i);
+}
+
+/******************** Device Abstraction ********************************/
+//std::thread deviceStatusThread;
+/*
+void llvm_visc_deviceAbstraction_start() {
+  cout << "Starting device status simulation ...\n";
+  // Initialize vector with points where ti switch device status
+  initializeDeviceStatusIntervals();
+  // Create a thread that performs the changes
+  deviceStatusThread = std::thread(updateDeviceStatus);
+  cout << "Started device status simulation thread ...\n";
+  return;
+}
+*/
+
+void llvm_visc_deviceAbstraction_end() {
+  cout << "Ending device status simulation thread ...\n";
+  // Set the variable that allows the thread to know that execution has ended
+  executionEnd = true;
+  // Wait for the thread that manages device status to terminate
+  //deviceStatusThread.join();
+  cout << "Ended device status simulation.\n";
+  return;
+}
+
+void llvm_visc_deviceAbstraction_waitOnDeviceStatus() {
+  while (!deviceStatus) { };
+  return;
+}
+
+/************************* Depth Stack Routines ***************************/
+
+void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY,
+    uint64_t iY, uint64_t limitZ, uint64_t iZ) {
+  //DEBUG(cout << "Pushing node information on stack:\n");
+  //DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY << ", "<< limitZ <<")\n");
+  //DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", "<< iZ <<")\n");
+  //DFGDepth nodeInfo (n, limitX, iX, limitY, iY, limitZ, iZ);
+  //DStack.push_back(nodeInfo);
+  //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
+}
+
+void llvm_visc_x86_dstack_pop() {
+  //DEBUG(cout << "Popping from depth stack\n");
+  //DStack.pop_back();
+  //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
+}
+
+uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
+  //DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n");
+  //unsigned size = DStack.size();
+  //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n");
+  //return DStack[size-level-1].getDimLimit(dim);
+  return 0;
+}
+
+uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
+  //DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n");
+  //unsigned size = DStack.size();
+  //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n");
+  //return DStack[size-level-1].getDimInstance(dim);
+  return 0;
+}
+
+/********************** Memory Tracking Routines **************************/
+
+void llvm_visc_track_mem(void* ptr, size_t size) {
+  DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
+  MemTrackerEntry* MTE = MTracker.lookup(ptr);
+  if(MTE != NULL) {
+    DEBUG(cout << "ID " << ptr << " already present in the MemTracker Table\n");
+    return;
+  }
+  DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n");
+  MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr);
+  DEBUG(MTracker.print());
+}
+
+void llvm_visc_untrack_mem(void* ptr) {
+  DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
+  MemTrackerEntry* MTE = MTracker.lookup(ptr);
+  if(MTE == NULL) {
+    cout << "WARNING: Trying to remove ID " << ptr << " not present in the MemTracker Table\n";
+    return;
+  }
+  DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n");
+  if(MTE->getLocation() == MemTrackerEntry::DEVICE)
+    clReleaseMemObject((cl_mem) MTE->getAddress());
+  MTracker.remove(ptr);
+  DEBUG(MTracker.print());
+}
+
+
+static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clOCLContext << flush << "\n");
+  MemTrackerEntry* MTE = MTracker.lookup(ptr);
+  if (MTE == NULL) {
+    MTracker.print();
+    cout << "ERROR: Requesting memory not present in Table\n";
+    exit(EXIT_FAILURE);
+  }
+  // If already on device
+  if (MTE->getLocation() == MemTrackerEntry::DEVICE &&
+      ((DFNodeContext_OCL*)MTE->getContext())->clOCLContext == Context->clOCLContext) {
+    DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); 
+    pthread_mutex_unlock(&ocl_mtx);
+    return MTE->getAddress();
+  }
+  
+  DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n");
+  DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n");
+  // Else copy and update the latest copy
+  cl_mem_flags clFlags;
+  cl_int errcode;
+
+  if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE;
+  else if(isInput)        clFlags = CL_MEM_READ_ONLY;
+  else if(isOutput)       clFlags = CL_MEM_WRITE_ONLY;
+  else                    clFlags = CL_MEM_READ_ONLY;
+
+  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
+  //pthread_mutex_lock(&ocl_mtx);
+  cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
+  DEBUG(cout<< "\nMemory allocated on device: " << d_input << flush << "\n");
+  if(isInput) {
+    DEBUG(cout << "\tCopying ...");
+    //pthread_mutex_lock(&ocl_mtx);
+    errcode = clEnqueueWriteBuffer(Context->clCommandQue,
+                                  d_input,
+                                  CL_TRUE,
+                                  0,
+                                  size,MTE->getAddress(),
+                                  0,NULL,NULL);
+    //pthread_mutex_unlock(&ocl_mtx);
+    checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device");
+  }
+
+  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  DEBUG(cout << " done\n");
+  MTE->update(MemTrackerEntry::DEVICE, (void*) d_input, Context);
+  DEBUG(cout << "Updated Table\n");
+  DEBUG(MTracker.print());
+  pthread_mutex_unlock(&ocl_mtx);
+  return d_input;
+}
+
+void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) {
+  return llvm_visc_request_mem(ptr, size);
+}
+
+void* llvm_visc_request_mem(void* ptr, size_t size) {
+  // Ignore objects whose size is 0 - no memory is requested.
+  if (size == 0) {
+    DEBUG(cout << "[X86] Request memory (ignored): " << ptr << flush << "\n");
+    return ptr;
+  }
+
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n");
+  MemTrackerEntry* MTE = MTracker.lookup(ptr);
+  if(MTE == NULL) {
+    cout << "ERROR: Requesting memory not present in Table\n";
+    pthread_mutex_unlock(&ocl_mtx);
+    exit(EXIT_FAILURE);
+  }
+  // If already on host
+  if(MTE->getLocation() == MemTrackerEntry::HOST) {
+    DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); 
+    pthread_mutex_unlock(&ocl_mtx);
+    return MTE->getAddress();
+  }
+
+  // Else copy from device and update table
+  DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n");
+  DEBUG(cout << "\tCopying ...");
+  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
+  //pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue,
+                                      (cl_mem) MTE->getAddress(),
+                                      CL_TRUE,
+                                      0,
+                                      size,
+                                      ptr,
+                                      0, NULL, NULL);
+  //pthread_mutex_unlock(&ocl_mtx);
+  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  DEBUG(cout << " done\n");
+  checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output");
+  DEBUG(cout << "Free mem object on device\n");
+  clReleaseMemObject((cl_mem) MTE->getAddress());
+  DEBUG(cout << "Updated Table\n");
+  MTE->update(MemTrackerEntry::HOST, ptr);
+  DEBUG(MTracker.print());
+  pthread_mutex_unlock(&ocl_mtx);
+  return ptr;
+}
+
+/*************************** Timer Routines **********************************/
+
+static int is_async(enum visc_TimerID timer)
+{
+  return (timer == visc_TimerID_KERNEL) ||
+             (timer == visc_TimerID_COPY_ASYNC);
+}
+
+static int is_blocking(enum visc_TimerID timer)
+{
+  return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE);
+}
+
+#define INVALID_TIMERID visc_TimerID_LAST
+
+static int asyncs_outstanding(struct visc_TimerSet* timers)
+{
+  return (timers->async_markers != NULL) &&
+           (timers->async_markers->timerID != INVALID_TIMERID);
+}
+
+static struct visc_async_time_marker_list *
+get_last_async(struct visc_TimerSet* timers)
+{
+  /* Find the last event recorded thus far */
+  struct visc_async_time_marker_list * last_event = timers->async_markers;
+  if(last_event != NULL && last_event->timerID != INVALID_TIMERID) {
+    while(last_event->next != NULL &&
+            last_event->next->timerID != INVALID_TIMERID)
+      last_event = last_event->next;
+    return last_event;
+  } else
+    return NULL;
+}
+
+static void insert_marker(struct visc_TimerSet* tset, enum visc_TimerID timer)
+{
+  cl_int ciErrNum = CL_SUCCESS;
+  struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
+
+  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
+    new_event = &((*new_event)->next);
+  }
+
+  if(*new_event == NULL) {
+    *new_event = (struct visc_async_time_marker_list *)
+      			malloc(sizeof(struct visc_async_time_marker_list));
+    (*new_event)->marker = calloc(1, sizeof(cl_event));
+    /*
+    // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event
+#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
+fprintf(stderr, "Creating Marker [%d]\n", timer);
+    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Creating User Event Object!\n");
+    }
+    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Setting User Event Status!\n");
+    }
+#endif
+*/
+    (*new_event)->next = NULL;
+  }
+
+  /* valid event handle now aquired: insert the event record */
+  (*new_event)->label = NULL;
+  (*new_event)->timerID = timer;
+  //pthread_mutex_lock(&ocl_mtx);
+  ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
+  //pthread_mutex_unlock(&ocl_mtx);
+  if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Enqueueing Marker!\n");
+  }
+
+}
+
+static void insert_submarker(struct visc_TimerSet* tset, char *label, enum visc_TimerID timer)
+{
+  cl_int ciErrNum = CL_SUCCESS;
+  struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
+
+  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
+    new_event = &((*new_event)->next);
+  }
+
+  if(*new_event == NULL) {
+    *new_event = (struct visc_async_time_marker_list *)
+      			malloc(sizeof(struct visc_async_time_marker_list));
+    (*new_event)->marker = calloc(1, sizeof(cl_event));
+    /*
+#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
+fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
+    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Creating User Event Object!\n");
+    }
+    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Setting User Event Status!\n");
+    }
+#endif
+*/
+    (*new_event)->next = NULL;
+  }
+
+  /* valid event handle now aquired: insert the event record */
+  (*new_event)->label = label;
+  (*new_event)->timerID = timer;
+  //pthread_mutex_lock(&ocl_mtx);
+  ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
+  //pthread_mutex_unlock(&ocl_mtx);
+  if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Enqueueing Marker!\n");
+  }
+
+}
+
+
+/* Assumes that all recorded events have completed */
+static visc_Timestamp record_async_times(struct visc_TimerSet* tset)
+{
+  struct visc_async_time_marker_list * next_interval = NULL;
+  struct visc_async_time_marker_list * last_marker = get_last_async(tset);
+  visc_Timestamp total_async_time = 0;
+
+  for(next_interval = tset->async_markers; next_interval != last_marker;
+      next_interval = next_interval->next) {
+    cl_ulong command_start=0, command_end=0;
+    cl_int ciErrNum = CL_SUCCESS;
+
+    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum);
+    }
+
+    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum);
+    }
+
+    visc_Timestamp interval = (visc_Timestamp) (((double)(command_end - command_start)));
+    tset->timers[next_interval->timerID].elapsed += interval;
+    if (next_interval->label != NULL) {
+      struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list;
+      while (subtimer != NULL) {
+        if ( strcmp(subtimer->label, next_interval->label) == 0) {
+          subtimer->timer.elapsed += interval;
+          break;
+        }
+        subtimer = subtimer->next;
+      }
+    }
+    total_async_time += interval;
+    next_interval->timerID = INVALID_TIMERID;
+  }
+
+  if(next_interval != NULL)
+    next_interval->timerID = INVALID_TIMERID;
+
+  return total_async_time;
+}
+
+static void
+accumulate_time(visc_Timestamp *accum,
+		visc_Timestamp start,
+		visc_Timestamp end)
+{
+#if _POSIX_VERSION >= 200112L
+  *accum += end - start;
+#else
+# error "Timestamps not implemented for this system"
+#endif
+}
+
+#if _POSIX_VERSION >= 200112L
+static visc_Timestamp get_time()
+{
+  struct timespec tv;
+  clock_gettime(CLOCK_MONOTONIC, &tv);
+  return (visc_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec);
+}
+#else
+# error "no supported time libraries are available on this platform"
+#endif
+
+void
+visc_ResetTimer(struct visc_Timer *timer)
+{
+  timer->state = visc_Timer_STOPPED;
+
+#if _POSIX_VERSION >= 200112L
+  timer->elapsed = 0;
+#else
+# error "visc_ResetTimer: not implemented for this system"
+#endif
+}
+
+void
+visc_StartTimer(struct visc_Timer *timer)
+{
+  if (timer->state != visc_Timer_STOPPED) {
+    // FIXME: Removing warning statement to avoid printing this error
+    // fputs("Ignoring attempt to start a running timer\n", stderr);
+    return;
+  }
+
+  timer->state = visc_Timer_RUNNING;
+
+#if _POSIX_VERSION >= 200112L
+  {
+    struct timespec tv;
+    clock_gettime(CLOCK_MONOTONIC, &tv);
+    timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
+  }
+#else
+# error "visc_StartTimer: not implemented for this system"
+#endif
+}
+
+void
+visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer)
+{
+
+  unsigned int numNotStopped = 0x3; // 11
+  if (timer->state != visc_Timer_STOPPED) {
+    fputs("Warning: Timer was not stopped\n", stderr);
+    numNotStopped &= 0x1; // Zero out 2^1
+  }
+  if (subtimer->state != visc_Timer_STOPPED) {
+    fputs("Warning: Subtimer was not stopped\n", stderr);
+    numNotStopped &= 0x2; // Zero out 2^0
+  }
+  if (numNotStopped == 0x0) {
+    //fputs("Ignoring attempt to start running timer and subtimer\n", stderr);
+    return;
+  }
+
+  timer->state = visc_Timer_RUNNING;
+  subtimer->state = visc_Timer_RUNNING;
+
+#if _POSIX_VERSION >= 200112L
+  {
+    struct timespec tv;
+    clock_gettime(CLOCK_MONOTONIC, &tv);
+
+    if (numNotStopped & 0x2) {
+      timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
+    }
+
+    if (numNotStopped & 0x1) {
+      subtimer->init = tv.tv_sec * BILLION + tv.tv_nsec;
+    }
+  }
+#else
+# error "visc_StartTimer: not implemented for this system"
+#endif
+
+}
+
+void
+visc_StopTimer(struct visc_Timer *timer)
+{
+  visc_Timestamp fini;
+
+  if (timer->state != visc_Timer_RUNNING) {
+    //fputs("Ignoring attempt to stop a stopped timer\n", stderr);
+    return;
+  }
+
+  timer->state = visc_Timer_STOPPED;
+
+#if _POSIX_VERSION >= 200112L
+  {
+    struct timespec tv;
+    clock_gettime(CLOCK_MONOTONIC, &tv);
+    fini = tv.tv_sec * BILLION + tv.tv_nsec;
+  }
+#else
+# error "visc_StopTimer: not implemented for this system"
+#endif
+
+  accumulate_time(&timer->elapsed, timer->init, fini);
+  timer->init = fini;
+}
+
+void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) {
+
+  visc_Timestamp fini;
+
+  unsigned int numNotRunning = 0x3; // 11
+  if (timer->state != visc_Timer_RUNNING) {
+    fputs("Warning: Timer was not running\n", stderr);
+    numNotRunning &= 0x1; // Zero out 2^1
+  }
+  if (subtimer->state != visc_Timer_RUNNING) {
+    fputs("Warning: Subtimer was not running\n", stderr);
+    numNotRunning &= 0x2; // Zero out 2^0
+  }
+  if (numNotRunning == 0x0) {
+    //fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr);
+    return;
+  }
+
+
+  timer->state = visc_Timer_STOPPED;
+  subtimer->state = visc_Timer_STOPPED;
+
+#if _POSIX_VERSION >= 200112L
+  {
+    struct timespec tv;
+    clock_gettime(CLOCK_MONOTONIC, &tv);
+    fini = tv.tv_sec * BILLION + tv.tv_nsec;
+  }
+#else
+# error "visc_StopTimer: not implemented for this system"
+#endif
+
+  if (numNotRunning & 0x2) {
+    accumulate_time(&timer->elapsed, timer->init, fini);
+    timer->init = fini;
+  }
+
+  if (numNotRunning & 0x1) {
+    accumulate_time(&subtimer->elapsed, subtimer->init, fini);
+    subtimer->init = fini;
+  }
+
+}
+
+/* Get the elapsed time in seconds. */
+double
+visc_GetElapsedTime(struct visc_Timer *timer)
+{
+  double ret;
+
+  if (timer->state != visc_Timer_STOPPED) {
+    fputs("Elapsed time from a running timer is inaccurate\n", stderr);
+  }
+
+#if _POSIX_VERSION >= 200112L
+  ret = timer->elapsed / 1e9;
+#else
+# error "visc_GetElapsedTime: not implemented for this system"
+#endif
+  return ret;
+}
+
+void
+visc_InitializeTimerSet(struct visc_TimerSet *timers)
+{
+  int n;
+
+  timers->wall_begin = get_time();
+  timers->current = visc_TimerID_NONE;
+
+  timers->async_markers = NULL;
+
+  for (n = 0; n < visc_TimerID_LAST; n++) {
+    visc_ResetTimer(&timers->timers[n]);
+    timers->sub_timer_list[n] = NULL;
+  }
+}
+
+
+void
+visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category) {
+
+  struct visc_SubTimer *subtimer = (struct visc_SubTimer *) malloc
+    (sizeof(struct visc_SubTimer));
+
+  int len = strlen(label);
+
+  subtimer->label = (char *) malloc (sizeof(char)*(len+1));
+  sprintf(subtimer->label, "%s", label);
+
+  visc_ResetTimer(&subtimer->timer);
+  subtimer->next = NULL;
+
+  struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[visc_Category];
+  if (subtimerlist == NULL) {
+    subtimerlist = (struct visc_SubTimerList *) calloc
+      (1, sizeof(struct visc_SubTimerList));
+    subtimerlist->subtimer_list = subtimer;
+    timers->sub_timer_list[visc_Category] = subtimerlist;
+  } else {
+    // Append to list
+    struct visc_SubTimer *element = subtimerlist->subtimer_list;
+    while (element->next != NULL) {
+      element = element->next;
+    }
+    element->next = subtimer;
+  }
+
+}
+
+void
+visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
+{
+  //cerr << "Switch to timer: " << timer << flush << "\n";
+  /* Stop the currently running timer */
+  if (timers->current != visc_TimerID_NONE) {
+    struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
+    struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
+
+    if (!is_async(timers->current) ) {
+      if (timers->current != timer) {
+        if (currSubTimer != NULL) {
+          visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
+        } else {
+          visc_StopTimer(&timers->timers[timers->current]);
+        }
+      } else {
+        if (currSubTimer != NULL) {
+          visc_StopTimer(&currSubTimer->timer);
+        }
+      }
+    } else {
+      insert_marker(timers, timer);
+      if (!is_async(timer)) { // if switching to async too, keep driver going
+        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
+      }
+    }
+  }
+
+  visc_Timestamp currentTime = get_time();
+
+  /* The only cases we check for asynchronous task completion is
+   * when an overlapping CPU operation completes, or the next
+   * segment blocks on completion of previous async operations */
+  if( asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(timer) ) ) {
+
+    struct visc_async_time_marker_list * last_event = get_last_async(timers);
+    /* CL_COMPLETE if completed */
+
+    cl_int ciErrNum = CL_SUCCESS;
+    cl_int async_done = CL_COMPLETE;
+
+    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stdout, "Error Querying EventInfo1!\n");
+    }
+
+
+    if(is_blocking(timer)) {
+      /* Async operations completed after previous CPU operations:
+       * overlapped time is the total CPU time since this set of async
+       * operations were first issued */
+
+      // timer to switch to is COPY or NONE
+      if(async_done != CL_COMPLETE) {
+        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
+	                  timers->async_begin,currentTime);
+      }
+
+      /* Wait on async operation completion */
+      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
+      if (ciErrNum != CL_SUCCESS) {
+        fprintf(stderr, "Error Waiting for Events!\n");
+      }
+
+      visc_Timestamp total_async_time = record_async_times(timers);
+
+      /* Async operations completed before previous CPU operations:
+       * overlapped time is the total async time */
+      if(async_done == CL_COMPLETE) {
+        //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time);
+        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
+      }
+
+    } else
+    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
+    // so something is deeper in stack
+    if(async_done == CL_COMPLETE ) {
+      /* Async operations completed before previous CPU operations:
+       * overlapped time is the total async time */
+      timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
+    }
+  }
+
+  /* Start the new timer */
+  if (timer != visc_TimerID_NONE) {
+    if(!is_async(timer)) {
+      visc_StartTimer(&timers->timers[timer]);
+    } else {
+      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
+      if (!asyncs_outstanding(timers)) {
+        /* No asyncs outstanding, insert a fresh async marker */
+
+        insert_marker(timers, timer);
+        timers->async_begin = currentTime;
+      } else if(!is_async(timers->current)) {
+        /* Previous asyncs still in flight, but a previous SwitchTo
+         * already marked the end of the most recent async operation,
+         * so we can rename that marker as the beginning of this async
+         * operation */
+
+        struct visc_async_time_marker_list * last_event = get_last_async(timers);
+        last_event->label = NULL;
+        last_event->timerID = timer;
+      }
+      if (!is_async(timers->current)) {
+        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
+      }
+    }
+  }
+  timers->current = timer;
+
+}
+
+void
+visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category)
+{
+  struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
+  struct visc_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
+
+  if (timers->current != visc_TimerID_NONE) {
+    if (!is_async(timers->current) ) {
+      if (timers->current != category) {
+        if (curr != NULL) {
+          visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
+        } else {
+          visc_StopTimer(&timers->timers[timers->current]);
+        }
+      } else {
+        if (curr != NULL) {
+          visc_StopTimer(&curr->timer);
+        }
+      }
+    } else {
+      insert_submarker(timers, label, category);
+      if (!is_async(category)) { // if switching to async too, keep driver going
+        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
+      }
+    }
+  }
+
+  visc_Timestamp currentTime = get_time();
+
+  /* The only cases we check for asynchronous task completion is
+   * when an overlapping CPU operation completes, or the next
+   * segment blocks on completion of previous async operations */
+  if( asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(category) ) ) {
+
+    struct visc_async_time_marker_list * last_event = get_last_async(timers);
+    /* CL_COMPLETE if completed */
+
+    cl_int ciErrNum = CL_SUCCESS;
+    cl_int async_done = CL_COMPLETE;
+
+    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stdout, "Error Querying EventInfo2!\n");
+    }
+
+    if(is_blocking(category)) {
+      /* Async operations completed after previous CPU operations:
+       * overlapped time is the total CPU time since this set of async
+       * operations were first issued */
+
+      // timer to switch to is COPY or NONE
+      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
+      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
+      // it seems that the extra sync wall time isn't being recorded anywhere
+      if(async_done != CL_COMPLETE)
+        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
+	                  timers->async_begin,currentTime);
+
+      /* Wait on async operation completion */
+      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
+      if (ciErrNum != CL_SUCCESS) {
+        fprintf(stderr, "Error Waiting for Events!\n");
+      }
+      visc_Timestamp total_async_time = record_async_times(timers);
+
+      /* Async operations completed before previous CPU operations:
+       * overlapped time is the total async time */
+       // If it did finish, then accumulate all the async time that did happen into OVERLAP
+       // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
+      if(async_done == CL_COMPLETE /*cudaSuccess*/)
+        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
+
+    } else
+    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
+    // so something is deeper in stack
+    if(async_done == CL_COMPLETE /*cudaSuccess*/) {
+      /* Async operations completed before previous CPU operations:
+       * overlapped time is the total async time */
+      timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
+    }
+    // else, this isn't blocking, so just check the next time around
+  }
+
+  subtimerlist = timers->sub_timer_list[category];
+  struct visc_SubTimer *subtimer = NULL;
+
+  if (label != NULL) {
+    subtimer = subtimerlist->subtimer_list;
+    while (subtimer != NULL) {
+      if (strcmp(subtimer->label, label) == 0) {
+        break;
+      } else {
+        subtimer = subtimer->next;
+      }
+    }
+  }
+
+  /* Start the new timer */
+  if (category != visc_TimerID_NONE) {
+    if(!is_async(category)) {
+      if (subtimerlist != NULL) {
+        subtimerlist->current = subtimer;
+      }
+
+      if (category != timers->current && subtimer != NULL) {
+        visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
+      } else if (subtimer != NULL) {
+        visc_StartTimer(&subtimer->timer);
+      } else {
+        visc_StartTimer(&timers->timers[category]);
+      }
+    } else {
+      if (subtimerlist != NULL) {
+        subtimerlist->current = subtimer;
+      }
+
+      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
+      if (!asyncs_outstanding(timers)) {
+        /* No asyncs outstanding, insert a fresh async marker */
+        insert_submarker(timers, label, category);
+        timers->async_begin = currentTime;
+      } else if(!is_async(timers->current)) {
+        /* Previous asyncs still in flight, but a previous SwitchTo
+         * already marked the end of the most recent async operation,
+         * so we can rename that marker as the beginning of this async
+         * operation */
+
+        struct visc_async_time_marker_list * last_event = get_last_async(timers);
+        last_event->timerID = category;
+        last_event->label = label;
+      } // else, marker for switchToThis was already inserted
+
+      //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
+      if (!is_async(timers->current)) {
+        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
+      }
+    }
+  }
+
+  timers->current = category;
+}
+
+void
+visc_PrintTimerSet(struct visc_TimerSet *timers)
+{
+  visc_Timestamp wall_end = get_time();
+
+  struct visc_Timer *t = timers->timers;
+  struct visc_SubTimer* sub = NULL;
+
+  int maxSubLength;
+
+  const char *categories[] = {
+    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap",
+    "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free",
+    "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc",
+    "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack"
+
+  };
+
+  const int maxCategoryLength = 20;
+
+  int i;
+  for(i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format
+    if(visc_GetElapsedTime(&t[i]) != 0 || true) {
+
+      // Print Category Timer
+      printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], visc_GetElapsedTime(&t[i]));
+
+      if (timers->sub_timer_list[i] != NULL) {
+        sub = timers->sub_timer_list[i]->subtimer_list;
+        maxSubLength = 0;
+        while (sub != NULL) {
+          // Find longest SubTimer label
+          if (strlen(sub->label) > (unsigned long) maxSubLength) {
+            maxSubLength = strlen(sub->label);
+          }
+          sub = sub->next;
+        }
+
+        // Fit to Categories
+        if (maxSubLength <= maxCategoryLength) {
+         maxSubLength = maxCategoryLength;
+        }
+
+        sub = timers->sub_timer_list[i]->subtimer_list;
+
+        // Print SubTimers
+        while (sub != NULL) {
+          printf(" -%-*s: %.9f\n", maxSubLength, sub->label, visc_GetElapsedTime(&sub->timer));
+          sub = sub->next;
+        }
+      }
+    }
+  }
+
+  if(visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0)
+    printf("CPU/Kernel Overlap: %.9f\n", visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]));
+
+  float walltime = (wall_end - timers->wall_begin)/ 1e9;
+  printf("Timer Wall Time: %.9f\n", walltime);
+
+}
+
+void visc_DestroyTimerSet(struct visc_TimerSet * timers)
+{
+  /* clean up all of the async event markers */
+  struct visc_async_time_marker_list* event = timers->async_markers;
+  while(event != NULL) {
+
+    cl_int ciErrNum = CL_SUCCESS;
+    ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker);
+    if (ciErrNum != CL_SUCCESS) {
+      //fprintf(stderr, "Error Waiting for Events!\n");
+    }
+
+    ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) );
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Release Events!\n");
+    }
+
+    free((event)->marker);
+    struct visc_async_time_marker_list* next = ((event)->next);
+
+    free(event);
+
+    // (*event) = NULL;
+    event = next;
+  }
+
+  int i = 0;
+  for(i = 0; i < visc_TimerID_LAST; ++i) {
+    if (timers->sub_timer_list[i] != NULL) {
+      struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
+      struct visc_SubTimer *prev = NULL;
+      while (subtimer != NULL) {
+        free(subtimer->label);
+        prev = subtimer;
+        subtimer = subtimer->next;
+        free(prev);
+      }
+      free(timers->sub_timer_list[i]);
+    }
+  }
+}
+
+/**************************** Pipeline API ************************************/
+#define BUFFER_SIZE 1
+
+// Launch API for a streaming dataflow graph
+void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void* args) {
+  DFNodeContext_X86* Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
+
+  Context->threads = new std::vector<pthread_t>();
+  Context->ArgInPortSizeMap = new std::map<unsigned, uint64_t>();
+  //Context->BindInSizes = new std::vector<uint64_t>();
+  Context->BindInSourcePort = new std::vector<unsigned>();
+  Context->BindOutSizes = new std::vector<uint64_t>();
+  Context->EdgeSizes = new std::vector<uint64_t>();
+  Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
+  Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
+  Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t>*>();
+  Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
+
+  DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args << flush << "\n");
+  LaunchFunc(args, Context);
+  return Context;
+}
+
+// Push API for a streaming dataflow graph
+void llvm_visc_streamPush(void* graphID, void* args) {
+  DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n");
+  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+  unsigned offset = 0;
+  for (unsigned i=0; i< Ctx->ArgInPortSizeMap->size(); i++) {
+    uint64_t element;
+    memcpy(&element, (char*)args+offset, Ctx->ArgInPortSizeMap->at(i));
+    offset += Ctx->ArgInPortSizeMap->at(i);
+    for(unsigned j=0; j<Ctx->BindInputBuffers->size();j++) {
+      if(Ctx->BindInSourcePort->at(j) == i) {
+        // Push to all bind buffers connected to parent node at this port
+        //DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
+        llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element);
+      }
+    }
+  }
+  // Push 0 in isLastInput buffers of all child nodes
+  for (CircularBuffer<uint64_t>* buffer: *(Ctx->isLastInputBuffers))
+    llvm_visc_bufferPush(buffer, 0);
+}
+
+// Pop API for a streaming dataflow graph
+void* llvm_visc_streamPop(void* graphID) {
+  DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n");
+  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+  unsigned totalBytes = 0;
+  for(uint64_t size:  *(Ctx->BindOutSizes))
+    totalBytes+= size;
+  void* output = malloc(totalBytes);
+  unsigned offset = 0;
+  for (unsigned i=0; i< Ctx->BindOutputBuffers->size(); i++) {
+    uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i));
+    //DEBUG(cout << "\tPopped Value " << element << " from buffer\n");
+    memcpy((char*)output+offset, &element, Ctx->BindOutSizes->at(i));
+    offset += Ctx->BindOutSizes->at(i);
+  }
+  return output;
+}
+
+// Wait API for a streaming dataflow graph
+void llvm_visc_streamWait(void* graphID) {
+  DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n");
+  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+  // Push garbage to all other input buffers
+  for (unsigned i=0; i< Ctx->BindInputBuffers->size(); i++) {
+    uint64_t element = 0;
+    //DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
+    llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element);
+  }
+  // Push 1 in isLastInput buffers of all child nodes
+  for (unsigned i=0; i < Ctx->isLastInputBuffers->size(); i++)
+    llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
+
+  llvm_visc_freeThreads(graphID);
+}
+
+// Create a buffer and return the bufferID
+void* llvm_visc_createBindInBuffer(void* graphID, uint64_t size, unsigned inArgPort) {
+  DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
+  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn");
+  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
+  Context->BindInputBuffers->push_back(bufferID);
+  (*(Context->ArgInPortSizeMap))[inArgPort] = size;
+  Context->BindInSourcePort->push_back(inArgPort);
+  //Context->BindInSizes->push_back(size);
+  return bufferID;
+}
+
+void* llvm_visc_createBindOutBuffer(void* graphID, uint64_t size) {
+  DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
+  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  //Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size());
+  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut");
+  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
+  Context->BindOutputBuffers->push_back(bufferID);
+  Context->BindOutSizes->push_back(size);
+  return bufferID;
+}
+void* llvm_visc_createEdgeBuffer(void* graphID, uint64_t size) {
+  DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
+  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  //Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size());
+  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge");
+  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
+  Context->EdgeBuffers->push_back(bufferID);
+  Context->EdgeSizes->push_back(size);
+  return bufferID;
+}
+
+void* llvm_visc_createLastInputBuffer(void* graphID, uint64_t size) {
+  DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
+  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  //Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size());
+  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput");
+  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
+  Context->isLastInputBuffers->push_back(bufferID);
+  return bufferID;
+}
+
+// Free buffers 
+void llvm_visc_freeBuffers(void* graphID) {
+  DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n");
+  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  for(CircularBuffer<uint64_t>* bufferID: *(Context->BindInputBuffers))
+    delete bufferID;
+  for(CircularBuffer<uint64_t>* bufferID: *(Context->BindOutputBuffers))
+    delete bufferID;
+  for(CircularBuffer<uint64_t>* bufferID: *(Context->EdgeBuffers))
+    delete bufferID;
+  for(CircularBuffer<uint64_t>* bufferID: *(Context->isLastInputBuffers))
+    delete bufferID;
+}
+
+// Pop an element from the buffer
+uint64_t llvm_visc_bufferPop(void* bufferID) {
+  CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
+  return buffer->pop();
+}
+
+// Push an element into the buffer
+void llvm_visc_bufferPush(void* bufferID, uint64_t element) {
+  CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
+  buffer->push(element);
+}
+
+// Create a thread
+void llvm_visc_createThread(void* graphID, void* (*Func)(void*), void* arguments) {
+  DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n");
+  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+  int err;
+  pthread_t threadID;
+  if((err = pthread_create(&threadID, NULL, Func, arguments)) != 0)
+    cout << "Failed to create thread. Error code = " << err << flush << "\n";
+
+  Ctx->threads->push_back(threadID);
+}
+
+// Wait for thread to finish
+void llvm_visc_freeThreads(void* graphID) {
+  DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n");
+  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+  for(pthread_t thread: *(Ctx->threads))
+    pthread_join(thread, NULL);
+}
+
+/************************ OPENCL & PTHREAD API ********************************/
+
+void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) {
+  DFNodeContext_X86 *Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
+  //int err;
+  //if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0)
+    //cout << "Failed to create pthread. Error code = " << err << flush << "\n";
+  rootFunc(arguments);
+  return Context;
+}
+
+void llvm_visc_x86_wait(void* graphID) {
+  DEBUG(cout << "Waiting for pthread to finish ...\n");
+  //DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  //pthread_join(Context->threadID, NULL);
+  free(graphID);
+  DEBUG(cout << "\t... pthread Done!\n");
+}
+
+void* llvm_visc_ocl_initContext(enum visc::Target T) {
+  pthread_mutex_lock(&ocl_mtx);
+  //DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR");
+  DEBUG(const char* Target = T == visc::GPU_TARGET? "GPU" : "SPIR");
+  DEBUG(cout << "Initializing Context for " << Target << " device\n");
+  cl_uint numPlatforms;
+  cl_int errcode;
+  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
+  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
+
+  // now get all the platform IDs
+  cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
+  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
+
+
+  for(unsigned i=0; i < numPlatforms; i++) {
+    char buffer[10240];
+    DEBUG(cout << "Device " << i << " Info -->\n");
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
+    DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n");
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
+    DEBUG(cout << "\tVERSION = "<< buffer << flush << "\n");
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
+    DEBUG(cout << "\tNAME = " << buffer << flush << "\n");
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
+    DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n");
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+    DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n");
+  }
+  // set platform property - just pick the first one
+  //cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        //(long) platforms[0],
+                                        //0};
+  //globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
+                                         //NULL, NULL, &errcode);
+  assert(numPlatforms >= 2 && "Expecting two OpenCL platforms");
+  // Choose second one which is X86 AVX
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (long) platforms[T == visc::GPU_TARGET? 0 : 1],
+                                        0};
+  globalOCLContext = clCreateContextFromType(properties,
+                                            T == visc::GPU_TARGET?
+                                              CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU,
+                                       NULL, NULL, &errcode);
+  // get the list of OCL devices associated with context
+  size_t dataBytes;
+  errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0,
+                              NULL, &dataBytes);
+  checkErr(errcode, CL_SUCCESS, "Failure to get context info length");
+
+  clDevices = (cl_device_id *) malloc(dataBytes);
+  errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes,
+                              clDevices, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to get context info");
+  if(false && T == visc::SPIR_TARGET) {
+    cl_device_partition_property props[4];
+    props[0] = CL_DEVICE_PARTITION_BY_COUNTS;
+    props[1] = NUM_CORES;
+    props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;
+    props[3] = 0;
+    cl_device_id subdevice_id[8];
+    cl_uint num_entries = 8;
+
+    cl_uint numDevices;
+    clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, &numDevices);
+    //printf("Num of devices = %d\n", numDevices);
+    //for(unsigned i =0 ; i< numDevices; i++)
+      //printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
+    clDevices[0] = subdevice_id[0];
+    globalOCLContext = clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode);
+    checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
+  }
+
+  free(platforms);
+  DEBUG(cout << "\tContext " << globalOCLContext << flush << "\n");
+  checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
+
+  DEBUG(cout << "Initialize Kernel Timer\n");
+  visc_InitializeTimerSet(&kernel_timer);
+
+  pthread_mutex_unlock(&ocl_mtx);
+  return globalOCLContext;
+}
+
+void llvm_visc_ocl_clearContext(void* graphID) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Clear Context\n");
+  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  // FIXME: Have separate function to release command queue and clear context.
+  // Would be useful when a context has multiple command queues
+  clReleaseKernel(Context->clKernel);
+  //clReleaseProgram(Context->clProgram);
+  //clReleaseCommandQueue(Context->clCommandQue);
+  //clReleaseContext(globalOCLContext);
+  //DEBUG(cout << "Released context at: " << globalOCLContext);
+  free(Context);
+  DEBUG(cout << "Done with OCL kernel\n");
+  cout << "Printing VISC Timer: KernelTimer\n";
+  visc_PrintTimerSet(&kernel_timer);
+  pthread_mutex_unlock(&ocl_mtx);
+
+}
+
+void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Set Shared Memory Input:");
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
+  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  DEBUG(cout << "Using Context: " << Context << flush << "\n");
+  DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
+  //pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument");
+  pthread_mutex_unlock(&ocl_mtx);
+}
+
+void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Set Scalar Input:");
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
+  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  DEBUG(cout << "Using Context: " << Context << flush << "\n");
+  DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
+  //pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+  pthread_mutex_unlock(&ocl_mtx);
+}
+
+void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Set Pointer Input:");
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << flush << "\n");
+  // Size should be non-zero
+  assert(size != 0 && "Size of data pointed to has to be non-zero!");
+  DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << flush << "\n");
+  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  
+  pthread_mutex_unlock(&ocl_mtx);
+  // Check with runtime the location of this memory 
+  cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput);
+  
+  pthread_mutex_lock(&ocl_mtx);
+  // Set Kernel Argument
+  //pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
+  DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n");
+  pthread_mutex_unlock(&ocl_mtx);
+  return d_input;
+}
+
+void* llvm_visc_ocl_output_ptr(void* graphID, int arg_index, size_t size) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Set device memory for Output Struct:");
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = "<< size << flush << "\n");
+  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  cl_int errcode;
+  //pthread_mutex_lock(&ocl_mtx);
+  cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, size, NULL, &errcode);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device");
+  //pthread_mutex_lock(&ocl_mtx);
+  errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
+  DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n");
+  pthread_mutex_unlock(&ocl_mtx);
+  return d_output;
+}
+
+void llvm_visc_ocl_free(void* ptr) {
+  //DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n");
+  //cl_mem d_ptr = (cl_mem) ptr;
+  //clReleaseMemObject(d_ptr);
+}
+
+void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Get Output:\n");
+  DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << flush << "\n");
+  if(h_output == NULL)
+    h_output = malloc(size);
+  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  //pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
+                                h_output, 0, NULL, NULL);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output");
+  pthread_mutex_unlock(&ocl_mtx);
+  return h_output;
+}
+
+void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t*
+                                localWorkSize, const size_t* globalWorkSize) {
+  pthread_mutex_lock(&ocl_mtx);
+
+  size_t GlobalWG[3];
+  size_t LocalWG[3];
+
+  // OpenCL EnqeueNDRangeKernel function results in segementation fault if we
+  // directly use local and global work groups arguments. Hence, allocating it
+  // on stack and copying.
+  for(unsigned i=0; i<workDim; i++) {
+    GlobalWG[i] = globalWorkSize[i];
+  }
+
+  // OpenCL allows local workgroup to be null.
+  if(localWorkSize != NULL) {
+    for(unsigned i=0; i<workDim; i++) {
+      LocalWG[i] = localWorkSize[i];
+    }
+  }
+
+  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  // TODO: Would like to use event to ensure better scheduling of kernels.
+  // Currently passing the event paratemeter results in seg fault with
+  // clEnqueueNDRangeKernel.
+  cl_event* event;
+  DEBUG(cout << "Enqueuing kernel:\n");
+  DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n");
+  DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n");
+  DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n");
+  DEBUG(cout << "\tGlobal Work Group: ( ");
+  for(unsigned i = 0; i<workDim; i++) {
+    DEBUG(cout << GlobalWG[i] << " ");
+  }
+  DEBUG(cout << ")\n");
+  if(localWorkSize != NULL) {
+    DEBUG(cout << "\tLocal Work Group: ( ");
+    for(unsigned i = 0; i<workDim; i++) {
+      DEBUG(cout << LocalWG[i] << " ");
+    }
+    DEBUG(cout << ")\n");
+  }
+  //pthread_mutex_lock(&ocl_mtx);
+  clFinish(Context->clCommandQue);
+  //pthread_mutex_unlock(&ocl_mtx);
+  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION);
+  //for(int i=0 ;i < NUM_TESTS; i++) {
+    //cout << "Iteration = " << i << flush << "\n";
+    //pthread_mutex_lock(&ocl_mtx);
+    cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
+        Context->clKernel, workDim, NULL, GlobalWG, (localWorkSize == NULL)? NULL :  LocalWG, 0, NULL, NULL);
+    //pthread_mutex_unlock(&ocl_mtx);
+    checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
+  //}
+  //pthread_mutex_lock(&ocl_mtx);
+  clFinish(Context->clCommandQue);
+  //pthread_mutex_unlock(&ocl_mtx);
+  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
+  
+  pthread_mutex_unlock(&ocl_mtx);
+  return event;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program binary file.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param Filename        program filename
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
+{
+  DEBUG(cout << "Load Prog Source\n");
+  // locals
+  FILE* pFileStream = NULL;
+  size_t szSourceLength;
+
+  // open the OpenCL source code file
+  pFileStream = fopen(Filename, "rb");
+  if(pFileStream == 0)
+  {
+    return NULL;
+  }
+
+  // get the length of the source code
+  fseek(pFileStream, 0, SEEK_END);
+  szSourceLength = ftell(pFileStream);
+  fseek(pFileStream, 0, SEEK_SET);
+
+  // allocate a buffer for the source code string and read it in
+  char* cSourceString = (char *)malloc(szSourceLength + 1);
+  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
+  {
+      fclose(pFileStream);
+      free(cSourceString);
+      return 0;
+  }
+
+  // close the file and return the total length of the combined (preamble + source) string
+  fclose(pFileStream);
+  if(szFinalLength != 0)
+  {
+      *szFinalLength = szSourceLength;
+  }
+  cSourceString[szSourceLength] = '\0';
+
+  return cSourceString;
+}
+
+void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Launch OCL Kernel\n");
+  // Initialize OpenCL
+
+  // OpenCL specific variables
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL));
+
+  size_t kernelLength;
+  cl_int errcode;
+
+  // For a single context for all kernels
+  Context->clOCLContext = globalOCLContext;
+
+  //Create a command-queue
+  //pthread_mutex_lock(&ocl_mtx);
+  Context->clCommandQue = clCreateCommandQueue(Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);
+  globalCommandQue = Context->clCommandQue;
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
+
+  DEBUG(cout << "Loading program binary: " << FileName << flush << "\n");
+  char *programSource = LoadProgSource(FileName, &kernelLength);
+  checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+
+  cl_int binaryStatus;
+  //pthread_mutex_lock(&ocl_mtx);
+  Context->clProgram = clCreateProgramWithBinary(Context->clOCLContext, 1, &clDevices[0],
+                                        &kernelLength,
+                                        (const unsigned char **)&programSource,
+                                        &binaryStatus, &errcode);
+  //pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
+
+  DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName << flush << "\n");
+  errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL);
+  // If build fails, get build log from device
+  if(errcode != CL_SUCCESS) {
+    cout << "ERROR: Failure to build program\n";
+    size_t len = 0;
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0,
+        NULL, &len);
+    cout << "LOG LENGTH: " << len << flush << "\n";
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
+    char *log = (char*) malloc(len*sizeof(char));
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len,
+        log, NULL);
+    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
+
+    cout << "Device Build Log:\n" << log << flush << "\n";
+    free(log);
+    pthread_mutex_unlock(&ocl_mtx);
+    exit(EXIT_FAILURE);
+  }
+
+  Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
+
+  DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n");
+  //free(clDevices);
+  free(programSource);
+
+  pthread_mutex_unlock(&ocl_mtx);
+  return Context;
+}
+
+
+void llvm_visc_ocl_wait(void* graphID) {
+  pthread_mutex_lock(&ocl_mtx);
+  DEBUG(cout << "Wait\n");
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID;
+  //pthread_mutex_lock(&ocl_mtx);
+  clFinish(Context->clCommandQue);
+  //pthread_mutex_unlock(&ocl_mtx);
+  pthread_mutex_unlock(&ocl_mtx);
+}
+
+void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID timer) {
+  //cout << "Switching to timer " << timer << flush << "\n";
+  pthread_mutex_lock(&ocl_mtx);
+  //visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer);
+  pthread_mutex_unlock(&ocl_mtx);
+}
+void llvm_visc_printTimerSet(void** timerSet, char* timerName) {
+  pthread_mutex_lock(&ocl_mtx);
+  cout << "Printing VISC Timer: ";
+  if(timerName != NULL)
+    cout << timerName << flush << "\n";
+  else
+    cout << "Anonymous\n";
+  visc_PrintTimerSet((visc_TimerSet*) (*timerSet));
+  pthread_mutex_unlock(&ocl_mtx);
+}
+
+void* llvm_visc_initializeTimerSet() {
+  pthread_mutex_lock(&ocl_mtx);
+  visc_TimerSet* TS = (visc_TimerSet*) malloc (sizeof(visc_TimerSet));
+  visc_InitializeTimerSet(TS);
+  pthread_mutex_unlock(&ocl_mtx);
+  return TS;
+}
+
+
+
diff --git a/llvm/projects/visc-cpu-rt/visc-rt.h b/llvm/projects/visc-cpu-rt/visc-rt.h
new file mode 100644
index 0000000000..5e7546f582
--- /dev/null
+++ b/llvm/projects/visc-cpu-rt/visc-rt.h
@@ -0,0 +1,302 @@
+/*
+ *
+ * (c) 2010 The Board of Trustees of the University of Illinois.
+ */
+#ifndef VISC_RT_HEADER
+#define VISC_RT_HEADER
+
+#include <iostream>
+#include <map>
+#include <ctime>
+#include <vector>
+#include <pthread.h>
+#include <string>
+//#include <condition_variable>
+
+#include "llvm/SupportVISC/VISCHint.h"
+#include "llvm/SupportVISC/VISCTimer.h"
+#include "device_abstraction.h"
+#include "policy.h"
+#ifndef DEBUG_BUILD
+#define DEBUG(s) {}
+#else
+#define DEBUG(s) s
+#endif
+
+
+
+using namespace std;
+
+extern "C" {
+
+/************************* Policies *************************************/
+
+void llvm_visc_policy_init();
+void llvm_visc_policy_clear();
+int llvm_visc_policy_getVersion(const char *, int64_t);
+
+/******************** Device Abstraction ********************************/
+void llvm_visc_deviceAbstraction_start();
+void llvm_visc_deviceAbstraction_end();
+void llvm_visc_deviceAbstraction_waitOnDeviceStatus();
+
+/********************* DFG Depth Stack **********************************/
+class DFGDepth {
+  private:
+    unsigned numDim;
+    unsigned dimLimit[3];
+    unsigned dimInstance[3];
+  public:
+    DFGDepth() {}
+    DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, unsigned iY = 0,
+        unsigned dimZ = 0, unsigned iZ = 0) {
+      assert(n <= 3 && "Error! More than 3 dimensions not supported");
+      numDim = n;
+      dimLimit[0] = dimX;
+      dimLimit[1] = dimY;
+      dimLimit[2] = dimZ;
+      dimInstance[0] = iX;
+      dimInstance[1] = iY;
+      dimInstance[2] = iZ;
+    }
+    unsigned getDimLimit(unsigned dim) {
+      assert(dim <= numDim && "Error! Requested dimension limit is not specified");
+      return dimLimit[dim];
+    }
+
+    unsigned getDimInstance(unsigned dim) {
+      assert(dim <= numDim && "Error! Requested dimension instance is not specified");
+      return dimInstance[dim];
+    }
+
+    unsigned getNumDim() {
+      return numDim;
+    }
+};
+
+void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
+    uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0);
+void llvm_visc_x86_dstack_pop();
+uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
+uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
+
+
+/********************* Memory Tracker **********************************/
+class MemTrackerEntry {
+public:
+  enum Location {HOST, DEVICE};
+  private:
+    size_t size;
+    Location loc;
+    void* addr;
+    void* Context;
+
+  public:
+    MemTrackerEntry(size_t _size, Location _loc, void* _addr, void* _Context):
+      size(_size), loc(_loc), addr(_addr), Context(_Context) {
+    }
+
+    size_t getSize() {
+      return size;
+    }
+
+    Location getLocation() {
+      return loc;
+    }
+
+    void* getAddress() {
+      return addr;
+    }
+
+    void* getContext() {
+      return Context;
+    }
+
+    void update(Location _loc, void* _addr, void* _Context = NULL) {
+      loc = _loc;
+      addr = _addr;
+      Context = _Context;
+    }
+
+    void print() {
+      cout << "Size = " << size << "\tLocation = " << loc << "\tAddress = " << addr << "\tContext = " << Context;
+    }
+};
+
+
+class MemTracker {
+
+private:
+  std::map<void*, MemTrackerEntry*> Table;
+
+public:
+  MemTracker() {
+  }
+
+  bool insert(void* ID, size_t size, MemTrackerEntry::Location loc, void* addr, void* Context = NULL) {
+    MemTrackerEntry* MTE = new MemTrackerEntry(size, loc, addr, Context);
+    Table.insert(std::pair<void*, MemTrackerEntry*>(ID, MTE));
+    return MTE != NULL;
+  }
+
+  MemTrackerEntry* lookup(void* ID) {
+    if(Table.count(ID) == 0)
+      return NULL;
+    return Table[ID];
+  }
+
+  void remove(void* ID) {
+    MemTrackerEntry* MTE = Table[ID];
+    free(MTE);
+    Table.erase(ID);
+  }
+
+  void print() {
+    cout << "Printing Table ... Size = " << Table.size() << flush << "\n";
+    for(auto& Entry: Table) {
+      cout << Entry.first << ":\t" ;
+      Entry.second->print();
+      cout << flush << "\n";
+    }
+  }
+
+};
+
+void llvm_visc_track_mem(void*, size_t);
+void llvm_visc_untrack_mem(void*);
+void* llvm_visc_request_mem(void*, size_t);
+
+/*********************** OPENCL & PTHREAD API **************************/
+void* llvm_visc_x86_launch(void* (void*), void*);
+void llvm_visc_x86_wait(void*);
+void* llvm_visc_ocl_initContext(enum visc::Target);
+
+void* llvm_visc_x86_argument_ptr(void*, size_t);
+
+void llvm_visc_ocl_clearContext(void*);
+void llvm_visc_ocl_argument_shared(void*, int, size_t);
+void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t);
+void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool);
+void* llvm_visc_ocl_output_ptr(void*, int, size_t);
+void llvm_visc_ocl_free(void*);
+void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t);
+void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*);
+void* llvm_visc_ocl_launch(const char*, const char*);
+void llvm_visc_ocl_wait(void*);
+
+void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID);
+void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL);
+void* llvm_visc_initializeTimerSet();
+
+}
+
+/*************************** Pipeline API ******************************/
+// Circular Buffer class
+unsigned counter = 0;
+template <class ElementType>
+class CircularBuffer {
+private:
+    int numElements;
+    int bufferSize;
+    int Head;
+    int Tail;
+    pthread_mutex_t mtx;
+    pthread_cond_t cv;
+    vector<ElementType> buffer;
+    std::string name;
+    unsigned ID;
+
+public:
+    CircularBuffer(int maxElements, std::string _name =  "ANON") {
+        ID = counter;
+        Head = 0;
+        Tail = 0;
+        numElements = 0;
+        name = _name;
+        bufferSize = maxElements+1;
+        buffer.reserve(bufferSize);
+        pthread_mutex_init(&mtx, NULL);
+        pthread_cond_init(&cv, NULL);
+        counter++;
+
+    }
+
+    bool push(ElementType E);
+    ElementType pop();
+
+};
+
+template <class ElementType>
+bool CircularBuffer<ElementType>::push(ElementType E) {
+    //DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << "\n");
+    //unique_lock<mutex> lk(mtx);
+    pthread_mutex_lock(&mtx);
+    if((Head +1) % bufferSize == Tail) {
+        //DEBUG(cout << name << " Buffer[" << ID << "]: Push going to sleep ...\n");
+        //cv.wait(lk);
+        pthread_cond_wait(&cv, &mtx);
+        //DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n");
+    }
+    buffer[Head] = E;
+    Head = (Head+1) % bufferSize;
+    numElements++;
+    //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
+    //lk.unlock();
+    pthread_mutex_unlock(&mtx);
+    //cv.notify_one();
+    pthread_cond_signal(&cv);
+    return true;
+}
+
+template <class ElementType>
+ElementType CircularBuffer<ElementType>::pop() {
+    //unique_lock<mutex> lk(mtx);
+    //DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n");
+    pthread_mutex_lock(&mtx);
+    if(Tail == Head) {
+        //DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n");
+        //cv.wait(lk);
+        pthread_cond_wait(&cv, &mtx);
+        //DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n");
+    }
+    ElementType E = buffer[Tail];
+    Tail = (Tail + 1) % bufferSize;
+    numElements--;
+    //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
+    //lk.unlock();
+    pthread_mutex_unlock(&mtx);
+    //cv.notify_one();
+    pthread_cond_signal(&cv);
+    return E;
+}
+
+extern "C" {
+// Functions to push and pop values from pipeline buffers
+uint64_t llvm_visc_bufferPop(void*);
+void llvm_visc_bufferPush(void*, uint64_t);
+
+// Functions to create and destroy buffers
+void* llvm_visc_createBindInBuffer(void*, uint64_t, unsigned);
+void* llvm_visc_createBindOutBuffer(void*, uint64_t);
+void* llvm_visc_createEdgeBuffer(void*, uint64_t);
+void* llvm_visc_createLastInputBuffer(void*, uint64_t);
+
+void llvm_visc_freeBuffers(void*);
+
+// Functions to create and destroy threads
+void llvm_visc_createThread(void* graphID, void*(*Func)(void*), void*);
+void llvm_visc_freeThreads(void*);
+
+// Launch API for a streaming graph.
+// Arguments:
+// (1) Launch Function: void* (void*, void*)
+// (2) Push Function:   void (void*, std::vector<uint64_t>**, unsgined)
+// (3) Pop Function:    void* (std::vector<uint64_t>**, unsigned)
+void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void*);
+void llvm_visc_streamPush(void* graphID, void* args);
+void* llvm_visc_streamPop(void* graphID);
+void llvm_visc_streamWait(void* graphID);
+
+}
+
+#endif //VISC_RT_HEADER
-- 
GitLab