From 21496d80b142bec1a44cccd358d8ef9eb79bc4d1 Mon Sep 17 00:00:00 2001
From: akashk4 <akashk4@illinois.edu>
Date: Wed, 8 Jan 2020 21:02:09 -0600
Subject: [PATCH] Remove redundant visc-rt

---
 hpvm/visc-rt/CMakeLists.txt                  |   43 -
 hpvm/visc-rt/deviceStatusSwitchIntervals.txt |    2 -
 hpvm/visc-rt/device_abstraction.h            |   82 -
 hpvm/visc-rt/makefile                        |   28 -
 hpvm/visc-rt/policy.h                        |   98 --
 hpvm/visc-rt/visc-rt.cpp                     | 1645 ------------------
 hpvm/visc-rt/visc-rt.h                       |  303 ----
 7 files changed, 2201 deletions(-)
 delete mode 100644 hpvm/visc-rt/CMakeLists.txt
 delete mode 100644 hpvm/visc-rt/deviceStatusSwitchIntervals.txt
 delete mode 100644 hpvm/visc-rt/device_abstraction.h
 delete mode 100644 hpvm/visc-rt/makefile
 delete mode 100644 hpvm/visc-rt/policy.h
 delete mode 100644 hpvm/visc-rt/visc-rt.cpp
 delete mode 100644 hpvm/visc-rt/visc-rt.h

diff --git a/hpvm/visc-rt/CMakeLists.txt b/hpvm/visc-rt/CMakeLists.txt
deleted file mode 100644
index 5767d82cdc..0000000000
--- a/hpvm/visc-rt/CMakeLists.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-add_custom_target(visc-rt ALL)
-add_custom_command(
-  TARGET visc-rt PRE_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy
-    ${CMAKE_CURRENT_SOURCE_DIR}/deviceStatusSwitchIntervals.txt
-    ${CMAKE_CURRENT_BINARY_DIR}/deviceStatusSwitchIntervals.txt
-  DEPENDS deviceStatusSwitchIntervals.txt
-  COMMENT "Copying deviceStatusSwitchIntervals.txt")
-add_custom_command(
-  TARGET visc-rt PRE_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy
-    ${CMAKE_CURRENT_SOURCE_DIR}/device_abstraction.h
-    ${CMAKE_CURRENT_BINARY_DIR}/device_abstraction.h
-  DEPENDS device_abstraction.h
-  COMMENT "Copying device_abstraction.h")
-add_custom_command(
-  TARGET visc-rt PRE_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy
-    ${CMAKE_CURRENT_SOURCE_DIR}/policy.h
-    ${CMAKE_CURRENT_BINARY_DIR}/policy.h
-  DEPENDS policy.h
-  COMMENT "Copying policy.h")
-add_custom_command(
-  TARGET visc-rt PRE_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy
-    ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.h
-    ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.h
-  DEPENDS visc-rt.h
-  COMMENT "Copying visc-rt.h")
-add_custom_command(
-  TARGET visc-rt PRE_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy
-    ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.cpp
-    ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.cpp
-  DEPENDS visc-rt.cpp
-  COMMENT "Copying visc-rt.cpp")
-add_custom_command(
-  TARGET visc-rt PRE_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy
-    ${CMAKE_CURRENT_SOURCE_DIR}/makefile
-    ${CMAKE_CURRENT_BINARY_DIR}/makefile
-  DEPENDS makefile
-  COMMENT "Copying makefile")
diff --git a/hpvm/visc-rt/deviceStatusSwitchIntervals.txt b/hpvm/visc-rt/deviceStatusSwitchIntervals.txt
deleted file mode 100644
index 7069470a1a..0000000000
--- a/hpvm/visc-rt/deviceStatusSwitchIntervals.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-10
-10 15 10 16 15 30 15 25 20 15
diff --git a/hpvm/visc-rt/device_abstraction.h b/hpvm/visc-rt/device_abstraction.h
deleted file mode 100644
index 68748c7ab7..0000000000
--- a/hpvm/visc-rt/device_abstraction.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef __DEVICE_ABSTRACTION__
-#define __DEVICE_ABSTRACTION__
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <time.h>
-#include <thread>
-#include <vector>
-#include <iostream>
-#include <fstream>
-
-#define MIN_INTERVAL 2
-#define MAX_INTERVAL 8
-#define NUM_INTERVALS 10
-
-// Device status variable: true if the device is available for use
-volatile bool deviceStatus = true;
-// Intervals at which to change the device status
-std::vector<unsigned> Intervals;
-
-// Set to true when program execution ends and so we can end the device
-// simulation
-volatile bool executionEnd = false;
-
-
-void initializeDeviceStatusIntervals() {
-
-  unsigned sz = 0;
-  unsigned tmp = 0;
-
-  const char *fn =
-    "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/deviceStatusSwitchIntervals.txt";
-  std::ifstream infile;
-  infile.open(fn);
-  if (!infile.is_open()) {
-    std::cout << "Failed to open " << fn << " for reading\n";
-    return;
-  }
-  infile >> sz;
-
-  if (sz) {
-    // We have data. Read them into the vector
-    for (unsigned i = 0; i < sz; i++) {
-      infile >> tmp;
-      Intervals.push_back(tmp);
-    }
-    infile.close();
-  } else {
-    // We have no data. Create random data and write them into the file
-    infile.close();
-    std::ofstream outfile;
-    outfile.open(fn);
-    if (!outfile.is_open()) {
-      std::cout << "Failed to open " << fn << " for writing\n";
-      return;
-    }
-    sz = 1 + rand()%NUM_INTERVALS;
-    outfile << sz; 
-    for (unsigned i = 0; i < sz; i++) {
-      Intervals.push_back(MIN_INTERVAL + rand()%(MAX_INTERVAL - MIN_INTERVAL));
-      outfile << Intervals[i];
-    }
-    outfile.close();
-  }
-
-  return;
-}
-
-void updateDeviceStatus() {
-
-  unsigned i = 0;
-  while (!executionEnd) {
-    std::this_thread::sleep_for (std::chrono::seconds(Intervals[i]));
-    deviceStatus = !deviceStatus;
-    std::cout << "Changed device status to " << deviceStatus << "\n";
-    i = (i+1) % Intervals.size();
-  }
-
-}
-
-#endif // __DEVICE_ABSTRACTION__
diff --git a/hpvm/visc-rt/makefile b/hpvm/visc-rt/makefile
deleted file mode 100644
index 01cc2b7b3f..0000000000
--- a/hpvm/visc-rt/makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-#LLVM_SRC_ROOT = ../../../llvm
-LLVM_BUILD_ROOT = ${LLVM_SRC_ROOT}/../build/
-
-OPENCL_INC_PATH = /opt/intel/opencl-sdk/include
-
-ifeq ($(NUM_CORES),)
-  NUM_CORES=8
-endif
-
-CPP_FLAGS = -I $(LLVM_SRC_ROOT)/include -I $(LLVM_BUILD_ROOT)/include -I $(OPENCL_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
-TARGET:=visc-rt
-
-LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang
-LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++
-
-OPTS =
-
-ifeq ($(DEBUG),1)
-  OPTS+=-DDEBUG_BUILD
-endif
-
-all: $(TARGET:%=%.ll)
-
-$(TARGET:%=%.ll):%.ll:%.cpp %.h
-	$(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@
-
-clean :
-	rm -f $(TARGET).ll
diff --git a/hpvm/visc-rt/policy.h b/hpvm/visc-rt/policy.h
deleted file mode 100644
index f30c310c1a..0000000000
--- a/hpvm/visc-rt/policy.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef __POLICY__
-#define __POLICY__
-
-#include <string>
-#include "device_abstraction.h"
-
- /************************* Policies *************************************/
-class Policy {
-  public:
-    virtual int getVersion(const char *, int64_t) = 0;
-    virtual ~Policy() {};
-};
-
-class NodePolicy : public Policy {
-  virtual int getVersion(const char *name, int64_t it) override {
-    std::string s(name);
-    //std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
-    std::string NodeNames[] = {
-      "WrapperGaussianSmoothing_cloned",
-      "WrapperlaplacianEstimate_cloned",
-      "WrapperComputeZeroCrossings_cloned",
-      "WrapperComputeGradient_cloned",
-      "WrapperComputeMaxGradient_cloned",
-      "WrapperRejectZeroCrossings_cloned",
-    };
-    //if (!s.compare(NodeNames[4])) {
-    //  std::cout << s << ": CPU" << "\n";
-    //  return 0;
-    //}
-    return 2;
-  }
-};
-
-class IterationPolicy : public Policy {
-  virtual int getVersion(const char *name, int64_t it) override {
-    if ((it % 10 == 0) || (it % 10 == 1))
-      return 0;
-    else
-      return 2;
-  }
-};
-
-class DeviceStatusPolicy : public Policy {
-  virtual int getVersion(const char *name, int64_t it) override {
-    if (deviceStatus) {
-      //std::cout << "Returning GPU\n";
-      return 2;
-    }
-    else {
-      //std::cout << "Returning CPU\n";
-      return 0;
-    }
-  }
-};
-
-/* ------------------------------------------------------------------------- */
-// Added for the CFAR interactive policy demo.
-
-class InteractivePolicy : public Policy {
-private:
-  // 0 :for CPU, 1 for GPU, 2 for Vector
-  unsigned int userTargetDeviceChoice;
-  // Used to end thread execution
-  bool end;
-  // Thread that will update userTargetDeviceChoice
-  std::thread userTargetDeviceChoiceThread;
-  // Thread function
-  void updateUserTargetChoice() {
-    while (!end) {
-      std::cout << "Select target device (0 for CPU, 1 fpr GPU): ";
-      std::cin >> userTargetDeviceChoice;
-      if (userTargetDeviceChoice > 1) {
-        std::cout << "Invalid target device. Selecting GPU instead.\n";
-        userTargetDeviceChoice = 1;
-      }
-    }
-  }
-
-public:
-  // Inherited method, erquired for every policy object
-  virtual int getVersion(const char *name, int64_t it) {
-    return userTargetDeviceChoice;
-  }
-
-  InteractivePolicy() {
-    userTargetDeviceChoice = 1;
-    end = false;
-    userTargetDeviceChoiceThread =
-      std::thread(&InteractivePolicy::updateUserTargetChoice, this);
-  }
-
-  ~InteractivePolicy() {
-    end = true;
-    userTargetDeviceChoiceThread.join(); 
-  }
-};
-
-#endif // __POLICY__
diff --git a/hpvm/visc-rt/visc-rt.cpp b/hpvm/visc-rt/visc-rt.cpp
deleted file mode 100644
index f05aa2ac24..0000000000
--- a/hpvm/visc-rt/visc-rt.cpp
+++ /dev/null
@@ -1,1645 +0,0 @@
-#include <iostream>
-#include <string>
-#include <pthread.h>
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include <cassert>
-#include <map>
-#include <CL/cl.h>
-
-#include <unistd.h>
-
-#if _POSIX_VERSION >= 200112L
-# include <sys/time.h>
-#endif
-#include "visc-rt.h"
-
-#ifndef DEBUG_BUILD
-#define DEBUG(s) {}
-#else
-#define DEBUG(s) s
-#endif
-
-#define BILLION   1000000000LL
-
-using namespace std;
-
-typedef struct {
-  pthread_t threadID;
-  std::vector<pthread_t>* threads;
-  // Map from InputPort to Size 
-  std::map<unsigned, uint64_t>* ArgInPortSizeMap;
-  //std::vector<uint64_t>* BindInSizes;
-  std::vector<unsigned>* BindInSourcePort;
-  std::vector<uint64_t>* BindOutSizes;
-  std::vector<uint64_t>* EdgeSizes;
-  std::vector<CircularBuffer<uint64_t>*>* BindInputBuffers;
-  std::vector<CircularBuffer<uint64_t>*>* BindOutputBuffers;
-  std::vector<CircularBuffer<uint64_t>*>* EdgeBuffers;
-  std::vector<CircularBuffer<uint64_t>*>* isLastInputBuffers;
-} DFNodeContext_X86;
-
-typedef struct {
-  cl_context clOCLContext;
-  cl_command_queue clCommandQue;
-  cl_program clProgram;
-  cl_kernel clKernel;
-} DFNodeContext_OCL;
-
-cl_context globalOCLContext;
-cl_device_id* clDevices;
-cl_command_queue globalCommandQue;
-
-Policy *policy = NULL;
-MemTracker MTracker;
-vector<DFGDepth> DStack;
-// Mutex to prevent concurrent access by multiple thereads in pipeline
-pthread_mutex_t ocl_mtx;
-
-#define NUM_TESTS 1
-visc_TimerSet kernel_timer;
-
-static inline void checkErr(cl_int err, cl_int success, const char * name) {
-  if (err != success) {
-  cout << "ERROR: " << name << flush << "\n";
-  cout << "ErrorCode: " << err << flush << "\n";
-    exit(EXIT_FAILURE);
-  }
-}
-
-/************************* Policies *************************************/
-void llvm_visc_policy_init() {
-  cout << "Initializing policy object ...\n";
-//  policy = new NodePolicy();
-//  policy = new IterationPolicy();
-//  policy = new DeviceStatusPolicy();
-  policy = new InteractivePolicy();
-  cout << "DONE: Initializing policy object.\n";
-}
-
-void llvm_visc_policy_clear() {
-  if (policy) free(policy);
-}
-
-int llvm_visc_policy_getVersion(const char *name, int64_t i) {
-  return policy->getVersion(name, i);
-}
-
-/******************** Device Abstraction ********************************/
-std::thread deviceStatusThread;
-
-void llvm_visc_deviceAbstraction_start() {
-  cout << "Starting device status simulation ...\n";
-  // Initialize vector with points where ti switch device status
-  initializeDeviceStatusIntervals();
-  // Create a thread that performs the changes
-  deviceStatusThread = std::thread(updateDeviceStatus);
-  cout << "Started device status simulation thread ...\n";
-  return;
-}
-
-void llvm_visc_deviceAbstraction_end() {
-  cout << "Ending device status simulation thread ...\n";
-  // Set the variable that allows the thread to know that execution has ended
-  executionEnd = true;
-  // Wait for the thread that manages device status to terminate
-  deviceStatusThread.join();
-  cout << "Ended device status simulation.\n";
-  return;
-}
-
-void llvm_visc_deviceAbstraction_waitOnDeviceStatus() {
-  while (!deviceStatus) { };
-  return;
-}
-
-/************************* Depth Stack Routines ***************************/
-
-void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY,
-    uint64_t iY, uint64_t limitZ, uint64_t iZ) {
-  //DEBUG(cout << "Pushing node information on stack:\n");
-  //DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY << ", "<< limitZ <<")\n");
-  //DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", "<< iZ <<")\n");
-  //DFGDepth nodeInfo (n, limitX, iX, limitY, iY, limitZ, iZ);
-  //DStack.push_back(nodeInfo);
-  //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
-}
-
-void llvm_visc_x86_dstack_pop() {
-  //DEBUG(cout << "Popping from depth stack\n");
-  //DStack.pop_back();
-  //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
-}
-
-uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
-  //DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n");
-  //unsigned size = DStack.size();
-  //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n");
-  //return DStack[size-level-1].getDimLimit(dim);
-  return 0;
-}
-
-uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
-  //DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n");
-  //unsigned size = DStack.size();
-  //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n");
-  //return DStack[size-level-1].getDimInstance(dim);
-  return 0;
-}
-
-/********************** Memory Tracking Routines **************************/
-
-void llvm_visc_track_mem(void* ptr, size_t size) {
-  DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
-  if(MTE != NULL) {
-    DEBUG(cout << "ID " << ptr << " already present in the MemTracker Table\n");
-    return;
-  }
-  DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n");
-  MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr);
-  DEBUG(MTracker.print());
-}
-
-void llvm_visc_untrack_mem(void* ptr) {
-  DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
-  if(MTE == NULL) {
-    cout << "WARNING: Trying to remove ID " << ptr << " not present in the MemTracker Table\n";
-    return;
-  }
-  DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n");
-  if(MTE->getLocation() == MemTrackerEntry::DEVICE)
-    clReleaseMemObject((cl_mem) MTE->getAddress());
-  MTracker.remove(ptr);
-  DEBUG(MTracker.print());
-}
-
-
-static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clOCLContext << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
-  if (MTE == NULL) {
-    MTracker.print();
-    cout << "ERROR: Requesting memory not present in Table\n";
-    exit(EXIT_FAILURE);
-  }
-  // If already on device
-  if (MTE->getLocation() == MemTrackerEntry::DEVICE &&
-      ((DFNodeContext_OCL*)MTE->getContext())->clOCLContext == Context->clOCLContext) {
-    DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); 
-    pthread_mutex_unlock(&ocl_mtx);
-    return MTE->getAddress();
-  }
-  
-  DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n");
-  DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n");
-  // Else copy and update the latest copy
-  cl_mem_flags clFlags;
-  cl_int errcode;
-
-  if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE;
-  else if(isInput)        clFlags = CL_MEM_READ_ONLY;
-  else if(isOutput)       clFlags = CL_MEM_WRITE_ONLY;
-  else                    clFlags = CL_MEM_READ_ONLY;
-
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
-  DEBUG(cout<< "\nMemory allocated on device: " << d_input << flush << "\n");
-  if(isInput) {
-    DEBUG(cout << "\tCopying ...");
-    //pthread_mutex_lock(&ocl_mtx);
-    errcode = clEnqueueWriteBuffer(Context->clCommandQue,
-                                  d_input,
-                                  CL_TRUE,
-                                  0,
-                                  size,MTE->getAddress(),
-                                  0,NULL,NULL);
-    //pthread_mutex_unlock(&ocl_mtx);
-    checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device");
-  }
-
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
-  DEBUG(cout << " done\n");
-  MTE->update(MemTrackerEntry::DEVICE, (void*) d_input, Context);
-  DEBUG(cout << "Updated Table\n");
-  DEBUG(MTracker.print());
-  pthread_mutex_unlock(&ocl_mtx);
-  return d_input;
-}
-
-void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) {
-  return llvm_visc_request_mem(ptr, size);
-}
-
-void* llvm_visc_request_mem(void* ptr, size_t size) {
-  // Ignore objects whose size is 0 - no memory is requested.
-  if (size == 0) {
-    DEBUG(cout << "[X86] Request memory (ignored): " << ptr << flush << "\n");
-    return ptr;
-  }
-
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
-  if(MTE == NULL) {
-    cout << "ERROR: Requesting memory not present in Table\n";
-    pthread_mutex_unlock(&ocl_mtx);
-    exit(EXIT_FAILURE);
-  }
-  // If already on host
-  if(MTE->getLocation() == MemTrackerEntry::HOST) {
-    DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); 
-    pthread_mutex_unlock(&ocl_mtx);
-    return MTE->getAddress();
-  }
-
-  // Else copy from device and update table
-  DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n");
-  DEBUG(cout << "\tCopying ...");
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue,
-                                      (cl_mem) MTE->getAddress(),
-                                      CL_TRUE,
-                                      0,
-                                      size,
-                                      ptr,
-                                      0, NULL, NULL);
-  //pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
-  DEBUG(cout << " done\n");
-  checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output");
-  DEBUG(cout << "Free mem object on device\n");
-  clReleaseMemObject((cl_mem) MTE->getAddress());
-  DEBUG(cout << "Updated Table\n");
-  MTE->update(MemTrackerEntry::HOST, ptr);
-  DEBUG(MTracker.print());
-  pthread_mutex_unlock(&ocl_mtx);
-  return ptr;
-}
-
-/*************************** Timer Routines **********************************/
-
-static int is_async(enum visc_TimerID timer)
-{
-  return (timer == visc_TimerID_KERNEL) ||
-             (timer == visc_TimerID_COPY_ASYNC);
-}
-
-static int is_blocking(enum visc_TimerID timer)
-{
-  return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE);
-}
-
-#define INVALID_TIMERID visc_TimerID_LAST
-
-static int asyncs_outstanding(struct visc_TimerSet* timers)
-{
-  return (timers->async_markers != NULL) &&
-           (timers->async_markers->timerID != INVALID_TIMERID);
-}
-
-static struct visc_async_time_marker_list *
-get_last_async(struct visc_TimerSet* timers)
-{
-  /* Find the last event recorded thus far */
-  struct visc_async_time_marker_list * last_event = timers->async_markers;
-  if(last_event != NULL && last_event->timerID != INVALID_TIMERID) {
-    while(last_event->next != NULL &&
-            last_event->next->timerID != INVALID_TIMERID)
-      last_event = last_event->next;
-    return last_event;
-  } else
-    return NULL;
-}
-
-static void insert_marker(struct visc_TimerSet* tset, enum visc_TimerID timer)
-{
-  cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
-
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
-    new_event = &((*new_event)->next);
-  }
-
-  if(*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)
-      			malloc(sizeof(struct visc_async_time_marker_list));
-    (*new_event)->marker = calloc(1, sizeof(cl_event));
-    /*
-    // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event
-#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
-fprintf(stderr, "Creating Marker [%d]\n", timer);
-    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Creating User Event Object!\n");
-    }
-    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Setting User Event Status!\n");
-    }
-#endif
-*/
-    (*new_event)->next = NULL;
-  }
-
-  /* valid event handle now aquired: insert the event record */
-  (*new_event)->label = NULL;
-  (*new_event)->timerID = timer;
-  //pthread_mutex_lock(&ocl_mtx);
-  ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
-  //pthread_mutex_unlock(&ocl_mtx);
-  if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Enqueueing Marker!\n");
-  }
-
-}
-
-static void insert_submarker(struct visc_TimerSet* tset, char *label, enum visc_TimerID timer)
-{
-  cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
-
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
-    new_event = &((*new_event)->next);
-  }
-
-  if(*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)
-      			malloc(sizeof(struct visc_async_time_marker_list));
-    (*new_event)->marker = calloc(1, sizeof(cl_event));
-    /*
-#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
-fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
-    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Creating User Event Object!\n");
-    }
-    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Setting User Event Status!\n");
-    }
-#endif
-*/
-    (*new_event)->next = NULL;
-  }
-
-  /* valid event handle now aquired: insert the event record */
-  (*new_event)->label = label;
-  (*new_event)->timerID = timer;
-  //pthread_mutex_lock(&ocl_mtx);
-  ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
-  //pthread_mutex_unlock(&ocl_mtx);
-  if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Enqueueing Marker!\n");
-  }
-
-}
-
-
-/* Assumes that all recorded events have completed */
-static visc_Timestamp record_async_times(struct visc_TimerSet* tset)
-{
-  struct visc_async_time_marker_list * next_interval = NULL;
-  struct visc_async_time_marker_list * last_marker = get_last_async(tset);
-  visc_Timestamp total_async_time = 0;
-
-  for(next_interval = tset->async_markers; next_interval != last_marker;
-      next_interval = next_interval->next) {
-    cl_ulong command_start=0, command_end=0;
-    cl_int ciErrNum = CL_SUCCESS;
-
-    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum);
-    }
-
-    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum);
-    }
-
-    visc_Timestamp interval = (visc_Timestamp) (((double)(command_end - command_start)));
-    tset->timers[next_interval->timerID].elapsed += interval;
-    if (next_interval->label != NULL) {
-      struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list;
-      while (subtimer != NULL) {
-        if ( strcmp(subtimer->label, next_interval->label) == 0) {
-          subtimer->timer.elapsed += interval;
-          break;
-        }
-        subtimer = subtimer->next;
-      }
-    }
-    total_async_time += interval;
-    next_interval->timerID = INVALID_TIMERID;
-  }
-
-  if(next_interval != NULL)
-    next_interval->timerID = INVALID_TIMERID;
-
-  return total_async_time;
-}
-
-static void
-accumulate_time(visc_Timestamp *accum,
-		visc_Timestamp start,
-		visc_Timestamp end)
-{
-#if _POSIX_VERSION >= 200112L
-  *accum += end - start;
-#else
-# error "Timestamps not implemented for this system"
-#endif
-}
-
-#if _POSIX_VERSION >= 200112L
-static visc_Timestamp get_time()
-{
-  struct timespec tv;
-  clock_gettime(CLOCK_MONOTONIC, &tv);
-  return (visc_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec);
-}
-#else
-# error "no supported time libraries are available on this platform"
-#endif
-
-void
-visc_ResetTimer(struct visc_Timer *timer)
-{
-  timer->state = visc_Timer_STOPPED;
-
-#if _POSIX_VERSION >= 200112L
-  timer->elapsed = 0;
-#else
-# error "visc_ResetTimer: not implemented for this system"
-#endif
-}
-
-void
-visc_StartTimer(struct visc_Timer *timer)
-{
-  if (timer->state != visc_Timer_STOPPED) {
-    // FIXME: Removing warning statement to avoid printing this error
-    // fputs("Ignoring attempt to start a running timer\n", stderr);
-    return;
-  }
-
-  timer->state = visc_Timer_RUNNING;
-
-#if _POSIX_VERSION >= 200112L
-  {
-    struct timespec tv;
-    clock_gettime(CLOCK_MONOTONIC, &tv);
-    timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
-  }
-#else
-# error "visc_StartTimer: not implemented for this system"
-#endif
-}
-
-void
-visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer)
-{
-
-  unsigned int numNotStopped = 0x3; // 11
-  if (timer->state != visc_Timer_STOPPED) {
-    fputs("Warning: Timer was not stopped\n", stderr);
-    numNotStopped &= 0x1; // Zero out 2^1
-  }
-  if (subtimer->state != visc_Timer_STOPPED) {
-    fputs("Warning: Subtimer was not stopped\n", stderr);
-    numNotStopped &= 0x2; // Zero out 2^0
-  }
-  if (numNotStopped == 0x0) {
-    //fputs("Ignoring attempt to start running timer and subtimer\n", stderr);
-    return;
-  }
-
-  timer->state = visc_Timer_RUNNING;
-  subtimer->state = visc_Timer_RUNNING;
-
-#if _POSIX_VERSION >= 200112L
-  {
-    struct timespec tv;
-    clock_gettime(CLOCK_MONOTONIC, &tv);
-
-    if (numNotStopped & 0x2) {
-      timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
-    }
-
-    if (numNotStopped & 0x1) {
-      subtimer->init = tv.tv_sec * BILLION + tv.tv_nsec;
-    }
-  }
-#else
-# error "visc_StartTimer: not implemented for this system"
-#endif
-
-}
-
-void
-visc_StopTimer(struct visc_Timer *timer)
-{
-  visc_Timestamp fini;
-
-  if (timer->state != visc_Timer_RUNNING) {
-    //fputs("Ignoring attempt to stop a stopped timer\n", stderr);
-    return;
-  }
-
-  timer->state = visc_Timer_STOPPED;
-
-#if _POSIX_VERSION >= 200112L
-  {
-    struct timespec tv;
-    clock_gettime(CLOCK_MONOTONIC, &tv);
-    fini = tv.tv_sec * BILLION + tv.tv_nsec;
-  }
-#else
-# error "visc_StopTimer: not implemented for this system"
-#endif
-
-  accumulate_time(&timer->elapsed, timer->init, fini);
-  timer->init = fini;
-}
-
-void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) {
-
-  visc_Timestamp fini;
-
-  unsigned int numNotRunning = 0x3; // 11
-  if (timer->state != visc_Timer_RUNNING) {
-    fputs("Warning: Timer was not running\n", stderr);
-    numNotRunning &= 0x1; // Zero out 2^1
-  }
-  if (subtimer->state != visc_Timer_RUNNING) {
-    fputs("Warning: Subtimer was not running\n", stderr);
-    numNotRunning &= 0x2; // Zero out 2^0
-  }
-  if (numNotRunning == 0x0) {
-    //fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr);
-    return;
-  }
-
-
-  timer->state = visc_Timer_STOPPED;
-  subtimer->state = visc_Timer_STOPPED;
-
-#if _POSIX_VERSION >= 200112L
-  {
-    struct timespec tv;
-    clock_gettime(CLOCK_MONOTONIC, &tv);
-    fini = tv.tv_sec * BILLION + tv.tv_nsec;
-  }
-#else
-# error "visc_StopTimer: not implemented for this system"
-#endif
-
-  if (numNotRunning & 0x2) {
-    accumulate_time(&timer->elapsed, timer->init, fini);
-    timer->init = fini;
-  }
-
-  if (numNotRunning & 0x1) {
-    accumulate_time(&subtimer->elapsed, subtimer->init, fini);
-    subtimer->init = fini;
-  }
-
-}
-
-/* Get the elapsed time in seconds. */
-double
-visc_GetElapsedTime(struct visc_Timer *timer)
-{
-  double ret;
-
-  if (timer->state != visc_Timer_STOPPED) {
-    fputs("Elapsed time from a running timer is inaccurate\n", stderr);
-  }
-
-#if _POSIX_VERSION >= 200112L
-  ret = timer->elapsed / 1e9;
-#else
-# error "visc_GetElapsedTime: not implemented for this system"
-#endif
-  return ret;
-}
-
-void
-visc_InitializeTimerSet(struct visc_TimerSet *timers)
-{
-  int n;
-
-  timers->wall_begin = get_time();
-  timers->current = visc_TimerID_NONE;
-
-  timers->async_markers = NULL;
-
-  for (n = 0; n < visc_TimerID_LAST; n++) {
-    visc_ResetTimer(&timers->timers[n]);
-    timers->sub_timer_list[n] = NULL;
-  }
-}
-
-
-void
-visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category) {
-
-  struct visc_SubTimer *subtimer = (struct visc_SubTimer *) malloc
-    (sizeof(struct visc_SubTimer));
-
-  int len = strlen(label);
-
-  subtimer->label = (char *) malloc (sizeof(char)*(len+1));
-  sprintf(subtimer->label, "%s", label);
-
-  visc_ResetTimer(&subtimer->timer);
-  subtimer->next = NULL;
-
-  struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[visc_Category];
-  if (subtimerlist == NULL) {
-    subtimerlist = (struct visc_SubTimerList *) calloc
-      (1, sizeof(struct visc_SubTimerList));
-    subtimerlist->subtimer_list = subtimer;
-    timers->sub_timer_list[visc_Category] = subtimerlist;
-  } else {
-    // Append to list
-    struct visc_SubTimer *element = subtimerlist->subtimer_list;
-    while (element->next != NULL) {
-      element = element->next;
-    }
-    element->next = subtimer;
-  }
-
-}
-
-void
-visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
-{
-  //cerr << "Switch to timer: " << timer << flush << "\n";
-  /* Stop the currently running timer */
-  if (timers->current != visc_TimerID_NONE) {
-    struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-    struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
-
-    if (!is_async(timers->current) ) {
-      if (timers->current != timer) {
-        if (currSubTimer != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
-        } else {
-          visc_StopTimer(&timers->timers[timers->current]);
-        }
-      } else {
-        if (currSubTimer != NULL) {
-          visc_StopTimer(&currSubTimer->timer);
-        }
-      }
-    } else {
-      insert_marker(timers, timer);
-      if (!is_async(timer)) { // if switching to async too, keep driver going
-        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
-      }
-    }
-  }
-
-  visc_Timestamp currentTime = get_time();
-
-  /* The only cases we check for asynchronous task completion is
-   * when an overlapping CPU operation completes, or the next
-   * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(timer) ) ) {
-
-    struct visc_async_time_marker_list * last_event = get_last_async(timers);
-    /* CL_COMPLETE if completed */
-
-    cl_int ciErrNum = CL_SUCCESS;
-    cl_int async_done = CL_COMPLETE;
-
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stdout, "Error Querying EventInfo1!\n");
-    }
-
-
-    if(is_blocking(timer)) {
-      /* Async operations completed after previous CPU operations:
-       * overlapped time is the total CPU time since this set of async
-       * operations were first issued */
-
-      // timer to switch to is COPY or NONE
-      if(async_done != CL_COMPLETE) {
-        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
-	                  timers->async_begin,currentTime);
-      }
-
-      /* Wait on async operation completion */
-      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
-      if (ciErrNum != CL_SUCCESS) {
-        fprintf(stderr, "Error Waiting for Events!\n");
-      }
-
-      visc_Timestamp total_async_time = record_async_times(timers);
-
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-      if(async_done == CL_COMPLETE) {
-        //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time);
-        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
-      }
-
-    } else
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == CL_COMPLETE ) {
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
-    }
-  }
-
-  /* Start the new timer */
-  if (timer != visc_TimerID_NONE) {
-    if(!is_async(timer)) {
-      visc_StartTimer(&timers->timers[timer]);
-    } else {
-      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
-      if (!asyncs_outstanding(timers)) {
-        /* No asyncs outstanding, insert a fresh async marker */
-
-        insert_marker(timers, timer);
-        timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
-        /* Previous asyncs still in flight, but a previous SwitchTo
-         * already marked the end of the most recent async operation,
-         * so we can rename that marker as the beginning of this async
-         * operation */
-
-        struct visc_async_time_marker_list * last_event = get_last_async(timers);
-        last_event->label = NULL;
-        last_event->timerID = timer;
-      }
-      if (!is_async(timers->current)) {
-        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
-      }
-    }
-  }
-  timers->current = timer;
-
-}
-
-void
-visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category)
-{
-  struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-  struct visc_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
-
-  if (timers->current != visc_TimerID_NONE) {
-    if (!is_async(timers->current) ) {
-      if (timers->current != category) {
-        if (curr != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
-        } else {
-          visc_StopTimer(&timers->timers[timers->current]);
-        }
-      } else {
-        if (curr != NULL) {
-          visc_StopTimer(&curr->timer);
-        }
-      }
-    } else {
-      insert_submarker(timers, label, category);
-      if (!is_async(category)) { // if switching to async too, keep driver going
-        visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]);
-      }
-    }
-  }
-
-  visc_Timestamp currentTime = get_time();
-
-  /* The only cases we check for asynchronous task completion is
-   * when an overlapping CPU operation completes, or the next
-   * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(category) ) ) {
-
-    struct visc_async_time_marker_list * last_event = get_last_async(timers);
-    /* CL_COMPLETE if completed */
-
-    cl_int ciErrNum = CL_SUCCESS;
-    cl_int async_done = CL_COMPLETE;
-
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stdout, "Error Querying EventInfo2!\n");
-    }
-
-    if(is_blocking(category)) {
-      /* Async operations completed after previous CPU operations:
-       * overlapped time is the total CPU time since this set of async
-       * operations were first issued */
-
-      // timer to switch to is COPY or NONE
-      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
-      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
-      // it seems that the extra sync wall time isn't being recorded anywhere
-      if(async_done != CL_COMPLETE)
-        accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
-	                  timers->async_begin,currentTime);
-
-      /* Wait on async operation completion */
-      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
-      if (ciErrNum != CL_SUCCESS) {
-        fprintf(stderr, "Error Waiting for Events!\n");
-      }
-      visc_Timestamp total_async_time = record_async_times(timers);
-
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-       // If it did finish, then accumulate all the async time that did happen into OVERLAP
-       // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
-      if(async_done == CL_COMPLETE /*cudaSuccess*/)
-        timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
-
-    } else
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == CL_COMPLETE /*cudaSuccess*/) {
-      /* Async operations completed before previous CPU operations:
-       * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
-    }
-    // else, this isn't blocking, so just check the next time around
-  }
-
-  subtimerlist = timers->sub_timer_list[category];
-  struct visc_SubTimer *subtimer = NULL;
-
-  if (label != NULL) {
-    subtimer = subtimerlist->subtimer_list;
-    while (subtimer != NULL) {
-      if (strcmp(subtimer->label, label) == 0) {
-        break;
-      } else {
-        subtimer = subtimer->next;
-      }
-    }
-  }
-
-  /* Start the new timer */
-  if (category != visc_TimerID_NONE) {
-    if(!is_async(category)) {
-      if (subtimerlist != NULL) {
-        subtimerlist->current = subtimer;
-      }
-
-      if (category != timers->current && subtimer != NULL) {
-        visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
-      } else if (subtimer != NULL) {
-        visc_StartTimer(&subtimer->timer);
-      } else {
-        visc_StartTimer(&timers->timers[category]);
-      }
-    } else {
-      if (subtimerlist != NULL) {
-        subtimerlist->current = subtimer;
-      }
-
-      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
-      if (!asyncs_outstanding(timers)) {
-        /* No asyncs outstanding, insert a fresh async marker */
-        insert_submarker(timers, label, category);
-        timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
-        /* Previous asyncs still in flight, but a previous SwitchTo
-         * already marked the end of the most recent async operation,
-         * so we can rename that marker as the beginning of this async
-         * operation */
-
-        struct visc_async_time_marker_list * last_event = get_last_async(timers);
-        last_event->timerID = category;
-        last_event->label = label;
-      } // else, marker for switchToThis was already inserted
-
-      //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
-      if (!is_async(timers->current)) {
-        visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
-      }
-    }
-  }
-
-  timers->current = category;
-}
-
-void
-visc_PrintTimerSet(struct visc_TimerSet *timers)
-{
-  visc_Timestamp wall_end = get_time();
-
-  struct visc_Timer *t = timers->timers;
-  struct visc_SubTimer* sub = NULL;
-
-  int maxSubLength;
-
-  const char *categories[] = {
-    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap",
-    "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free",
-    "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc",
-    "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack"
-
-  };
-
-  const int maxCategoryLength = 20;
-
-  int i;
-  for(i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format
-    if(visc_GetElapsedTime(&t[i]) != 0 || true) {
-
-      // Print Category Timer
-      printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], visc_GetElapsedTime(&t[i]));
-
-      if (timers->sub_timer_list[i] != NULL) {
-        sub = timers->sub_timer_list[i]->subtimer_list;
-        maxSubLength = 0;
-        while (sub != NULL) {
-          // Find longest SubTimer label
-          if (strlen(sub->label) > (unsigned long) maxSubLength) {
-            maxSubLength = strlen(sub->label);
-          }
-          sub = sub->next;
-        }
-
-        // Fit to Categories
-        if (maxSubLength <= maxCategoryLength) {
-         maxSubLength = maxCategoryLength;
-        }
-
-        sub = timers->sub_timer_list[i]->subtimer_list;
-
-        // Print SubTimers
-        while (sub != NULL) {
-          printf(" -%-*s: %.9f\n", maxSubLength, sub->label, visc_GetElapsedTime(&sub->timer));
-          sub = sub->next;
-        }
-      }
-    }
-  }
-
-  if(visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0)
-    printf("CPU/Kernel Overlap: %.9f\n", visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]));
-
-  float walltime = (wall_end - timers->wall_begin)/ 1e9;
-  printf("Timer Wall Time: %.9f\n", walltime);
-
-}
-
-void visc_DestroyTimerSet(struct visc_TimerSet * timers)
-{
-  /* clean up all of the async event markers */
-  struct visc_async_time_marker_list* event = timers->async_markers;
-  while(event != NULL) {
-
-    cl_int ciErrNum = CL_SUCCESS;
-    ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker);
-    if (ciErrNum != CL_SUCCESS) {
-      //fprintf(stderr, "Error Waiting for Events!\n");
-    }
-
-    ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) );
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Release Events!\n");
-    }
-
-    free((event)->marker);
-    struct visc_async_time_marker_list* next = ((event)->next);
-
-    free(event);
-
-    // (*event) = NULL;
-    event = next;
-  }
-
-  int i = 0;
-  for(i = 0; i < visc_TimerID_LAST; ++i) {
-    if (timers->sub_timer_list[i] != NULL) {
-      struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
-      struct visc_SubTimer *prev = NULL;
-      while (subtimer != NULL) {
-        free(subtimer->label);
-        prev = subtimer;
-        subtimer = subtimer->next;
-        free(prev);
-      }
-      free(timers->sub_timer_list[i]);
-    }
-  }
-}
-
-/**************************** Pipeline API ************************************/
-#define BUFFER_SIZE 1
-
-// Launch API for a streaming dataflow graph
-void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void* args) {
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
-
-  Context->threads = new std::vector<pthread_t>();
-  Context->ArgInPortSizeMap = new std::map<unsigned, uint64_t>();
-  //Context->BindInSizes = new std::vector<uint64_t>();
-  Context->BindInSourcePort = new std::vector<unsigned>();
-  Context->BindOutSizes = new std::vector<uint64_t>();
-  Context->EdgeSizes = new std::vector<uint64_t>();
-  Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
-  Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
-  Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t>*>();
-  Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
-
-  DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args << flush << "\n");
-  LaunchFunc(args, Context);
-  return Context;
-}
-
-// Push API for a streaming dataflow graph
-void llvm_visc_streamPush(void* graphID, void* args) {
-  DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
-  unsigned offset = 0;
-  for (unsigned i=0; i< Ctx->ArgInPortSizeMap->size(); i++) {
-    uint64_t element;
-    memcpy(&element, (char*)args+offset, Ctx->ArgInPortSizeMap->at(i));
-    offset += Ctx->ArgInPortSizeMap->at(i);
-    for(unsigned j=0; j<Ctx->BindInputBuffers->size();j++) {
-      if(Ctx->BindInSourcePort->at(j) == i) {
-        // Push to all bind buffers connected to parent node at this port
-        //DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
-        llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element);
-      }
-    }
-  }
-  // Push 0 in isLastInput buffers of all child nodes
-  for (CircularBuffer<uint64_t>* buffer: *(Ctx->isLastInputBuffers))
-    llvm_visc_bufferPush(buffer, 0);
-}
-
-// Pop API for a streaming dataflow graph
-void* llvm_visc_streamPop(void* graphID) {
-  DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
-  unsigned totalBytes = 0;
-  for(uint64_t size:  *(Ctx->BindOutSizes))
-    totalBytes+= size;
-  void* output = malloc(totalBytes);
-  unsigned offset = 0;
-  for (unsigned i=0; i< Ctx->BindOutputBuffers->size(); i++) {
-    uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i));
-    //DEBUG(cout << "\tPopped Value " << element << " from buffer\n");
-    memcpy((char*)output+offset, &element, Ctx->BindOutSizes->at(i));
-    offset += Ctx->BindOutSizes->at(i);
-  }
-  return output;
-}
-
-// Wait API for a streaming dataflow graph
-void llvm_visc_streamWait(void* graphID) {
-  DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
-  // Push garbage to all other input buffers
-  for (unsigned i=0; i< Ctx->BindInputBuffers->size(); i++) {
-    uint64_t element = 0;
-    //DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
-    llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element);
-  }
-  // Push 1 in isLastInput buffers of all child nodes
-  for (unsigned i=0; i < Ctx->isLastInputBuffers->size(); i++)
-    llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
-
-  llvm_visc_freeThreads(graphID);
-}
-
-// Create a buffer and return the bufferID
-void* llvm_visc_createBindInBuffer(void* graphID, uint64_t size, unsigned inArgPort) {
-  DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn");
-  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
-  Context->BindInputBuffers->push_back(bufferID);
-  (*(Context->ArgInPortSizeMap))[inArgPort] = size;
-  Context->BindInSourcePort->push_back(inArgPort);
-  //Context->BindInSizes->push_back(size);
-  return bufferID;
-}
-
-void* llvm_visc_createBindOutBuffer(void* graphID, uint64_t size) {
-  DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size());
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut");
-  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
-  Context->BindOutputBuffers->push_back(bufferID);
-  Context->BindOutSizes->push_back(size);
-  return bufferID;
-}
-void* llvm_visc_createEdgeBuffer(void* graphID, uint64_t size) {
-  DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size());
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge");
-  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
-  Context->EdgeBuffers->push_back(bufferID);
-  Context->EdgeSizes->push_back(size);
-  return bufferID;
-}
-
-void* llvm_visc_createLastInputBuffer(void* graphID, uint64_t size) {
-  DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size());
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput");
-  DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
-  Context->isLastInputBuffers->push_back(bufferID);
-  return bufferID;
-}
-
-// Free buffers 
-void llvm_visc_freeBuffers(void* graphID) {
-  DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->BindInputBuffers))
-    delete bufferID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->BindOutputBuffers))
-    delete bufferID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->EdgeBuffers))
-    delete bufferID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->isLastInputBuffers))
-    delete bufferID;
-}
-
-// Pop an element from the buffer
-uint64_t llvm_visc_bufferPop(void* bufferID) {
-  CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
-  return buffer->pop();
-}
-
-// Push an element into the buffer
-void llvm_visc_bufferPush(void* bufferID, uint64_t element) {
-  CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
-  buffer->push(element);
-}
-
-// Create a thread
-void llvm_visc_createThread(void* graphID, void* (*Func)(void*), void* arguments) {
-  DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
-  int err;
-  pthread_t threadID;
-  if((err = pthread_create(&threadID, NULL, Func, arguments)) != 0)
-    cout << "Failed to create thread. Error code = " << err << flush << "\n";
-
-  Ctx->threads->push_back(threadID);
-}
-
-// Wait for thread to finish
-void llvm_visc_freeThreads(void* graphID) {
-  DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
-  for(pthread_t thread: *(Ctx->threads))
-    pthread_join(thread, NULL);
-}
-
-/************************ OPENCL & PTHREAD API ********************************/
-
-void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) {
-  DFNodeContext_X86 *Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
-  //int err;
-  //if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0)
-    //cout << "Failed to create pthread. Error code = " << err << flush << "\n";
-  rootFunc(arguments);
-  return Context;
-}
-
-void llvm_visc_x86_wait(void* graphID) {
-  DEBUG(cout << "Waiting for pthread to finish ...\n");
-  //DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //pthread_join(Context->threadID, NULL);
-  free(graphID);
-  DEBUG(cout << "\t... pthread Done!\n");
-}
-
-void* llvm_visc_ocl_initContext(enum visc::Target T) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR");
-  DEBUG(cout << "Initializing Context for " << Target << " device\n");
-  cl_uint numPlatforms;
-  cl_int errcode;
-  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
-  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
-
-  // now get all the platform IDs
-  cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
-  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
-
-
-  for(unsigned i=0; i < numPlatforms; i++) {
-    char buffer[10240];
-    DEBUG(cout << "Device " << i << " Info -->\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
-    DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
-    DEBUG(cout << "\tVERSION = "<< buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
-    DEBUG(cout << "\tNAME = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
-    DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
-    DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n");
-  }
-  // set platform property - just pick the first one
-  //cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
-                                        //(long) platforms[0],
-                                        //0};
-  //globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
-                                         //NULL, NULL, &errcode);
-  assert(numPlatforms >= 2 && "Expecting two OpenCL platforms");
-  // Choose second one which is X86 AVX
-  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
-                                        (long) platforms[T == visc::GPU_TARGET? 0 : 1],
-                                        0};
-  globalOCLContext = clCreateContextFromType(properties,
-                                            T == visc::GPU_TARGET?
-                                              CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU,
-                                       NULL, NULL, &errcode);
-  // get the list of OCL devices associated with context
-  size_t dataBytes;
-  errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0,
-                              NULL, &dataBytes);
-  checkErr(errcode, CL_SUCCESS, "Failure to get context info length");
-
-  clDevices = (cl_device_id *) malloc(dataBytes);
-  errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes,
-                              clDevices, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to get context info");
-  if(false && T == visc::SPIR_TARGET) {
-    cl_device_partition_property props[4];
-    props[0] = CL_DEVICE_PARTITION_BY_COUNTS;
-    props[1] = NUM_CORES;
-    props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;
-    props[3] = 0;
-    cl_device_id subdevice_id[8];
-    cl_uint num_entries = 8;
-
-    cl_uint numDevices;
-    clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, &numDevices);
-    //printf("Num of devices = %d\n", numDevices);
-    //for(unsigned i =0 ; i< numDevices; i++)
-      //printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
-    clDevices[0] = subdevice_id[0];
-    globalOCLContext = clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode);
-    checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
-  }
-
-  free(platforms);
-  DEBUG(cout << "\tContext " << globalOCLContext << flush << "\n");
-  checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
-
-  DEBUG(cout << "Initialize Kernel Timer\n");
-  visc_InitializeTimerSet(&kernel_timer);
-
-  pthread_mutex_unlock(&ocl_mtx);
-  return globalOCLContext;
-}
-
-void llvm_visc_ocl_clearContext(void* graphID) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Clear Context\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  // FIXME: Have separate function to release command queue and clear context.
-  // Would be useful when a context has multiple command queues
-  clReleaseKernel(Context->clKernel);
-  //clReleaseProgram(Context->clProgram);
-  //clReleaseCommandQueue(Context->clCommandQue);
-  //clReleaseContext(globalOCLContext);
-  //DEBUG(cout << "Released context at: " << globalOCLContext);
-  free(Context);
-  DEBUG(cout << "Done with OCL kernel\n");
-  cout << "Printing VISC Timer: KernelTimer\n";
-  visc_PrintTimerSet(&kernel_timer);
-  pthread_mutex_unlock(&ocl_mtx);
-
-}
-
-void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set Shared Memory Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  DEBUG(cout << "Using Context: " << Context << flush << "\n");
-  DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument");
-  pthread_mutex_unlock(&ocl_mtx);
-}
-
-void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set Scalar Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  DEBUG(cout << "Using Context: " << Context << flush << "\n");
-  DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
-  pthread_mutex_unlock(&ocl_mtx);
-}
-
-void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set Pointer Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << flush << "\n");
-  // Size should be non-zero
-  assert(size != 0 && "Size of data pointed to has to be non-zero!");
-  DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  
-  pthread_mutex_unlock(&ocl_mtx);
-  // Check with runtime the location of this memory 
-  cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput);
-  
-  pthread_mutex_lock(&ocl_mtx);
-  // Set Kernel Argument
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
-  DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n");
-  pthread_mutex_unlock(&ocl_mtx);
-  return d_input;
-}
-
-void* llvm_visc_ocl_output_ptr(void* graphID, int arg_index, size_t size) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Set device memory for Output Struct:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = "<< size << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  cl_int errcode;
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, size, NULL, &errcode);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device");
-  //pthread_mutex_lock(&ocl_mtx);
-  errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
-  DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n");
-  pthread_mutex_unlock(&ocl_mtx);
-  return d_output;
-}
-
-void llvm_visc_ocl_free(void* ptr) {
-  //DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n");
-  //cl_mem d_ptr = (cl_mem) ptr;
-  //clReleaseMemObject(d_ptr);
-}
-
-void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Get Output:\n");
-  DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << flush << "\n");
-  if(h_output == NULL)
-    h_output = malloc(size);
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
-                                h_output, 0, NULL, NULL);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output");
-  pthread_mutex_unlock(&ocl_mtx);
-  return h_output;
-}
-
-void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t*
-                                localWorkSize, const size_t* globalWorkSize) {
-  pthread_mutex_lock(&ocl_mtx);
-
-  size_t GlobalWG[3];
-  size_t LocalWG[3];
-
-  // OpenCL EnqeueNDRangeKernel function results in segementation fault if we
-  // directly use local and global work groups arguments. Hence, allocating it
-  // on stack and copying.
-  for(unsigned i=0; i<workDim; i++) {
-    GlobalWG[i] = globalWorkSize[i];
-  }
-
-  // OpenCL allows local workgroup to be null.
-  if(localWorkSize != NULL) {
-    for(unsigned i=0; i<workDim; i++) {
-      LocalWG[i] = localWorkSize[i];
-    }
-  }
-
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  // TODO: Would like to use event to ensure better scheduling of kernels.
-  // Currently passing the event paratemeter results in seg fault with
-  // clEnqueueNDRangeKernel.
-  cl_event* event;
-  DEBUG(cout << "Enqueuing kernel:\n");
-  DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n");
-  DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n");
-  DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n");
-  DEBUG(cout << "\tGlobal Work Group: ( ");
-  for(unsigned i = 0; i<workDim; i++) {
-    DEBUG(cout << GlobalWG[i] << " ");
-  }
-  DEBUG(cout << ")\n");
-  if(localWorkSize != NULL) {
-    DEBUG(cout << "\tLocal Work Group: ( ");
-    for(unsigned i = 0; i<workDim; i++) {
-      DEBUG(cout << LocalWG[i] << " ");
-    }
-    DEBUG(cout << ")\n");
-  }
-  //pthread_mutex_lock(&ocl_mtx);
-  clFinish(Context->clCommandQue);
-  //pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION);
-  //for(int i=0 ;i < NUM_TESTS; i++) {
-    //cout << "Iteration = " << i << flush << "\n";
-    //pthread_mutex_lock(&ocl_mtx);
-    cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
-        Context->clKernel, workDim, NULL, GlobalWG, (localWorkSize == NULL)? NULL :  LocalWG, 0, NULL, NULL);
-    //pthread_mutex_unlock(&ocl_mtx);
-    checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
-  //}
-  //pthread_mutex_lock(&ocl_mtx);
-  clFinish(Context->clCommandQue);
-  //pthread_mutex_unlock(&ocl_mtx);
-  visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
-  
-  pthread_mutex_unlock(&ocl_mtx);
-  return event;
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-//! Loads a Program binary file.
-//!
-//! @return the source string if succeeded, 0 otherwise
-//! @param Filename        program filename
-//! @param szFinalLength    returned length of the code string
-//////////////////////////////////////////////////////////////////////////////
-static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
-{
-  DEBUG(cout << "Load Prog Source\n");
-  // locals
-  FILE* pFileStream = NULL;
-  size_t szSourceLength;
-
-  // open the OpenCL source code file
-  pFileStream = fopen(Filename, "rb");
-  if(pFileStream == 0)
-  {
-    return NULL;
-  }
-
-  // get the length of the source code
-  fseek(pFileStream, 0, SEEK_END);
-  szSourceLength = ftell(pFileStream);
-  fseek(pFileStream, 0, SEEK_SET);
-
-  // allocate a buffer for the source code string and read it in
-  char* cSourceString = (char *)malloc(szSourceLength + 1);
-  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
-  {
-      fclose(pFileStream);
-      free(cSourceString);
-      return 0;
-  }
-
-  // close the file and return the total length of the combined (preamble + source) string
-  fclose(pFileStream);
-  if(szFinalLength != 0)
-  {
-      *szFinalLength = szSourceLength;
-  }
-  cSourceString[szSourceLength] = '\0';
-
-  return cSourceString;
-}
-
-void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Launch OCL Kernel\n");
-  // Initialize OpenCL
-
-  // OpenCL specific variables
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL));
-
-  size_t kernelLength;
-  cl_int errcode;
-
-  // For a single context for all kernels
-  Context->clOCLContext = globalOCLContext;
-
-  //Create a command-queue
-  //pthread_mutex_lock(&ocl_mtx);
-  Context->clCommandQue = clCreateCommandQueue(Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);
-  globalCommandQue = Context->clCommandQue;
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
-
-  DEBUG(cout << "Loading program binary: " << FileName << flush << "\n");
-  char *programSource = LoadProgSource(FileName, &kernelLength);
-  checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
-
-  cl_int binaryStatus;
-  //pthread_mutex_lock(&ocl_mtx);
-  Context->clProgram = clCreateProgramWithBinary(Context->clOCLContext, 1, &clDevices[0],
-                                        &kernelLength,
-                                        (const unsigned char **)&programSource,
-                                        &binaryStatus, &errcode);
-  //pthread_mutex_unlock(&ocl_mtx);
-  checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
-
-  DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName << flush << "\n");
-  errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL);
-  // If build fails, get build log from device
-  if(errcode != CL_SUCCESS) {
-    cout << "ERROR: Failure to build program\n";
-    size_t len = 0;
-    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0,
-        NULL, &len);
-    cout << "LOG LENGTH: " << len << flush << "\n";
-    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
-    char *log = (char*) malloc(len*sizeof(char));
-    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len,
-        log, NULL);
-    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
-
-    cout << "Device Build Log:\n" << log << flush << "\n";
-    free(log);
-    pthread_mutex_unlock(&ocl_mtx);
-    exit(EXIT_FAILURE);
-  }
-
-  Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
-
-  DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n");
-  //free(clDevices);
-  free(programSource);
-
-  pthread_mutex_unlock(&ocl_mtx);
-  return Context;
-}
-
-
-void llvm_visc_ocl_wait(void* graphID) {
-  pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "Wait\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID;
-  //pthread_mutex_lock(&ocl_mtx);
-  clFinish(Context->clCommandQue);
-  //pthread_mutex_unlock(&ocl_mtx);
-  pthread_mutex_unlock(&ocl_mtx);
-}
-
-void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID timer) {
-  //cout << "Switching to timer " << timer << flush << "\n";
-  pthread_mutex_lock(&ocl_mtx);
-  //visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer);
-  pthread_mutex_unlock(&ocl_mtx);
-}
-void llvm_visc_printTimerSet(void** timerSet, char* timerName) {
-  pthread_mutex_lock(&ocl_mtx);
-  cout << "Printing VISC Timer: ";
-  if(timerName != NULL)
-    cout << timerName << flush << "\n";
-  else
-    cout << "Anonymous\n";
-  visc_PrintTimerSet((visc_TimerSet*) (*timerSet));
-  pthread_mutex_unlock(&ocl_mtx);
-}
-
-void* llvm_visc_initializeTimerSet() {
-  pthread_mutex_lock(&ocl_mtx);
-  visc_TimerSet* TS = (visc_TimerSet*) malloc (sizeof(visc_TimerSet));
-  visc_InitializeTimerSet(TS);
-  pthread_mutex_unlock(&ocl_mtx);
-  return TS;
-}
-
-
-
diff --git a/hpvm/visc-rt/visc-rt.h b/hpvm/visc-rt/visc-rt.h
deleted file mode 100644
index 0a6cadebd3..0000000000
--- a/hpvm/visc-rt/visc-rt.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- *
- * (c) 2010 The Board of Trustees of the University of Illinois.
- */
-#ifndef VISC_RT_HEADER
-#define VISC_RT_HEADER
-
-#include <iostream>
-#include <map>
-#include <ctime>
-#include <vector>
-#include <pthread.h>
-#include <string>
-//#include <condition_variable>
-
-#include "../../include/SupportVISC/VISCHint.h"
-#include "../../include/SupportVISC/VISCTimer.h"
-#include "device_abstraction.h"
-#include "policy.h"
-
-#ifndef DEBUG_BUILD
-#define DEBUG(s) {}
-#else
-#define DEBUG(s) s
-#endif
-
-
-
-using namespace std;
-
-extern "C" {
-
-/************************* Policies *************************************/
-
-void llvm_visc_policy_init();
-void llvm_visc_policy_clear();
-int llvm_visc_policy_getVersion(const char *, int64_t);
-
-/******************** Device Abstraction ********************************/
-void llvm_visc_deviceAbstraction_start();
-void llvm_visc_deviceAbstraction_end();
-void llvm_visc_deviceAbstraction_waitOnDeviceStatus();
-
-/********************* DFG Depth Stack **********************************/
-class DFGDepth {
-  private:
-    unsigned numDim;
-    unsigned dimLimit[3];
-    unsigned dimInstance[3];
-  public:
-    DFGDepth() {}
-    DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, unsigned iY = 0,
-        unsigned dimZ = 0, unsigned iZ = 0) {
-      assert(n <= 3 && "Error! More than 3 dimensions not supported");
-      numDim = n;
-      dimLimit[0] = dimX;
-      dimLimit[1] = dimY;
-      dimLimit[2] = dimZ;
-      dimInstance[0] = iX;
-      dimInstance[1] = iY;
-      dimInstance[2] = iZ;
-    }
-    unsigned getDimLimit(unsigned dim) {
-      assert(dim <= numDim && "Error! Requested dimension limit is not specified");
-      return dimLimit[dim];
-    }
-
-    unsigned getDimInstance(unsigned dim) {
-      assert(dim <= numDim && "Error! Requested dimension instance is not specified");
-      return dimInstance[dim];
-    }
-
-    unsigned getNumDim() {
-      return numDim;
-    }
-};
-
-void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
-    uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0);
-void llvm_visc_x86_dstack_pop();
-uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
-uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
-
-
-/********************* Memory Tracker **********************************/
-class MemTrackerEntry {
-public:
-  enum Location {HOST, DEVICE};
-  private:
-    size_t size;
-    Location loc;
-    void* addr;
-    void* Context;
-
-  public:
-    MemTrackerEntry(size_t _size, Location _loc, void* _addr, void* _Context):
-      size(_size), loc(_loc), addr(_addr), Context(_Context) {
-    }
-
-    size_t getSize() {
-      return size;
-    }
-
-    Location getLocation() {
-      return loc;
-    }
-
-    void* getAddress() {
-      return addr;
-    }
-
-    void* getContext() {
-      return Context;
-    }
-
-    void update(Location _loc, void* _addr, void* _Context = NULL) {
-      loc = _loc;
-      addr = _addr;
-      Context = _Context;
-    }
-
-    void print() {
-      cout << "Size = " << size << "\tLocation = " << loc << "\tAddress = " << addr << "\tContext = " << Context;
-    }
-};
-
-
-class MemTracker {
-
-private:
-  std::map<void*, MemTrackerEntry*> Table;
-
-public:
-  MemTracker() {
-  }
-
-  bool insert(void* ID, size_t size, MemTrackerEntry::Location loc, void* addr, void* Context = NULL) {
-    MemTrackerEntry* MTE = new MemTrackerEntry(size, loc, addr, Context);
-    Table.insert(std::pair<void*, MemTrackerEntry*>(ID, MTE));
-    return MTE != NULL;
-  }
-
-  MemTrackerEntry* lookup(void* ID) {
-    if(Table.count(ID) == 0)
-      return NULL;
-    return Table[ID];
-  }
-
-  void remove(void* ID) {
-    MemTrackerEntry* MTE = Table[ID];
-    free(MTE);
-    Table.erase(ID);
-  }
-
-  void print() {
-    cout << "Printing Table ... Size = " << Table.size() << flush << "\n";
-    for(auto& Entry: Table) {
-      cout << Entry.first << ":\t" ;
-      Entry.second->print();
-      cout << flush << "\n";
-    }
-  }
-
-};
-
-void llvm_visc_track_mem(void*, size_t);
-void llvm_visc_untrack_mem(void*);
-void* llvm_visc_request_mem(void*, size_t);
-
-/*********************** OPENCL & PTHREAD API **************************/
-void* llvm_visc_x86_launch(void* (void*), void*);
-void llvm_visc_x86_wait(void*);
-void* llvm_visc_ocl_initContext(enum visc::Target);
-
-void* llvm_visc_x86_argument_ptr(void*, size_t);
-
-void llvm_visc_ocl_clearContext(void*);
-void llvm_visc_ocl_argument_shared(void*, int, size_t);
-void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t);
-void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool);
-void* llvm_visc_ocl_output_ptr(void*, int, size_t);
-void llvm_visc_ocl_free(void*);
-void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t);
-void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*);
-void* llvm_visc_ocl_launch(const char*, const char*);
-void llvm_visc_ocl_wait(void*);
-
-void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID);
-void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL);
-void* llvm_visc_initializeTimerSet();
-
-}
-
-/*************************** Pipeline API ******************************/
-// Circular Buffer class
-unsigned counter = 0;
-template <class ElementType>
-class CircularBuffer {
-private:
-    int numElements;
-    int bufferSize;
-    int Head;
-    int Tail;
-    pthread_mutex_t mtx;
-    pthread_cond_t cv;
-    vector<ElementType> buffer;
-    std::string name;
-    unsigned ID;
-
-public:
-    CircularBuffer(int maxElements, std::string _name =  "ANON") {
-        ID = counter;
-        Head = 0;
-        Tail = 0;
-        numElements = 0;
-        name = _name;
-        bufferSize = maxElements+1;
-        buffer.reserve(bufferSize);
-        pthread_mutex_init(&mtx, NULL);
-        pthread_cond_init(&cv, NULL);
-        counter++;
-
-    }
-
-    bool push(ElementType E);
-    ElementType pop();
-
-};
-
-template <class ElementType>
-bool CircularBuffer<ElementType>::push(ElementType E) {
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << "\n");
-    //unique_lock<mutex> lk(mtx);
-    pthread_mutex_lock(&mtx);
-    if((Head +1) % bufferSize == Tail) {
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Push going to sleep ...\n");
-        //cv.wait(lk);
-        pthread_cond_wait(&cv, &mtx);
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n");
-    }
-    buffer[Head] = E;
-    Head = (Head+1) % bufferSize;
-    numElements++;
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
-    //lk.unlock();
-    pthread_mutex_unlock(&mtx);
-    //cv.notify_one();
-    pthread_cond_signal(&cv);
-    return true;
-}
-
-template <class ElementType>
-ElementType CircularBuffer<ElementType>::pop() {
-    //unique_lock<mutex> lk(mtx);
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n");
-    pthread_mutex_lock(&mtx);
-    if(Tail == Head) {
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n");
-        //cv.wait(lk);
-        pthread_cond_wait(&cv, &mtx);
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n");
-    }
-    ElementType E = buffer[Tail];
-    Tail = (Tail + 1) % bufferSize;
-    numElements--;
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
-    //lk.unlock();
-    pthread_mutex_unlock(&mtx);
-    //cv.notify_one();
-    pthread_cond_signal(&cv);
-    return E;
-}
-
-extern "C" {
-// Functions to push and pop values from pipeline buffers
-uint64_t llvm_visc_bufferPop(void*);
-void llvm_visc_bufferPush(void*, uint64_t);
-
-// Functions to create and destroy buffers
-void* llvm_visc_createBindInBuffer(void*, uint64_t, unsigned);
-void* llvm_visc_createBindOutBuffer(void*, uint64_t);
-void* llvm_visc_createEdgeBuffer(void*, uint64_t);
-void* llvm_visc_createLastInputBuffer(void*, uint64_t);
-
-void llvm_visc_freeBuffers(void*);
-
-// Functions to create and destroy threads
-void llvm_visc_createThread(void* graphID, void*(*Func)(void*), void*);
-void llvm_visc_freeThreads(void*);
-
-// Launch API for a streaming graph.
-// Arguments:
-// (1) Launch Function: void* (void*, void*)
-// (2) Push Function:   void (void*, std::vector<uint64_t>**, unsgined)
-// (3) Pop Function:    void* (std::vector<uint64_t>**, unsigned)
-void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void*);
-void llvm_visc_streamPush(void* graphID, void* args);
-void* llvm_visc_streamPop(void* graphID);
-void llvm_visc_streamWait(void* graphID);
-
-}
-
-#endif //VISC_RT_HEADER
-- 
GitLab