diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7945230f264f1ec6a5ca9fd7fe02d0e3b7fbb42c
--- /dev/null
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -0,0 +1,255 @@
+#include <pthread.h>
+#include <cstdlib>
+#include <cstdio>
+#include <CL/cl.h>
+
+typedef struct {
+  pthread_t threadID; 
+} DFNodeContext_X86;
+
+typedef struct {
+  cl_context clGPUContext;
+  cl_command_queue clCommandQue;
+  cl_program clProgram;
+  cl_kernel clKernel;
+} DFNodeContext_PTX;
+
+extern "C"
+__int32_t llvm_visc_launch_x86(size_t* graphID, void* (*rootFunc)(void*), void* arguments) {
+  DFNodeContext_X86 *Context = (DFNodeContext_X86 *) malloc(sizeof(DFNodeContext_X86));
+  return pthread_create(&Context->threadID, NULL, rootFunc, arguments);
+  *graphID = (size_t) Context;
+}
+
+extern "C"
+__int32_t llvm_visc_wait_x86(size_t graphID) {
+  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  return pthread_join(Context->threadID, NULL);
+}
+
+static inline void checkErr(cl_int err, cl_int success, const char * name) {
+    if (err != success) {
+          printf("ERROR: %s\n", name);
+              exit(EXIT_FAILURE);
+                }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program binary file.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+static char* LoadProgSource(const char* cFilename, size_t* szFinalLength)
+{
+    // locals
+    FILE* pFileStream = NULL;
+    size_t szSourceLength;
+
+    // open the OpenCL source code file
+    #ifdef _WIN32   // Windows version
+        if(fopen_s(&pFileStream, cFilename, "rb") != 0)
+        {
+            return NULL;
+        }
+    #else           // Linux version
+        pFileStream = fopen(cFilename, "rb");
+        if(pFileStream == 0)
+        {
+            return NULL;
+        }
+    #endif
+
+    // get the length of the source code
+    fseek(pFileStream, 0, SEEK_END);
+    szSourceLength = ftell(pFileStream);
+    fseek(pFileStream, 0, SEEK_SET);
+
+    // allocate a buffer for the source code string and read it in
+    char* cSourceString = (char *)malloc(szSourceLength + 1);
+    if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
+    {
+        fclose(pFileStream);
+        free(cSourceString);
+        return 0;
+    }
+
+    // close the file and return the total length of the combined (preamble + source) string
+    fclose(pFileStream);
+    if(szFinalLength != 0)
+    {
+        *szFinalLength = szSourceLength;
+    }
+    cSourceString[szSourceLength] = '\0';
+
+    return cSourceString;
+}
+
+extern "C"
+__int32_t llvm_visc_launch_ptx(size_t* graphID, void* (*rootFunc) (void*), void* arguments) {
+  // Initialize OpenCL
+
+  // OpenCL specific variables
+  DFNodeContext_PTX *Context = (DFNodeContext_PTX *) malloc(sizeof(DFNodeContext_PTX));
+  
+  // Return Context pointer as grpahID;
+  *graphID = (size_t) Context;
+
+
+  size_t dataBytes;
+  size_t kernelLength;
+  cl_int errcode;
+
+  /* Application Specific Code
+  // OpenCL device memory for matrices
+  cl_mem d_A;
+  cl_mem d_B;
+  cl_mem d_C;
+  */
+
+  /*****************************************/
+  /* Initialize OpenCL */
+  /*****************************************/
+  // query the number of platforms
+  cl_uint numPlatforms;
+  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
+  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
+
+  // now get all the platform IDs
+  cl_platform_id platforms[numPlatforms];
+  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
+
+  for(unsigned i=0; i < numPlatforms; i++) {
+    char buffer[10240];
+    printf("  -- %d --\n", i);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
+    printf("  PROFILE = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
+    printf("  VERSION = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
+    printf("  NAME = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
+    printf("  VENDOR = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+    printf("  EXTENSIONS = %s\n", buffer);
+  }
+  // set platform property - just pick the first one
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (long) platforms[0],
+                                        0};
+  Context->clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
+                                         NULL, NULL, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
+
+  // get the list of GPU devices associated with context
+  errcode = clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, 0,
+                              NULL, &dataBytes);
+  cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
+  errcode |= clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, dataBytes,
+                              clDevices, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to get context info");
+
+  //Create a command-queue
+  Context->clCommandQue = clCreateCommandQueue(Context->clGPUContext, clDevices[0], 0, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
+
+  /* Application specific code
+  // Setup device memory
+  d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
+                       &errcode);
+  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                       bytes_A, h_A, &errcode);
+  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                       bytes_B, h_B, &errcode);
+  */
+
+  char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &kernelLength);
+  checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+  
+  cl_int binaryStatus;
+  Context->clProgram = clCreateProgramWithBinary(Context->clGPUContext, 1, &clDevices[0],
+                                        &kernelLength,
+                                        (const unsigned char **)&clMatrixMul,
+                                        &binaryStatus, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
+
+  errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to build program");
+
+  Context->clKernel = clCreateKernel(Context->clProgram, "matrixMul", &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
+
+  
+
+  // Invoke the callback function to put memory allocations in place
+  rootFunc(graphID);
+
+  /* Application Specific Code
+  // Launch OpenCL kernel
+  size_t localWorkSize[2], globalWorkSize[2];
+
+  int wA = WA;
+  int wC = WC;
+  errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
+  errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
+  errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
+  errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
+  errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
+  checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments");
+ 
+  localWorkSize[0] = BLOCK_SIZE;
+  localWorkSize[1] = BLOCK_SIZE;
+  globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
+  globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
+ 
+  errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
+                                   globalWorkSize, localWorkSize,
+                                   0, NULL, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
+  
+ 
+  // Retrieve result from device
+  errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
+                                h_C, 0, NULL, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to read buffer");
+  */ 
+
+  /* App specific code
+  // Deallocate memory
+  free(h_A);
+  free(h_B);
+  free(h_C);
+
+  clReleaseMemObject(d_A);
+  clReleaseMemObject(d_C);
+  clReleaseMemObject(d_B);
+  */
+
+  free(clDevices);
+  free(clMatrixMul);
+
+  /*
+  // Free in wait implementation
+  clReleaseContext(Context->clGPUContext);
+  clReleaseKernel(Context->clKernel);
+  clReleaseProgram(Context->clProgram);
+  */
+
+  return 0;
+}
+
+
+extern "C" 
+__int32_t llvm_visc_wait_ptx(size_t graphID) {
+  DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID;
+  clFinish(Context->clCommandQue);
+
+  // Release
+  clReleaseContext(Context->clGPUContext);
+  clReleaseKernel(Context->clKernel);
+  clReleaseProgram(Context->clProgram);
+
+  return 0;
+}