diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td
index e40bec31faa8ecfa21f8024fa67917d72c774691..ed848f99b5b4f18967fe779afe249657c95dad1c 100644
--- a/llvm/include/llvm/IR/IntrinsicsVISC.td
+++ b/llvm/include/llvm/IR/IntrinsicsVISC.td
@@ -18,12 +18,16 @@ let TargetPrefix = "visc" in {
    */
 
   /* Launch intrinsic -
-   * i8* llvm.visc.launch(i8*, i8*, int);
+   * i32 llvm.visc.launch(graphID*, function* , ArgList*);
    */
-  def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty,
-                                   llvm_i32_ty], []>;
+  def int_visc_launch : Intrinsic<[llvm_i32_ty], [llvm_ptrptr_ty, llvm_ptr_ty,
+                                  llvm_ptr_ty], []>;
+
+  /* Wait intrinsic -
+   * i32 llvm.visc.wait(graphID);
+   */
+  def int_visc_wait : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], []>;
 
-  def int_visc_test : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], []>;
   /* Create Node intrinsic -
    * i8* llvm.visc.createNode(function*);
    */
diff --git a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index 8081f4065eb6b51e2fd61e866f7255510b480617..466bc41e14dee6d7d4ba68aee643495a3ebbb7f2 100644
--- a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -49,7 +49,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
   // Intrinsic Instruction has been initialized from this point on.
 
-  Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts());
+  Function* F = cast<Function>((II->getOperand(1))->stripPointerCasts());
   Root = DFInternalNode::Create(II, F);
   BuildGraph(Root, F);
 
@@ -313,9 +313,6 @@ void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) {
     if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) {
       errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " << II->getCalledFunction()->getName()<<"\n";
       switch(II->getIntrinsicID()) {
-      case Intrinsic::visc_test:
-        errs() << "Found Test Intrinsic";
-        break;
 
       case Intrinsic::visc_createNode:
       case Intrinsic::visc_createNode1D:
@@ -335,8 +332,9 @@ void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) {
         break;
 
       //TODO: Reconsider launch within a dataflow graph (recursion?)
+      case Intrinsic::visc_wait:
       case Intrinsic::visc_launch:
-        errs() << "Error: Launch intrinsic used within a dataflow graph\n";
+        errs() << "Error: Launch/wait intrinsic used within a dataflow graph\n";
         break;
 
       default:
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index d4cfeccad46faa9ca5606ed7ce03ed1a2cdbe5cd..8c111a64b54b024d584e11b9f1793c391d1cfc20 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -63,10 +63,6 @@ private:
   Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
   Argument* getArgumentFromEnd(Function* F, unsigned offset);
   Argument* getArgumentAt(Function* F, unsigned offset);
-  Constant* getOrInsertPThreadCreate();
-  Constant* getOrInsertPThreadJoin();
-  Constant* getOrInsertPThreadExit();
-  Constant* getOrInsertMalloc();
   void codeGenLaunch(DFInternalNode* Root);
   void codeGen(DFInternalNode* N);
   void codeGen(DFLeafNode* N);
@@ -117,6 +113,41 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   return true;
 }
 
+/* Collects every call that uses a graph ID loaded from the address passed as
+ * operand 0 of launch call LI. The vector is heap-allocated; caller deletes it.
+ */
+std::vector<CallInst*>* getWaitList(CallInst* LI) {
+  Value* GraphIDAddr = LI->getArgOperand(0);
+  std::vector<CallInst*>* WaitList = new std::vector<CallInst*>();
+  // The graph ID must have been loaded from memory before it can be waited on
+  for(Value::use_iterator ui = GraphIDAddr->use_begin(),
+      ue = GraphIDAddr->use_end(); ui!=ue; ++ui) {
+    if(LoadInst* LdI = dyn_cast<LoadInst>(*ui)) {
+      DEBUG(errs() << *LdI << "\n");
+      for(Value::use_iterator i = LdI->use_begin(), e = LdI->use_end(); i!=e; ++i) {
+        if(CallInst* waitI = dyn_cast<CallInst>(*i)) {
+          DEBUG(errs() << *waitI << "\n");
+          WaitList->push_back(waitI);
+        }
+      }
+    }
+    // If graphID memory address is used by another launch, then break
+    if(CallInst* CI = dyn_cast<CallInst>(*ui)) {
+      if(LI != CI) {
+        DEBUG(errs()<< "Warning: Overwriting graph ID in memory  -- " << *CI << "\n" << *LI << "\n");
+        break;
+      }
+    }
+    // A store whose destination is the graphID address overwrites the ID: error
+    if(StoreInst* SI =dyn_cast<StoreInst>(*ui)) {
+      assert(SI->getPointerOperand() != GraphIDAddr
+          && "Error: Do not manually write over graphID in memory!");
+    }
+
+  }
+  return WaitList;
+}
+
 void CodeGenTraversal::addIdxDimArgs(Function* F) {
   // Add Index and Dim arguments
   std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
@@ -224,6 +255,10 @@ Value* CodeGenTraversal::addLoop(Instruction* I, Value* limit, const Twine& inde
 }
 
 void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
+  // TODO: Place an assert to check if the constant passed by launch intrinsic
+  // as the number of arguments to DFG is same as the number of arguments of the
+  // root of DFG
+
   // Get Launch Instruction
   IntrinsicInst* LI = Root->getInstruction();
 
@@ -233,54 +268,42 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
   Type* i8Ty = Type::getInt8Ty(LI->getContext());
   Type* voidTy = Type::getVoidTy(LI->getContext());
 
-  /* Get or Insert pthread utilities necessary to run DFG as a separate thread
-   * (1) pthread_create
-   * (2) pthread_join
-   * (3) pthread_exit
-   * Also requires a new struct for pthread_attr_t
+  /* Get or Insert visc runtime utilities necessary to run DFG as a separate thread
+   * (1) llvm_visc_launch_x86
+   * (2) llvm_visc_wait_x86
    */
-  Type *PThreadTy, *PThreadAttrTy, *PThreadArgTy;
+  Type *GraphIDTy;
   std::vector<Type*>Elements;
   // PThreads use different attribute types for 32-bit and 64-bit machines
   if(M.getPointerSize() == Module::Pointer64) {
-    PThreadTy = Type::getInt64Ty(LI->getContext());
-    Elements.push_back(ArrayType::get(Type::getInt8Ty(LI->getContext()), 56));
+    GraphIDTy = i64Ty;
   }
   else {
-    PThreadTy = Type::getInt32Ty(LI->getContext());
-    Elements.push_back(ArrayType::get(Type::getInt8Ty(LI->getContext()), 36));
+    GraphIDTy = i32Ty;
   }
 
-  PThreadAttrTy = StructType::create(LI->getContext(), Elements, "union.pthread_attr_t");
-  FunctionType* PThreadFuncTy = FunctionType::get(i8Ty->getPointerTo(),
-                                                  ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                                  false);
-  PThreadArgTy = i8Ty->getPointerTo();
-
-  // Argument types for pthread_create
-  Type* ArgTypesPTCreate[] = {PThreadTy->getPointerTo(),
-                              PThreadAttrTy->getPointerTo(),
-                              PThreadFuncTy->getPointerTo(),
-                              i8Ty->getPointerTo()};
-  // Construct FunctionType of pthread_create call
-  FunctionType* PThreadCreateTy = FunctionType::get(i32Ty,
-                                                    ArrayRef<Type*>(ArgTypesPTCreate, 4),
-                                                    false);
-  // Argument types for pthread_join
-  Type* ArgTypesPTJoin[] = {PThreadTy,
-                            i8Ty->getPointerTo()->getPointerTo()};
-  // Construct FunctionType for pthread_join call
-  FunctionType* PThreadJoinTy = FunctionType::get(i32Ty,
-                                                  ArrayRef<Type*>(ArgTypesPTJoin, 2),
-                                                  false);
-  // Construct FunctionType for pthread_exit call
-  FunctionType* PThreadExitTy = FunctionType::get(voidTy,
+  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
                                                   ArrayRef<Type*>(i8Ty->getPointerTo()),
                                                   false);
+  // Argument types for llvm_visc_launch_x86
+  Type* ArgTypesLaunch[] = {i8Ty->getPointerTo()->getPointerTo(),
+                            AppFuncTy->getPointerTo(),
+                            i8Ty->getPointerTo()};
+
+  // Construct FunctionType of llvm_visc_launch_x86 call
+  FunctionType* LaunchFuncTy = FunctionType::get(i32Ty,
+                                                 ArrayRef<Type*>(ArgTypesLaunch, 3),
+                                                 false);
+
+  // Construct FunctionType for llvm_visc_wait_x86 call
+  FunctionType* WaitFuncTy = FunctionType::get(i32Ty,
+                                              ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                              false);
+
   // Get or insert the global declarations for pthread functions
-  Constant* PThreadCreate = M.getOrInsertFunction("pthread_create", PThreadCreateTy);
-  Constant* PThreadJoin = M.getOrInsertFunction("pthread_join", PThreadJoinTy);
-  Constant* PThreadExit = M.getOrInsertFunction("pthread_exit", PThreadExitTy);
+  Constant* Launch = M.getOrInsertFunction("llvm_visc_launch_x86", LaunchFuncTy);
+  Constant* Wait = M.getOrInsertFunction("llvm_visc_wait_x86", WaitFuncTy);
+
   // Construct FunctionType for malloc call
   FunctionType* MallocTy = FunctionType::get(i8Ty->getPointerTo(),
                                             ArrayRef<Type*>(i64Ty),
@@ -301,17 +324,17 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
    * passed to pthread_exit call.
    */
   // Create Launch Function of type i8*(i8*) which calls the root function 
-  Function* LaunchFunc = Function::Create(PThreadFuncTy,
-                                          Root->getFuncPointer()->getLinkage(),
-                                          "LaunchDataflowGraph",
-                                          &M);
+  Function* AppFunc = Function::Create(AppFuncTy,
+                                       Root->getFuncPointer()->getLinkage(),
+                                       "LaunchDataflowGraph",
+                                       &M);
   // Give a name to the argument which is used pass data to this thread
-  Value* data = LaunchFunc->arg_begin();
+  Value* data = AppFunc->arg_begin();
   data->setName("data.addr");
   // Add a basic block to this empty function and a return null statement to it
-  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
-  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
-                                      Constant::getNullValue(LaunchFunc->getReturnType()),
+  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
+  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
+                                      Constant::getNullValue(AppFunc->getReturnType()),
                                       BB);
   // Find the X86 function generated for Root and 
   Function* RootF_X86 = FMap[Root->getFuncPointer()];
@@ -351,46 +374,38 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
     argNum++;
     data = GEP;
   }
+
   // Code for returning the output
   Constant* SizeOf = ConstantExpr::getSizeOf(CI->getType());
-  CallInst* OutputAddr = CallInst::Create(Malloc, ArrayRef<Value*>(SizeOf), "output.addr", RI);
-  CastInst* OutputAddrCast = CastInst::CreatePointerCast(OutputAddr,
+  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
                                                         CI->getType()->getPointerTo(),
                                                         CI->getName()+".addr",
                                                         RI);
   new StoreInst(CI, OutputAddrCast, RI);
 
-  CallInst::Create(PThreadExit, ArrayRef<Value*>(OutputAddr), "", RI);
-  DEBUG(errs() << "Launch Function:\n");
-  DEBUG(errs() << *LaunchFunc << "\n");
+  DEBUG(errs() << "Application specific function:\n");
+  DEBUG(errs() << *AppFunc << "\n");
  
   // Substitute launch intrinsic main
-  AllocaInst* AI = new AllocaInst(PThreadTy, "DFG_threadID.addr", LI);
-  DEBUG(errs() << *AI << "\n");
-  Value* PTCreateArgs[] = {AI,
-                          Constant::getNullValue(PThreadCreateTy->getParamType(1)),
-                          LaunchFunc,
-                          LI->getArgOperand(1)};
-  CallInst* PTCreateInst = CallInst::Create(PThreadCreate,
-                                            ArrayRef<Value*>(PTCreateArgs,4),
-                                            "", LI);
-
-  DEBUG(errs() << *PTCreateInst << "\n");
-  // Place Join
-  LoadInst* LoadPThreadID = new LoadInst(AI, "DFG_threadID", LI->getParent()->getTerminator());
-  AllocaInst* DFGOut = new AllocaInst(RootF_X86->getReturnType()->getPointerTo(),
-                                      LaunchFunc->getName()+".out",
-                                      LI->getParent()->getTerminator());
-  CastInst* DFGOutCast = CastInst::CreatePointerCast(DFGOut,
-      i8Ty->getPointerTo()->getPointerTo(), "DFG_return",
-      LI->getParent()->getTerminator());
-
-  Value* PTJoinArgs[] = {LoadPThreadID, DFGOutCast};
-  CallInst* PTJoinInst = CallInst::Create(PThreadJoin,
-                                            ArrayRef<Value*>(PTJoinArgs,2),
-                                            "",
-                                            LI->getParent()->getTerminator());
-  DEBUG(errs() << *PTJoinInst << "\n");
+  Value* LaunchInstArgs[] = {LI->getArgOperand(0),
+                            AppFunc,
+                            LI->getArgOperand(2)};
+  CallInst* LaunchInst = CallInst::Create(Launch,
+                                          ArrayRef<Value*>(LaunchInstArgs,3),
+                                          "", LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait instructions with x86 specific wait instructions
+  std::vector<CallInst*>* WaitList = getWaitList(LaunchInst);
+  for(unsigned i=0; i < WaitList->size(); ++i) {
+    CallInst* waitI = WaitList->at(i);
+    CallInst* waitI_X86 = CallInst::Create(Wait,
+                                          ArrayRef<Value*>(waitI->getArgOperand(0)),
+                                          "");
+    ReplaceInstWithInst(waitI, waitI_X86);
+    DEBUG(errs() << *waitI_X86 << "\n");
+  }
 
 }
 
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 7945230f264f1ec6a5ca9fd7fe02d0e3b7fbb42c..9080a31b4a4a39aaea7ef4dffac4f055a24ad48d 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -15,14 +15,14 @@ typedef struct {
 } DFNodeContext_PTX;
 
 extern "C"
-__int32_t llvm_visc_launch_x86(size_t* graphID, void* (*rootFunc)(void*), void* arguments) {
+__int32_t llvm_visc_launch_x86(void** graphID, void* (*rootFunc)(void*), void* arguments) {
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *) malloc(sizeof(DFNodeContext_X86));
+  *graphID = Context;
   return pthread_create(&Context->threadID, NULL, rootFunc, arguments);
-  *graphID = (size_t) Context;
 }
 
 extern "C"
-__int32_t llvm_visc_wait_x86(size_t graphID) {
+__int32_t llvm_visc_wait_x86(void* graphID) {
   DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
   return pthread_join(Context->threadID, NULL);
 }
@@ -87,14 +87,14 @@ static char* LoadProgSource(const char* cFilename, size_t* szFinalLength)
 }
 
 extern "C"
-__int32_t llvm_visc_launch_ptx(size_t* graphID, void* (*rootFunc) (void*), void* arguments) {
+__int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* arguments) {
   // Initialize OpenCL
 
   // OpenCL specific variables
   DFNodeContext_PTX *Context = (DFNodeContext_PTX *) malloc(sizeof(DFNodeContext_PTX));
   
   // Return Context pointer as grpahID;
-  *graphID = (size_t) Context;
+  *graphID = Context;
 
 
   size_t dataBytes;
@@ -117,7 +117,7 @@ __int32_t llvm_visc_launch_ptx(size_t* graphID, void* (*rootFunc) (void*), void*
   checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
 
   // now get all the platform IDs
-  cl_platform_id platforms[numPlatforms];
+  cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
   errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
 
@@ -242,7 +242,7 @@ __int32_t llvm_visc_launch_ptx(size_t* graphID, void* (*rootFunc) (void*), void*
 
 
 extern "C" 
-__int32_t llvm_visc_wait_ptx(size_t graphID) {
+__int32_t llvm_visc_wait_ptx(void* graphID) {
   DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID;
   clFinish(Context->clCommandQue);
 
diff --git a/llvm/test/VISC/unitTests/3level.ll b/llvm/test/VISC/unitTests/3level.ll
index dbb13c69510888f2a10866e800058a23cb332b50..c884acf32cafdce42a5b8e219c3a1ab32676afae 100644
--- a/llvm/test/VISC/unitTests/3level.ll
+++ b/llvm/test/VISC/unitTests/3level.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/3level.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -16,7 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -30,15 +34,25 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype }
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32*
+  store i32 %conv.i, i32* %1
+  %args = bitcast { i32, %rtype }* %in.addr to i8*
+  %graphIDloc = alloca i8*
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc
+  %wait = call i32 @llvm.visc.wait(i8* %graphID)
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1
+  %outputstruct = load %rtype* %2
+  %output1 = extractvalue %rtype %outputstruct, 0
+  %output2 = extractvalue %rtype %outputstruct, 1
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output1) #0
+  %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output2) #0
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/query2D.ll b/llvm/test/VISC/unitTests/query2D.ll
index c5d4bd2025b66144780e933f0b6b5c9b5032a5d0..9b2ad72c5abac5dd6488a3d8f0848d262d45f521 100644
--- a/llvm/test/VISC/unitTests/query2D.ll
+++ b/llvm/test/VISC/unitTests/query2D.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/query2D.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -22,7 +23,10 @@ declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -45,15 +49,23 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype }
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32*
+  store i32 %conv.i, i32* %1
+  %args = bitcast { i32, %rtype }* %in.addr to i8*
+  %graphIDloc = alloca i8*
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc
+  %wait = call i32 @llvm.visc.wait(i8* %graphID)
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1
+  %outputstruct = load %rtype* %2
+  %output = extractvalue %rtype %outputstruct, 0
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/query3D.ll b/llvm/test/VISC/unitTests/query3D.ll
index 9dc2e1d01ca2160265a544dd3e5ec088b5ad5457..bec2cb9ce94cf2be87fa8b2c107a26e7acae43c0 100644
--- a/llvm/test/VISC/unitTests/query3D.ll
+++ b/llvm/test/VISC/unitTests/query3D.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/query3D.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -25,7 +26,10 @@ declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -48,15 +52,23 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype }
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32*
+  store i32 %conv.i, i32* %1
+  %args = bitcast { i32, %rtype }* %in.addr to i8*
+  %graphIDloc = alloca i8*
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc
+  %wait = call i32 @llvm.visc.wait(i8* %graphID)
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1
+  %outputstruct = load %rtype* %2
+  %output = extractvalue %rtype %outputstruct, 0
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/queryNodeInst.ll b/llvm/test/VISC/unitTests/queryNodeInst.ll
index 4418391e44d3dd941837dcc9c3833f5ccbc3f20d..258dff23dd3cd4b20891c04d9f990ae32397b041 100644
--- a/llvm/test/VISC/unitTests/queryNodeInst.ll
+++ b/llvm/test/VISC/unitTests/queryNodeInst.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -16,7 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -36,15 +40,23 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype }
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32*
+  store i32 %conv.i, i32* %1
+  %args = bitcast { i32, %rtype }* %in.addr to i8*
+  %graphIDloc = alloca i8*
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc
+  %wait = call i32 @llvm.visc.wait(i8* %graphID)
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1
+  %outputstruct = load %rtype* %2
+  %output = extractvalue %rtype %outputstruct, 0
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/queryNumDim.ll b/llvm/test/VISC/unitTests/queryNumDim.ll
index 4418391e44d3dd941837dcc9c3833f5ccbc3f20d..258dff23dd3cd4b20891c04d9f990ae32397b041 100644
--- a/llvm/test/VISC/unitTests/queryNumDim.ll
+++ b/llvm/test/VISC/unitTests/queryNumDim.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -16,7 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -36,15 +40,23 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype }
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32*
+  store i32 %conv.i, i32* %1
+  %args = bitcast { i32, %rtype }* %in.addr to i8*
+  %graphIDloc = alloca i8*
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc
+  %wait = call i32 @llvm.visc.wait(i8* %graphID)
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1
+  %outputstruct = load %rtype* %2
+  %output = extractvalue %rtype %outputstruct, 0
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/queryNumNodeInst.ll b/llvm/test/VISC/unitTests/queryNumNodeInst.ll
index 3432daf5104291508ff01db5fe94ebc6948cf92a..ce6a639215f8ad2602452581d46d719980db36bf 100644
--- a/llvm/test/VISC/unitTests/queryNumNodeInst.ll
+++ b/llvm/test/VISC/unitTests/queryNumNodeInst.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -19,7 +20,10 @@ declare i8* @llvm.visc.createNode1D(i8*, i32) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -42,15 +46,23 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype } ; argument buffer: i32 input followed by a %rtype slot that is read back after the wait call
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32* ; address of field 0 (the i32 input)
+  store i32 %conv.i, i32* %1 ; input = argv[1] parsed by strtol and truncated to i32
+  %args = bitcast { i32, %rtype }* %in.addr to i8* ; same buffer as an untyped pointer for the launch call
+  %graphIDloc = alloca i8* ; slot handed to launch as its first operand; NOTE(review): presumably launch stores the graph handle here - confirm
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) ; i32 result is not used in @main
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc ; read the slot back after the launch call
+  %wait = call i32 @llvm.visc.wait(i8* %graphID) ; i32 result is not used; NOTE(review): presumably blocks until the graph finishes - confirm
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1 ; address of field 1 (the %rtype slot)
+  %outputstruct = load %rtype* %2 ; load the whole %rtype value from the buffer
+  %output = extractvalue %rtype %outputstruct, 0 ; element 0 of the %rtype value
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 ; printed with the same @.str format as the input
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/singleNode.ll b/llvm/test/VISC/unitTests/singleNode.ll
index 162661793beb53186c94f9e5202fdabd2bc3dd53..1d45ff55cb467301dc1ab49b642c221e7933ab5b 100644
--- a/llvm/test/VISC/unitTests/singleNode.ll
+++ b/llvm/test/VISC/unitTests/singleNode.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/singleNode.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -16,17 +17,25 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0 ; operands as passed by @main: graph-ID slot, @Root cast to i8*, argument buffer
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0 ; @main passes the i8* it loads from the graph-ID slot
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
+  %in.addr = alloca { %rtype } ; argument buffer holding only a %rtype slot; @main neither stores to it nor loads from it
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype ()* @Root to i8*), i8* undef, i32 0)
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
+  %args = bitcast { %rtype }* %in.addr to i8* ; buffer as an untyped pointer for the launch call
+  %graphIDloc = alloca i8* ; slot handed to launch as its first operand; NOTE(review): presumably launch stores the graph handle here - confirm
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype ()* @Root to i8*), i8* %args) ; @Root takes no parameters here; i32 result is not used in @main
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc ; read the slot back after the launch call
+  %wait = call i32 @llvm.visc.wait(i8* %graphID) ; i32 result is not used; NOTE(review): presumably blocks until the graph finishes - confirm
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/twoNode.ll b/llvm/test/VISC/unitTests/twoNode.ll
index 31c131970e18e1fd0e5659d8c4cafc56ad875123..b0626a988ff68ee8aebb79a7edf85fc9bbb40a55 100644
--- a/llvm/test/VISC/unitTests/twoNode.ll
+++ b/llvm/test/VISC/unitTests/twoNode.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNode.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -16,20 +17,27 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0 ; operands as passed by @main: graph-ID slot, @Root cast to i8*, argument buffer
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0 ; @main passes the i8* it loads from the graph-ID slot
 
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype } ; argument buffer: i32 input followed by a %rtype slot (the slot is not read back in @main)
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32* ; address of field 0 (the i32 input)
+  store i32 %conv.i, i32* %1 ; input = argv[1] parsed by strtol and truncated to i32
+  %args = bitcast { i32, %rtype }* %in.addr to i8* ; same buffer as an untyped pointer for the launch call
+  %graphIDloc = alloca i8* ; slot handed to launch as its first operand; NOTE(review): presumably launch stores the graph handle here - confirm
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) ; i32 result is not used in @main
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc ; read the slot back after the launch call
+  %wait = call i32 @llvm.visc.wait(i8* %graphID) ; i32 result is not used; NOTE(review): presumably blocks until the graph finishes - confirm
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/twoNodeConnect.ll b/llvm/test/VISC/unitTests/twoNodeConnect.ll
index b5103faa8efe504963d2e931316653f534427ae0..a005c8fadf11e555a892ab82114b09b6b8d75e39 100644
--- a/llvm/test/VISC/unitTests/twoNodeConnect.ll
+++ b/llvm/test/VISC/unitTests/twoNodeConnect.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeConnect.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -16,7 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0 ; operands as passed by @main: graph-ID slot, @Root cast to i8*, argument buffer
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0 ; @main passes the i8* it loads from the graph-ID slot
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -30,15 +34,23 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype } ; argument buffer: i32 input followed by a %rtype slot that is read back after the wait call
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32* ; address of field 0 (the i32 input)
+  store i32 %conv.i, i32* %1 ; input = argv[1] parsed by strtol and truncated to i32
+  %args = bitcast { i32, %rtype }* %in.addr to i8* ; same buffer as an untyped pointer for the launch call
+  %graphIDloc = alloca i8* ; slot handed to launch as its first operand; NOTE(review): presumably launch stores the graph handle here - confirm
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) ; i32 result is not used in @main
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc ; read the slot back after the launch call
+  %wait = call i32 @llvm.visc.wait(i8* %graphID) ; i32 result is not used; NOTE(review): presumably blocks until the graph finishes - confirm
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1 ; address of field 1 (the %rtype slot)
+  %outputstruct = load %rtype* %2 ; load the whole %rtype value from the buffer
+  %output = extractvalue %rtype %outputstruct, 0 ; element 0 of the %rtype value
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 ; printed with the same @.str format as the input
   ret i32 0
 }
 
diff --git a/llvm/test/VISC/unitTests/twoNodeQuery.ll b/llvm/test/VISC/unitTests/twoNodeQuery.ll
index ce38982c0379a610030bcb5b85ae2a599a69960d..08c69507be55c425f53aa8d41ea7f68d62e3ae48 100644
--- a/llvm/test/VISC/unitTests/twoNodeQuery.ll
+++ b/llvm/test/VISC/unitTests/twoNodeQuery.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: clang %t.ll -lpthread -o %t.bin
+; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
+; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
 ; RUN: %t.bin 5
 ; ModuleID = '/home/psrivas2/current-test/unitTests/twoNodeQuery.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -16,7 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*, i32) #0
+declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0 ; operands as passed by @main: graph-ID slot, @Root cast to i8*, argument buffer
+
+; Function Attrs: nounwind
+declare i32 @llvm.visc.wait(i8*) #0 ; @main passes the i8* it loads from the graph-ID slot
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -36,15 +40,23 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #1 {
 entry:
-  %in.addr = alloca i32
+  %in.addr = alloca { i32, %rtype } ; argument buffer: i32 input followed by a %rtype slot that is read back after the wait call
   %arrayidx = getelementptr inbounds i8** %argv, i64 1
   %0 = load i8** %arrayidx, align 8, !tbaa !0
   %call.i = tail call i64 @strtol(i8* nocapture %0, i8** null, i32 10) #0
   %conv.i = trunc i64 %call.i to i32
-  store i32 %conv.i, i32* %in.addr
-  %args = bitcast i32* %in.addr to i8*
-  %launch = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args, i32 1)
+  %1 = bitcast { i32, %rtype }* %in.addr to i32* ; address of field 0 (the i32 input)
+  store i32 %conv.i, i32* %1 ; input = argv[1] parsed by strtol and truncated to i32
+  %args = bitcast { i32, %rtype }* %in.addr to i8* ; same buffer as an untyped pointer for the launch call
+  %graphIDloc = alloca i8* ; slot handed to launch as its first operand; NOTE(review): presumably launch stores the graph handle here - confirm
+  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) ; i32 result is not used in @main
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
+  %graphID = load i8** %graphIDloc ; read the slot back after the launch call
+  %wait = call i32 @llvm.visc.wait(i8* %graphID) ; i32 result is not used; NOTE(review): presumably blocks until the graph finishes - confirm
+  %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1 ; address of field 1 (the %rtype slot)
+  %outputstruct = load %rtype* %2 ; load the whole %rtype value from the buffer
+  %output = extractvalue %rtype %outputstruct, 0 ; element 0 of the %rtype value
+  %call2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %output) #0 ; printed with the same @.str format as the input
   ret i32 0
 }