diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index 2d935b3c5f3190f2341dd5ac39dfb81eef0f7074..6093fe9297701038396e8c6b366efd3fadb981ed 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -154,6 +154,11 @@ public:
     InternalNode,
     LeafNode
   };
+  // To identify for which backend code has been generated
+  enum CodeGenTag {
+    X86,
+    PTX
+  };
 
 private:
   typedef std::vector<DFNode*> DFNodeListType;
@@ -162,6 +167,7 @@ private:
   // Important things that make up a Dataflow Node
   IntrinsicInst* II;              ///< Associated IntrinsicInst/Value
   Function* FuncPointer;          ///< Associated Function
+  Function* GenFunc;              ///< Associated Function generated by backend
   DFInternalNode* Parent;         ///< Pointer to parent dataflow Node
   unsigned NumOfDim;              ///< Number of dimensions
   std::vector<Value*> DimLimits;  ///< Number of instances in each dimension
@@ -179,6 +185,8 @@ private:
                                   ///< hierarchy
   int Rank;                       ///< Ordering based on toplogical sort
   const DFNodeKind Kind;          ///< Kind of Node Internal/Leaf
+  CodeGenTag Tag;                 // Code Generated for which backend
+  
 
 public:
   // Iterators
@@ -247,7 +255,7 @@ public:
   DFNodeKind getKind() const {
     return Kind;
   }
-
+  
   DFNode(IntrinsicInst* _II, Function* _FuncPointer, DFInternalNode* _Parent,
          unsigned _NumOfDim, std::vector<Value*> _DimLimits, DFNodeKind _K);
 
@@ -302,6 +310,23 @@ public:
     return Rank;
   }
 
+  void setTag(CodeGenTag T) {
+    Tag = T; 
+  }
+
+  CodeGenTag getTag() const {
+    return Tag; 
+  }
+
+  void setGenFunc(Function* F, CodeGenTag T) {
+    GenFunc = F;
+    Tag = T;
+  }
+
+  Function* getGenFunc() {
+    return GenFunc;
+  }
+
   bool isDummyNode() {
     return isEntryNode() || isExitNode();
   }
diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td
index ed848f99b5b4f18967fe779afe249657c95dad1c..62aeb6f32b7216e800e531ea15ee25b58aa8af62 100644
--- a/llvm/include/llvm/IR/IntrinsicsVISC.td
+++ b/llvm/include/llvm/IR/IntrinsicsVISC.td
@@ -18,15 +18,15 @@ let TargetPrefix = "visc" in {
    */
 
   /* Launch intrinsic -
-   * i32 llvm.visc.launch(graphID*, function* , ArgList*);
+   * i8* llvm.visc.launch(function* , ArgList*);
    */
-  def int_visc_launch : Intrinsic<[llvm_i32_ty], [llvm_ptrptr_ty, llvm_ptr_ty,
+  def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
                                   llvm_ptr_ty], []>;
 
   /* Wait intrinsic -
-   * i32 llvm.visc.wait(graphID*, returnVal*);
+   * void llvm.visc.wait(graphID*, returnVal*);
    */
-  def int_visc_wait : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], []>;
+  def int_visc_wait : Intrinsic<[], [llvm_ptr_ty], []>;
 
   /* Create Node intrinsic -
    * i8* llvm.visc.createNode(function*);
diff --git a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index 466bc41e14dee6d7d4ba68aee643495a3ebbb7f2..c236e5094d592b40e723898dc7b1e19be4fc3627 100644
--- a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -49,7 +49,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
   // Intrinsic Instruction has been initialized from this point on.
 
-  Function* F = cast<Function>((II->getOperand(1))->stripPointerCasts());
+  Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts());
   Root = DFInternalNode::Create(II, F);
   BuildGraph(Root, F);
 
diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 3190876910b96862cc55e4e75fde54f8dad3f3c6..05869fa0dfca5b742d828f330e2f17fd21b28aa6 100644
--- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -65,7 +65,7 @@ public:
   virtual void visit(DFInternalNode* N) {
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
-    errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n");
     for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
         e = N->getChildGraph()->end(); i != e; ++i) {
       DFNode* child = *i;
@@ -78,7 +78,7 @@ public:
   }
 
   virtual void visit(DFLeafNode* N) {
-    errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n");
     deleteNode(N);
     //errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n";
   }
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 8c111a64b54b024d584e11b9f1793c391d1cfc20..71de15d38546b00fa46c0dc11888a564ee5bef76 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -15,6 +15,11 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/BuildDFG/BuildDFG.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
 
 using namespace llvm;
 using namespace builddfg;
@@ -56,19 +61,44 @@ private:
   // we already have an index and dim extended function copy or not (i.e.,
   // "Have we visited this function before?")
   ValueMap<Function*, Function*> FMap;
-  DenseMap<DFNode*, CallInst*> CallMap;
+  DenseMap<DFNode*, Value*> OutputMap;
+
+  // VISC Runtime API
+  Module* runtimeModule;
+  Constant* llvm_visc_x86_launch;
+  Constant* llvm_visc_x86_wait;
+  Constant* llvm_visc_ptx_launch;
+  Constant* llvm_visc_ptx_wait;
+  Constant* llvm_visc_ptx_initContext;
+  Constant* llvm_visc_ptx_input_scalar;
+  Constant* llvm_visc_ptx_input_ptr;
+  Constant* llvm_visc_ptx_output_ptr;
+  Constant* llvm_visc_ptx_getOutput;
+  Constant* llvm_visc_ptx_executeNode;
+  FunctionType* AppFuncTy;
+
 
   //Functions
+  void initRuntimeAPI();
+  std::vector<IntrinsicInst*>* getWaitList(Value* LI);
   void addIdxDimArgs(Function* F);
   Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
   Argument* getArgumentFromEnd(Function* F, unsigned offset);
   Argument* getArgumentAt(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+      Instruction* InsertBefore);
+  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+      Instruction* InsertBefore);
+  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+      Instruction* InsertBefore);
   void codeGenLaunch(DFInternalNode* Root);
   void codeGen(DFInternalNode* N);
   void codeGen(DFLeafNode* N);
 public:
   // Constructor
-  CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { }
+  CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {
+    initRuntimeAPI();
+  }
 
   virtual void visit(DFInternalNode* N) {
     // Follows a bottom-up approach for code generation.
@@ -113,37 +143,75 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   return true;
 }
 
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CodeGenTraversal::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+  runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext());
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  llvm_visc_x86_launch = M.getOrInsertFunction("llvm_visc_x86_launch", 
+      runtimeModule->getFunction("llvm_visc_x86_launch")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_x86_launch);
+
+  llvm_visc_x86_wait = M.getOrInsertFunction("llvm_visc_x86_wait",
+      runtimeModule->getFunction("llvm_visc_x86_wait")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_x86_wait);
+ 
+  llvm_visc_ptx_launch = M.getOrInsertFunction("llvm_visc_ptx_launch",
+      runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_launch);
+
+  llvm_visc_ptx_wait = M.getOrInsertFunction("llvm_visc_ptx_wait",
+      runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_wait);
+
+  llvm_visc_ptx_initContext = M.getOrInsertFunction("llvm_visc_ptx_initContext"  ,
+      runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_initContext);
+
+  llvm_visc_ptx_input_scalar = M.getOrInsertFunction("llvm_visc_ptx_input_scalar",
+      runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_input_scalar);
+  
+  llvm_visc_ptx_input_ptr = M.getOrInsertFunction("llvm_visc_ptx_input_ptr",
+      runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_input_ptr);
+
+  llvm_visc_ptx_output_ptr = M.getOrInsertFunction("llvm_visc_ptx_output_ptr",
+      runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_output_ptr);
+
+  llvm_visc_ptx_getOutput = M.getOrInsertFunction("llvm_visc_ptx_getOutput",
+    runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_getOutput);
+
+  llvm_visc_ptx_executeNode = M.getOrInsertFunction("llvm_visc_ptx_executeNode",
+      runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_executeNode);
+
+}
+
 /* Returns vector of all wait instructions
  */
-std::vector<CallInst*>* getWaitList(CallInst* LI) {
-  Value* GraphIDAddr = LI->getArgOperand(0);
-  std::vector<CallInst*>* WaitList = new std::vector<CallInst*>();
+std::vector<IntrinsicInst*>* CodeGenTraversal::getWaitList(Value* GraphID) {
+  std::vector<IntrinsicInst*>* WaitList = new std::vector<IntrinsicInst*>();
   // It must have been loaded from memory somewhere
-  GraphIDAddr->use_begin();
-  for(Value::use_iterator ui = GraphIDAddr->use_begin(),
-      ue = GraphIDAddr->use_end(); ui!=ue; ++ui) {
-    if(LoadInst* LI = dyn_cast<LoadInst>(*ui)) {
-      DEBUG(errs() << *LI << "\n");
-      for(Value::use_iterator i = LI->use_begin(), e = LI->use_end(); i!=e; ++i) {
-        if(CallInst* waitI = dyn_cast<CallInst>(*i)) {
-          DEBUG(errs() << *waitI << "\n");
-          WaitList->push_back(waitI);
-        }
-      }
+  for(Value::use_iterator ui = GraphID->use_begin(),
+      ue = GraphID->use_end(); ui!=ue; ++ui) {
+    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
+      assert(waitI->getIntrinsicID() == Intrinsic::visc_wait
+          && "GraphID can only be used by llvm.visc.wait intrinsic");
+      WaitList->push_back(waitI);
     }
-    // If graphID memory address is used by another launch, then break
-    if(CallInst* CI = dyn_cast<CallInst>(*ui)) {
-      if(LI != CI) {
-        DEBUG(errs()<< "Warning: Overwriting graph ID in memory  -- " << *CI << "\n" << *LI << "\n");
-        break;
-      }
-    }
-    // If graphID in memory is overwritten using store, it's an error
-    if(StoreInst* SI =dyn_cast<StoreInst>(*ui)) {
-      assert(SI->getPointerOperand() == GraphIDAddr
-          && "Error: Do not manually write over graphID in memory!");
+    else {
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
     }
-
   }
   return WaitList;
 }
@@ -258,60 +326,12 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
   // TODO: Place an assert to check if the constant passed bu launch intrinsic
   // as the number of arguments to DFG is same as the number of arguments of the
   // root of DFG
-
+  DEBUG(errs() << "Generating Launch Function\n");
   // Get Launch Instruction
   IntrinsicInst* LI = Root->getInstruction();
+  DEBUG(errs() << "Generating Launch Function\n");
 
-  // Get frequently used types
-  Type* i64Ty = Type::getInt64Ty(LI->getContext());
-  Type* i32Ty = Type::getInt32Ty(LI->getContext());
-  Type* i8Ty = Type::getInt8Ty(LI->getContext());
-  Type* voidTy = Type::getVoidTy(LI->getContext());
-
-  /* Get or Insert visc runtime utilities necessary to run DFG as a separate thread
-   * (1) llvm_visc_launch_x86
-   * (2) llvm_visc_wait_x86
-   */
-  Type *GraphIDTy;
-  std::vector<Type*>Elements;
-  // PThreads use different attribute types for 32-bit and 64-bit machines
-  if(M.getPointerSize() == Module::Pointer64) {
-    GraphIDTy = i64Ty;
-  }
-  else {
-    GraphIDTy = i32Ty;
-  }
-
-  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
-                                                  ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                                  false);
-  // Argument types for llvm_visc_launch_x86
-  Type* ArgTypesLaunch[] = {i8Ty->getPointerTo()->getPointerTo(),
-                            AppFuncTy->getPointerTo(),
-                            i8Ty->getPointerTo()};
-
-  // Construct FunctionType of llvm_visc_launch_x86 call
-  FunctionType* LaunchFuncTy = FunctionType::get(i32Ty,
-                                                 ArrayRef<Type*>(ArgTypesLaunch, 3),
-                                                 false);
-
-  // Construct FunctionType for llvm_visc_wait_x86 call
-  FunctionType* WaitFuncTy = FunctionType::get(i32Ty,
-                                              ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                              false);
-
-  // Get or insert the global declarations for pthread functions
-  Constant* Launch = M.getOrInsertFunction("llvm_visc_launch_x86", LaunchFuncTy);
-  Constant* Wait = M.getOrInsertFunction("llvm_visc_wait_x86", WaitFuncTy);
-
-  // Construct FunctionType for malloc call
-  FunctionType* MallocTy = FunctionType::get(i8Ty->getPointerTo(),
-                                            ArrayRef<Type*>(i64Ty),
-                                            false);
-  // Get or insert the global declaration for malloc call
-  Constant* Malloc = M.getOrInsertFunction("malloc", MallocTy);
-
-  /* Now we have all the necessary global declarations necessary to generate the
+   /* Now we have all the necessary global declarations necessary to generate the
    * Launch function, pointer to which can be passed to pthread utils to execute
    * DFG. The Launch function has just one input: i8* data.addr
    * This is the address of the all the input data that needs to be passed to
@@ -324,10 +344,15 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
    * passed to pthread_exit call.
    */
   // Create Launch Function of type i8*(i8*) which calls the root function 
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
+                                ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                false);
   Function* AppFunc = Function::Create(AppFuncTy,
                                        Root->getFuncPointer()->getLinkage(),
                                        "LaunchDataflowGraph",
                                        &M);
+  DEBUG(errs() << "Generating Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
   Value* data = AppFunc->arg_begin();
   data->setName("data.addr");
@@ -336,6 +361,8 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
   ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
                                       Constant::getNullValue(AppFunc->getReturnType()),
                                       BB);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and 
   Function* RootF_X86 = FMap[Root->getFuncPointer()];
   // Generate a call to RootF_X86 with null parameters for now 
@@ -364,7 +391,7 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
     // as no more arguments left
     // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, i64 1
     // This essentially takes us to the next argument in memory
-    Constant* IntOne = ConstantInt::get(i64Ty, 1);
+    Constant* IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1);
     GetElementPtrInst* GEP = GetElementPtrInst::Create(BI,
                                                       ArrayRef<Value*>(IntOne),
                                                       "nextArg",
@@ -374,9 +401,8 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
     argNum++;
     data = GEP;
   }
-
+  
   // Code for returning the output
-  Constant* SizeOf = ConstantExpr::getSizeOf(CI->getType());
   CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
                                                         CI->getType()->getPointerTo(),
                                                         CI->getName()+".addr",
@@ -387,21 +413,20 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
   DEBUG(errs() << *AppFunc << "\n");
  
   // Substitute launch intrinsic main
-  Value* LaunchInstArgs[] = {LI->getArgOperand(0),
-                            AppFunc,
-                            LI->getArgOperand(2)};
-  CallInst* LaunchInst = CallInst::Create(Launch,
-                                          ArrayRef<Value*>(LaunchInstArgs,3),
-                                          "", LI);
+  Value* LaunchInstArgs[] = {AppFunc,
+                            LI->getArgOperand(1)};
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
   //ReplaceInstWithInst(LI, LaunchInst);
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  std::vector<CallInst*>* WaitList = getWaitList(LaunchInst);
+  std::vector<IntrinsicInst*>* WaitList = getWaitList(LI);
   for(unsigned i=0; i < WaitList->size(); ++i) {
-    CallInst* waitI = WaitList->at(i);
-    CallInst* waitI_X86 = CallInst::Create(Wait,
-                                          ArrayRef<Value*>(waitI->getArgOperand(0)),
+    IntrinsicInst* waitI = WaitList->at(i);
+    CallInst* waitI_X86 = CallInst::Create(llvm_visc_x86_wait,
+                                          ArrayRef<Value*>(LaunchInst),
                                           "");
     ReplaceInstWithInst(waitI, waitI_X86);
     DEBUG(errs() << *waitI_X86 << "\n");
@@ -409,6 +434,171 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
 
 }
 
+Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+
+  // Find the incoming edge at the requested input port
+  DFEdge* E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode* SrcDF = E->getSourceDF();
+
+  // If Source DFNode is a dummyNode, edge is from parent. Get the
+  // argument from argument list of this internal node
+  Value* inputVal;
+  if(SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_X86, i);
+    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+  }
+  else {
+    // edge is from a sibling
+    // Check - code should already be generated for this source dfnode
+    assert(OutputMap.count(SrcDF)
+           && "Source node call not found. Dependency violation!");
+
+    // Find CallInst associated with the Source DFNode using FMap
+    Value* CI = OutputMap[SrcDF];
+
+    // Extract element at source position from this call instruction
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                           "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+void CodeGenTraversal::invokeChild_X86(DFNode* C, Function* F_X86,
+    ValueToValueMapTy &VMap,Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+  Function* CF_X86 = C->getGenFunc();
+  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
+  assert(CF_X86 != NULL
+      && "Found leaf node for which code generation has not happened yet!");
+
+  std::vector<Value*> Args;
+  // Create argument list to pass to call instruction
+  // First find the correct values using the edges
+  // The remaing six values are inserted as constants for now.
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_X86, IB));
+  }
+
+  Value* I32Zero = ConstantInt::get(Type::getInt32Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
+    Args.push_back(I32Zero);
+
+  // Call the F_X86 function associated with this node
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
+  DEBUG(errs() << *CI << "\n");
+  OutputMap[C] = CI;
+
+  // Find num of dimensions this node is replicated in.
+  // Based on number of dimensions, insert loop instructions
+  std::string varNames[3] = {"x", "y", "z"};
+  for(unsigned j=0; j < C->getNumOfDim(); j++) {
+    Value* indexLimit;
+    // Limit can either be a constant or an arguement of the internal node.
+    // In case of constant we can use that constant value directly in the
+    // new F_X86 function. In case of an argument, we need to get the mapped
+    // value using VMap
+    if(isa<Constant>(C->getDimLimits()[j]))
+      indexLimit = C->getDimLimits()[j];
+    else
+      indexLimit = VMap[C->getDimLimits()[j]];
+    assert(indexLimit && "Invalid dimension limit!");
+    // Insert loop
+    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    unsigned numArgs = CI->getNumArgOperands();
+    // Insert index variable and limit arguments
+    CI->setArgOperand(numArgs-6+j, indexVar);
+    CI->setArgOperand(numArgs-3+j, indexLimit);
+  }
+
+}
+
+void CodeGenTraversal::invokeChild_PTX(DFNode* C, Function* F_X86,
+    ValueToValueMapTy &VMap, Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+  //FIXME: A way to check if PTX code has been generated for this child node
+  /*assert(FMap.count(CF)
+         && "Found leaf node for which code generation has not happened yet!");
+  */
+  //assert(C->getTag() == DFNode::PTX && "Cannot generate GPU call for non PTX nodes");
+
+  // Initialize context
+  CallInst::Create(llvm_visc_ptx_initContext, None, "", IB);
+
+  // Initialize command queue
+  // Filename = <DFNode function name>.nvptx.ll
+  Twine file = CF->getName() + ".nvptx.ll";
+  DEBUG(errs() << file << "\n");
+  Constant* filename = ConstantDataArray::get(M.getContext(),
+        ArrayRef<uint8_t>((uint8_t*)file.str().c_str(), file.str().length()));
+
+  CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
+                                      ArrayRef<Value*>(filename),
+                                      "graph"+CF->getName(),
+                                      IB);
+
+  // Iterate over the required input edges of the node and use the visc-rt API
+  // to set inputs
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+
+    Value* inputVal = getInValueAt(C, i, F_X86, IB);
+    // input value has been obtained.
+    // Check if input is a scalar value or a pointer operand
+    // For scalar values such as int, float, etc. the size is simply the size of
+    // type on target machine, but for pointers, the size of data would be the
+    // next integer argument
+    Value* inputSize;
+    if(inputVal->getType()->isPointerTy()) {
+      // Pointer Input
+      inputSize = getInValueAt(C, i+1, F_X86, IB);
+      assert(inputSize->getType()->isIntegerTy()
+          && "Pointer type input must always be followed by size (integer type)");
+    }
+    else { // Scalar Input
+      inputSize = ConstantExpr::getSizeOf(inputVal->getType());
+    }
+
+    Value* setInputArgs[] = {GraphID,
+                            inputVal,
+                            ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                            inputSize
+                            };
+    CallInst::Create(llvm_visc_ptx_input_ptr,
+          ArrayRef<Value*>(setInputArgs, 4), "", IB);
+
+  }
+  // Setup output
+  // FIXME: Note - There is a tricky question. In X86 we do not need to care
+  // about pointer inputs which modify data in memory implicitly (without
+  // showing it as output). There is no extra cost needed to handle such inputs
+  // For PTX, we need to read back such data from device memory to host memory.
+  // The cost is huge and hence we need to differentiate between readonly
+  // pointer inputs vs read/write pointer inputs. Currently supporting only a
+  // simple model in which all input edges are readonly and output is
+  // writeonly.
+  StructType* OutputTy = C->getOutputType();
+  for(unsigned i=0; OutputTy->getNumElements(); i++) {
+    Type* elemTy = OutputTy->getElementType(i);
+  }
+
+  // Enqueue kernel
+  // Read Output
+  // return output
+  // free data structures
+
+}
+
 void CodeGenTraversal::codeGen(DFInternalNode* N) {
   Function* F = N->getFuncPointer();
 
@@ -443,7 +633,10 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
                                       UndefValue::get(F_X86->getReturnType()), BB);
 
   //Add old func: new func pair to the FMap
+  // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap
+  // is required
   FMap[F] = F_X86;
+  N->setGenFunc(F_X86, DFNode::X86);
 
   // Add Index and Dim arguments except for the root node
   if(!N->isRoot())
@@ -459,86 +652,12 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     if (C->isDummyNode())
       continue;
 
-    Function* CF = C->getFuncPointer();
-
-    assert(FMap.count(CF)
-           && "Found leaf node for which code generation has not happened yet!");
-    Function* CF_X86 = FMap[CF];
-    std::vector<Value*> Args;
-    // Create argument list to pass to call instruction
-    // First find the correct values using the edges
-    // The remaing six values are inserted as constants for now.
-    for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
-
-      // TODO: Assumption is that each input port of a node has just one
-      // incoming edge. May change later on.
-
-      // Find the incoming edge at the requested input port
-      DFEdge* E = C->getInDFEdgeAt(i);
-      assert(E && "No incoming edge or binding for input element!");
-      // Find the Source DFNode associated with the incoming edge
-      DFNode* SrcDF = E->getSourceDF();
-
-      // If Source DFNode is a dummyNode, edge is from parent. Get the
-      // argument from argument list of this internal node
-      Value* inputVal;
-      if(SrcDF->isEntryNode()) {
-        inputVal = getArgumentAt(F_X86, i);
-        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-      }
-      else {
-        // edge is from a sibling
-        // Check - code should already be generated for this source dfnode
-        assert(CallMap.count(SrcDF)
-               && "Source node call not found. Dependency violation!");
-
-        // Find CallInst associated with the Source DFNode using FMap
-        CallInst* CI = CallMap[SrcDF];
-
-        // Extract element at source position from this call instruction
-        std::vector<unsigned> IndexList;
-        IndexList.push_back(E->getSourcePosition());
-        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                               "", RI);
-        inputVal = EI;
-      }
-      // input value has been obtained.
-      Args.push_back(inputVal);
-    }
-
-    Value* I32Zero = ConstantInt::get(Type::getInt32Ty(F_X86->getContext()), 0);
-    for(unsigned j=0; j<6; j++)
-      Args.push_back(I32Zero);
-
-    // Call the F_X86 function associated with this node
-    CallInst* CI = CallInst::Create(CF_X86, Args,
-                                    CF_X86->getName()+"_output",
-                                    RI);
-    DEBUG(errs() << *CI << "\n");
-    CallMap[C] = CI;
-
-    // Find num of dimensions this node is replicated in.
-    // Based on number of dimensions, insert loop instructions
-    std::string varNames[3] = {"x", "y", "z"};
-    for(unsigned j=0; j < C->getNumOfDim(); j++) {
-      Value* indexLimit;
-      // Limit can either be a constant or an arguement of the internal node.
-      // In case of constant we can use that constant value directly in the
-      // new F_X86 function. In case of an argument, we need to get the mapped
-      // value using VMap
-      if(isa<Constant>(C->getDimLimits()[j]))
-        indexLimit = C->getDimLimits()[j];
-      else
-        indexLimit = VMap[C->getDimLimits()[j]];
-      assert(indexLimit && "Invalid dimension limit!");
-      // Insert loop
-      Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
-      unsigned numArgs = CI->getNumArgOperands();
-      // Insert index variable and limit arguments
-      CI->setArgOperand(numArgs-6+j, indexVar);
-      CI->setArgOperand(numArgs-3+j, indexLimit);
-    }
+    // Check if Child Node has PTX tag or X86 tag
+    invokeChild_PTX(C, F_X86, VMap, RI);
+    if (C->getTag() == DFNode::PTX) 
+      invokeChild_PTX(C, F_X86, VMap, RI);
+    else 
+      invokeChild_X86(C, F_X86, VMap, RI);
   }
 
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
@@ -550,6 +669,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
   Value *retVal = UndefValue::get(F_X86->getReturnType());
   // Find all the input edges to exit node
   for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+    DEBUG(errs() << "Output Edge " << i << "\n");
     // Find the incoming edge at the requested input port
     DFEdge* E = C->getInDFEdgeAt(i);
 
@@ -557,6 +677,8 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     // Find the Source DFNode associated with the incoming edge
     DFNode* SrcDF = E->getSourceDF();
 
+    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+
     // If Source DFNode is a dummyNode, edge is from parent. Get the
     // argument from argument list of this internal node
     Value* inputVal;
@@ -567,11 +689,11 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     else {
       // edge is from a internal node
       // Check - code should already be generated for this source dfnode
-      assert(CallMap.count(SrcDF)
+      assert(OutputMap.count(SrcDF)
              && "Source node call not found. Dependency violation!");
 
-      // Find CallInst associated with the Source DFNode using FMap
-      CallInst* CI = CallMap[SrcDF];
+      // Find Output Value associated with the Source DFNode using OutputMap
+      Value* CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
       std::vector<unsigned> IndexList;
@@ -585,6 +707,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     IdxList.push_back(i);
     retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
   }
+  DEBUG(errs() << "Extracted all\n");
   retVal->setName("output");
   ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
   ReplaceInstWithInst(RI, newRI);
@@ -628,7 +751,10 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
   // Insert the cloned function into the module
   M.getFunctionList().push_back(F_X86);
   // Add old func: new func pair to the FMap
+  // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap
+  // is required
   FMap[F] = F_X86;
+  N->setGenFunc(F_X86, DFNode::X86);
 
   // Add the new argument to the argument list
   addIdxDimArgs(F_X86);
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 9080a31b4a4a39aaea7ef4dffac4f055a24ad48d..4b05c8178bfedd860728ebdb97546b398e4fa33c 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -1,6 +1,7 @@
 #include <pthread.h>
 #include <cstdlib>
 #include <cstdio>
+#include <string>
 #include <CL/cl.h>
 
 typedef struct {
@@ -14,89 +15,175 @@ typedef struct {
   cl_kernel clKernel;
 } DFNodeContext_PTX;
 
+cl_context globalGPUContext;
+
+static inline void checkErr(cl_int err, cl_int success, const char * name) {
+  if (err != success) {
+    printf("ERROR: %s\n", name);
+    exit(EXIT_FAILURE);
+  }
+}
+
 extern "C"
-__int32_t llvm_visc_launch_x86(void** graphID, void* (*rootFunc)(void*), void* arguments) {
+void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) {
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *) malloc(sizeof(DFNodeContext_X86));
-  *graphID = Context;
-  return pthread_create(&Context->threadID, NULL, rootFunc, arguments);
+  int err;
+  if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0)
+    printf("Failed to create pthread. Error code = %d\n", err);
+  return Context;
 }
 
 extern "C"
-__int32_t llvm_visc_wait_x86(void* graphID) {
+void llvm_visc_x86_wait(void* graphID) {
   DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  return pthread_join(Context->threadID, NULL);
+  pthread_join(Context->threadID, NULL);
 }
 
-static inline void checkErr(cl_int err, cl_int success, const char * name) {
-    if (err != success) {
-          printf("ERROR: %s\n", name);
-              exit(EXIT_FAILURE);
-                }
+extern "C"
+void* llvm_visc_ptx_initContext() {
+  cl_uint numPlatforms;
+  cl_int errcode;
+  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
+  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
+
+  // now get all the platform IDs
+  cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
+  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
+
+  for(unsigned i=0; i < numPlatforms; i++) {
+    char buffer[10240];
+    printf("  -- %d --\n", i);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
+    printf("  PROFILE = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
+    printf("  VERSION = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
+    printf("  NAME = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
+    printf("  VENDOR = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+    printf("  EXTENSIONS = %s\n", buffer);
+  }
+  // set platform property - just pick the first one
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (long) platforms[0],
+                                        0};
+  globalGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
+                                         NULL, NULL, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
+  return globalGPUContext;
+}
+
+extern "C"
+void llvm_visc_ptx_clearContext() {
+  clReleaseContext(globalGPUContext);
 }
 
+extern "C"
+void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
+  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+}
+
+extern "C"
+void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode;
+  cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE |
+      CL_MEM_COPY_HOST_PTR, size, input, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
+  errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
+  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+  return d_input;
+}
+
+extern "C"
+void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode;
+  cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE,
+      size, NULL, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
+  errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
+  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+  return d_output;
+}
+
+extern "C"
+void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
+  void* h_output = malloc(size);
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
+                                h_output, 0, NULL, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to read output");
+  return h_output;
+}
+
+extern "C"
+void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_event* event;
+  cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
+      Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event);
+  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
+  return event;
+}
+
+
 //////////////////////////////////////////////////////////////////////////////
 //! Loads a Program binary file.
 //!
 //! @return the source string if succeeded, 0 otherwise
-//! @param cFilename        program filename
+//! @param Filename        program filename
 //! @param szFinalLength    returned length of the code string
 //////////////////////////////////////////////////////////////////////////////
-static char* LoadProgSource(const char* cFilename, size_t* szFinalLength)
+static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
 {
-    // locals
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0)
-        {
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0)
-        {
-            return NULL;
-        }
-    #endif
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END);
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET);
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + 1);
-    if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength;
-    }
-    cSourceString[szSourceLength] = '\0';
-
-    return cSourceString;
+  // locals
+  FILE* pFileStream = NULL;
+  size_t szSourceLength;
+
+  // open the OpenCL source code file
+  pFileStream = fopen(Filename, "rb");
+  if(pFileStream == 0)
+  {
+    return NULL;
+  }
+
+  // get the length of the source code
+  fseek(pFileStream, 0, SEEK_END);
+  szSourceLength = ftell(pFileStream);
+  fseek(pFileStream, 0, SEEK_SET);
+
+  // allocate a buffer for the source code string and read it in
+  char* cSourceString = (char *)malloc(szSourceLength + 1);
+  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
+  {
+      fclose(pFileStream);
+      free(cSourceString);
+      return 0;
+  }
+
+  // close the file and return the total length of the combined (preamble + source) string
+  fclose(pFileStream);
+  if(szFinalLength != 0)
+  {
+      *szFinalLength = szSourceLength;
+  }
+  cSourceString[szSourceLength] = '\0';
+
+  return cSourceString;
 }
 
 extern "C"
-__int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* arguments) {
+void* llvm_visc_ptx_launch(const char* Filename) {
   // Initialize OpenCL
 
   // OpenCL specific variables
   DFNodeContext_PTX *Context = (DFNodeContext_PTX *) malloc(sizeof(DFNodeContext_PTX));
   
-  // Return Context pointer as grpahID;
-  *graphID = Context;
-
-
   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;
@@ -112,7 +199,7 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   /* Initialize OpenCL */
   /*****************************************/
   // query the number of platforms
-  cl_uint numPlatforms;
+  /*cl_uint numPlatforms;
   errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
   checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
 
@@ -142,6 +229,9 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   Context->clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
                                          NULL, NULL, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
+  */
+  // For a single context for all kernels
+  Context->clGPUContext = globalGPUContext;
 
   // get the list of GPU devices associated with context
   errcode = clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, 0,
@@ -155,23 +245,13 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   Context->clCommandQue = clCreateCommandQueue(Context->clGPUContext, clDevices[0], 0, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
 
-  /* Application specific code
-  // Setup device memory
-  d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
-                       &errcode);
-  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                       bytes_A, h_A, &errcode);
-  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                       bytes_B, h_B, &errcode);
-  */
-
-  char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &kernelLength);
-  checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+  char *programSource = LoadProgSource(Filename, &kernelLength);
+  checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
   
   cl_int binaryStatus;
   Context->clProgram = clCreateProgramWithBinary(Context->clGPUContext, 1, &clDevices[0],
                                         &kernelLength,
-                                        (const unsigned char **)&clMatrixMul,
+                                        (const unsigned char **)&programSource,
                                         &binaryStatus, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
@@ -184,37 +264,7 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   
 
   // Invoke the callback function to put memory allocations in place
-  rootFunc(graphID);
-
-  /* Application Specific Code
-  // Launch OpenCL kernel
-  size_t localWorkSize[2], globalWorkSize[2];
-
-  int wA = WA;
-  int wC = WC;
-  errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
-  errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
-  errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
-  errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
-  errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
-  checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments");
- 
-  localWorkSize[0] = BLOCK_SIZE;
-  localWorkSize[1] = BLOCK_SIZE;
-  globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
-  globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
- 
-  errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
-                                   globalWorkSize, localWorkSize,
-                                   0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
-  
- 
-  // Retrieve result from device
-  errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
-                                h_C, 0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to read buffer");
-  */ 
+  //rootFunc(Context);
 
   /* App specific code
   // Deallocate memory
@@ -228,28 +278,19 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   */
 
   free(clDevices);
-  free(clMatrixMul);
+  free(programSource);
 
-  /*
-  // Free in wait implementation
-  clReleaseContext(Context->clGPUContext);
-  clReleaseKernel(Context->clKernel);
-  clReleaseProgram(Context->clProgram);
-  */
-
-  return 0;
+  return Context;
 }
 
 
 extern "C" 
-__int32_t llvm_visc_wait_ptx(void* graphID) {
+void llvm_visc_ptx_wait(void* graphID) {
   DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID;
   clFinish(Context->clCommandQue);
 
   // Release
-  clReleaseContext(Context->clGPUContext);
+  // clReleaseContext(Context->clGPUContext);
   clReleaseKernel(Context->clKernel);
   clReleaseProgram(Context->clProgram);
-
-  return 0;
 }
diff --git a/llvm/test/VISC/unitTests/3level.ll b/llvm/test/VISC/unitTests/3level.ll
index c884acf32cafdce42a5b8e219c3a1ab32676afae..72248b3b95d837cbbc6b0671c984b341b7690ba5 100644
--- a/llvm/test/VISC/unitTests/3level.ll
+++ b/llvm/test/VISC/unitTests/3level.ll
@@ -17,10 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+declare i8* @llvm.visc.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.wait(i8*) #0
+declare void @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -42,11 +42,9 @@ entry:
   %1 = bitcast { i32, %rtype }* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast { i32, %rtype }* %in.addr to i8*
-  %graphIDloc = alloca i8*
-  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  %graphID = load i8** %graphIDloc
-  %wait = call i32 @llvm.visc.wait(i8* %graphID)
+  call void @llvm.visc.wait(i8* %graphID)
   %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output1 = extractvalue %rtype %outputstruct, 0