From 667d659d508c88ad6b65473221d1ca713101124c Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Tue, 7 Oct 2014 16:24:05 +0000
Subject: [PATCH] (1) Modified the launch and wait intrinsics to a different
 format. No need to pass the graphID as reference. It is returned by the
 launch functions. Makes it easier to write code (2) BuildDFG support for data
 transfer to PTX nodes (3) ClearDFG bugs fixed (4) visc-rt modified (5) Test
 case 3level.ll modified to match the new launch and wait intrinsic format

M    llvm/test/VISC/unitTests/3level.ll
M    llvm/include/llvm/IR/DFGraph.h
M    llvm/include/llvm/IR/IntrinsicsVISC.td
M    llvm/projects/visc-rt/visc-rt.cpp
M    llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
M    llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
M    llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
---
 llvm/include/llvm/IR/DFGraph.h                |  27 +-
 llvm/include/llvm/IR/IntrinsicsVISC.td        |   8 +-
 llvm/lib/Transforms/BuildDFG/BuildDFG.cpp     |   2 +-
 llvm/lib/Transforms/ClearDFG/ClearDFG.cpp     |   4 +-
 .../Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp  | 476 +++++++++++-------
 llvm/projects/visc-rt/visc-rt.cpp             | 273 +++++-----
 llvm/test/VISC/unitTests/3level.ll            |  10 +-
 7 files changed, 495 insertions(+), 305 deletions(-)

diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index 2d935b3c5f..6093fe9297 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -154,6 +154,11 @@ public:
     InternalNode,
     LeafNode
   };
+  // To identify for which backend code has been generated
+  enum CodeGenTag {
+    X86,
+    PTX
+  };
 
 private:
   typedef std::vector<DFNode*> DFNodeListType;
@@ -162,6 +167,7 @@ private:
   // Important things that make up a Dataflow Node
   IntrinsicInst* II;              ///< Associated IntrinsicInst/Value
   Function* FuncPointer;          ///< Associated Function
+  Function* GenFunc;              ///< Associated Function generated by backend
   DFInternalNode* Parent;         ///< Pointer to parent dataflow Node
   unsigned NumOfDim;              ///< Number of dimensions
   std::vector<Value*> DimLimits;  ///< Number of instances in each dimension
@@ -179,6 +185,8 @@ private:
                                   ///< hierarchy
   int Rank;                       ///< Ordering based on toplogical sort
   const DFNodeKind Kind;          ///< Kind of Node Internal/Leaf
+  CodeGenTag Tag;                 // Code Generated for which backend
+  
 
 public:
   // Iterators
@@ -247,7 +255,7 @@ public:
   DFNodeKind getKind() const {
     return Kind;
   }
-
+  
   DFNode(IntrinsicInst* _II, Function* _FuncPointer, DFInternalNode* _Parent,
          unsigned _NumOfDim, std::vector<Value*> _DimLimits, DFNodeKind _K);
 
@@ -302,6 +310,23 @@ public:
     return Rank;
   }
 
+  void setTag(CodeGenTag T) {
+    Tag = T; 
+  }
+
+  CodeGenTag getTag() const {
+    return Tag; 
+  }
+
+  void setGenFunc(Function* F, CodeGenTag T) {
+    GenFunc = F;
+    Tag = T;
+  }
+
+  Function* getGenFunc() {
+    return GenFunc;
+  }
+
   bool isDummyNode() {
     return isEntryNode() || isExitNode();
   }
diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td
index ed848f99b5..62aeb6f32b 100644
--- a/llvm/include/llvm/IR/IntrinsicsVISC.td
+++ b/llvm/include/llvm/IR/IntrinsicsVISC.td
@@ -18,15 +18,15 @@ let TargetPrefix = "visc" in {
    */
 
   /* Launch intrinsic -
-   * i32 llvm.visc.launch(graphID*, function* , ArgList*);
+   * i8* llvm.visc.launch(function* , ArgList*);
    */
-  def int_visc_launch : Intrinsic<[llvm_i32_ty], [llvm_ptrptr_ty, llvm_ptr_ty,
+  def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty,
                                   llvm_ptr_ty], []>;
 
   /* Wait intrinsic -
-   * i32 llvm.visc.wait(graphID*, returnVal*);
+   * void llvm.visc.wait(graphID*, returnVal*);
    */
-  def int_visc_wait : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], []>;
+  def int_visc_wait : Intrinsic<[], [llvm_ptr_ty], []>;
 
   /* Create Node intrinsic -
    * i8* llvm.visc.createNode(function*);
diff --git a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index 466bc41e14..c236e5094d 100644
--- a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -49,7 +49,7 @@ bool BuildDFG::runOnModule(Module &M) {
 
   // Intrinsic Instruction has been initialized from this point on.
 
-  Function* F = cast<Function>((II->getOperand(1))->stripPointerCasts());
+  Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts());
   Root = DFInternalNode::Create(II, F);
   BuildGraph(Root, F);
 
diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 3190876910..05869fa0df 100644
--- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -65,7 +65,7 @@ public:
   virtual void visit(DFInternalNode* N) {
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
-    errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n");
     for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
         e = N->getChildGraph()->end(); i != e; ++i) {
       DFNode* child = *i;
@@ -78,7 +78,7 @@ public:
   }
 
   virtual void visit(DFLeafNode* N) {
-    errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n");
     deleteNode(N);
     //errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n";
   }
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 8c111a64b5..71de15d385 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -15,6 +15,11 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/BuildDFG/BuildDFG.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
 
 using namespace llvm;
 using namespace builddfg;
@@ -56,19 +61,44 @@ private:
   // we already have an index and dim extended function copy or not (i.e.,
   // "Have we visited this function before?")
   ValueMap<Function*, Function*> FMap;
-  DenseMap<DFNode*, CallInst*> CallMap;
+  DenseMap<DFNode*, Value*> OutputMap;
+
+  // VISC Runtime API
+  Module* runtimeModule;
+  Constant* llvm_visc_x86_launch;
+  Constant* llvm_visc_x86_wait;
+  Constant* llvm_visc_ptx_launch;
+  Constant* llvm_visc_ptx_wait;
+  Constant* llvm_visc_ptx_initContext;
+  Constant* llvm_visc_ptx_input_scalar;
+  Constant* llvm_visc_ptx_input_ptr;
+  Constant* llvm_visc_ptx_output_ptr;
+  Constant* llvm_visc_ptx_getOutput;
+  Constant* llvm_visc_ptx_executeNode;
+  FunctionType* AppFuncTy;
+
 
   //Functions
+  void initRuntimeAPI();
+  std::vector<IntrinsicInst*>* getWaitList(Value* LI);
   void addIdxDimArgs(Function* F);
   Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
   Argument* getArgumentFromEnd(Function* F, unsigned offset);
   Argument* getArgumentAt(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+      Instruction* InsertBefore);
+  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+      Instruction* InsertBefore);
+  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+      Instruction* InsertBefore);
   void codeGenLaunch(DFInternalNode* Root);
   void codeGen(DFInternalNode* N);
   void codeGen(DFLeafNode* N);
 public:
   // Constructor
-  CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { }
+  CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {
+    initRuntimeAPI();
+  }
 
   virtual void visit(DFInternalNode* N) {
     // Follows a bottom-up approach for code generation.
@@ -113,37 +143,75 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   return true;
 }
 
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CodeGenTraversal::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+  runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext());
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  llvm_visc_x86_launch = M.getOrInsertFunction("llvm_visc_x86_launch", 
+      runtimeModule->getFunction("llvm_visc_x86_launch")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_x86_launch);
+
+  llvm_visc_x86_wait = M.getOrInsertFunction("llvm_visc_x86_wait",
+      runtimeModule->getFunction("llvm_visc_x86_wait")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_x86_wait);
+ 
+  llvm_visc_ptx_launch = M.getOrInsertFunction("llvm_visc_ptx_launch",
+      runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_launch);
+
+  llvm_visc_ptx_wait = M.getOrInsertFunction("llvm_visc_ptx_wait",
+      runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_wait);
+
+  llvm_visc_ptx_initContext = M.getOrInsertFunction("llvm_visc_ptx_initContext"  ,
+      runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_initContext);
+
+  llvm_visc_ptx_input_scalar = M.getOrInsertFunction("llvm_visc_ptx_input_scalar",
+      runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_input_scalar);
+  
+  llvm_visc_ptx_input_ptr = M.getOrInsertFunction("llvm_visc_ptx_input_ptr",
+      runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_input_ptr);
+
+  llvm_visc_ptx_output_ptr = M.getOrInsertFunction("llvm_visc_ptx_output_ptr",
+      runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_output_ptr);
+
+  llvm_visc_ptx_getOutput = M.getOrInsertFunction("llvm_visc_ptx_getOutput",
+    runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_getOutput);
+
+  llvm_visc_ptx_executeNode = M.getOrInsertFunction("llvm_visc_ptx_executeNode",
+      runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType());
+  DEBUG(errs() << *llvm_visc_ptx_executeNode);
+
+}
+
 /* Returns vector of all wait instructions
  */
-std::vector<CallInst*>* getWaitList(CallInst* LI) {
-  Value* GraphIDAddr = LI->getArgOperand(0);
-  std::vector<CallInst*>* WaitList = new std::vector<CallInst*>();
+std::vector<IntrinsicInst*>* CodeGenTraversal::getWaitList(Value* GraphID) {
+  std::vector<IntrinsicInst*>* WaitList = new std::vector<IntrinsicInst*>();
   // It must have been loaded from memory somewhere
-  GraphIDAddr->use_begin();
-  for(Value::use_iterator ui = GraphIDAddr->use_begin(),
-      ue = GraphIDAddr->use_end(); ui!=ue; ++ui) {
-    if(LoadInst* LI = dyn_cast<LoadInst>(*ui)) {
-      DEBUG(errs() << *LI << "\n");
-      for(Value::use_iterator i = LI->use_begin(), e = LI->use_end(); i!=e; ++i) {
-        if(CallInst* waitI = dyn_cast<CallInst>(*i)) {
-          DEBUG(errs() << *waitI << "\n");
-          WaitList->push_back(waitI);
-        }
-      }
+  for(Value::use_iterator ui = GraphID->use_begin(),
+      ue = GraphID->use_end(); ui!=ue; ++ui) {
+    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
+      assert(waitI->getIntrinsicID() == Intrinsic::visc_wait
+          && "GraphID can only be used by llvm.visc.wait intrinsic");
+      WaitList->push_back(waitI);
     }
-    // If graphID memory address is used by another launch, then break
-    if(CallInst* CI = dyn_cast<CallInst>(*ui)) {
-      if(LI != CI) {
-        DEBUG(errs()<< "Warning: Overwriting graph ID in memory  -- " << *CI << "\n" << *LI << "\n");
-        break;
-      }
-    }
-    // If graphID in memory is overwritten using store, it's an error
-    if(StoreInst* SI =dyn_cast<StoreInst>(*ui)) {
-      assert(SI->getPointerOperand() == GraphIDAddr
-          && "Error: Do not manually write over graphID in memory!");
+    else {
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
     }
-
   }
   return WaitList;
 }
@@ -258,60 +326,12 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
   // TODO: Place an assert to check if the constant passed bu launch intrinsic
   // as the number of arguments to DFG is same as the number of arguments of the
   // root of DFG
-
+  DEBUG(errs() << "Generating Launch Function\n");
   // Get Launch Instruction
   IntrinsicInst* LI = Root->getInstruction();
+  DEBUG(errs() << "Generating Launch Function\n");
 
-  // Get frequently used types
-  Type* i64Ty = Type::getInt64Ty(LI->getContext());
-  Type* i32Ty = Type::getInt32Ty(LI->getContext());
-  Type* i8Ty = Type::getInt8Ty(LI->getContext());
-  Type* voidTy = Type::getVoidTy(LI->getContext());
-
-  /* Get or Insert visc runtime utilities necessary to run DFG as a separate thread
-   * (1) llvm_visc_launch_x86
-   * (2) llvm_visc_wait_x86
-   */
-  Type *GraphIDTy;
-  std::vector<Type*>Elements;
-  // PThreads use different attribute types for 32-bit and 64-bit machines
-  if(M.getPointerSize() == Module::Pointer64) {
-    GraphIDTy = i64Ty;
-  }
-  else {
-    GraphIDTy = i32Ty;
-  }
-
-  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
-                                                  ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                                  false);
-  // Argument types for llvm_visc_launch_x86
-  Type* ArgTypesLaunch[] = {i8Ty->getPointerTo()->getPointerTo(),
-                            AppFuncTy->getPointerTo(),
-                            i8Ty->getPointerTo()};
-
-  // Construct FunctionType of llvm_visc_launch_x86 call
-  FunctionType* LaunchFuncTy = FunctionType::get(i32Ty,
-                                                 ArrayRef<Type*>(ArgTypesLaunch, 3),
-                                                 false);
-
-  // Construct FunctionType for llvm_visc_wait_x86 call
-  FunctionType* WaitFuncTy = FunctionType::get(i32Ty,
-                                              ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                              false);
-
-  // Get or insert the global declarations for pthread functions
-  Constant* Launch = M.getOrInsertFunction("llvm_visc_launch_x86", LaunchFuncTy);
-  Constant* Wait = M.getOrInsertFunction("llvm_visc_wait_x86", WaitFuncTy);
-
-  // Construct FunctionType for malloc call
-  FunctionType* MallocTy = FunctionType::get(i8Ty->getPointerTo(),
-                                            ArrayRef<Type*>(i64Ty),
-                                            false);
-  // Get or insert the global declaration for malloc call
-  Constant* Malloc = M.getOrInsertFunction("malloc", MallocTy);
-
-  /* Now we have all the necessary global declarations necessary to generate the
+   /* Now we have all the necessary global declarations necessary to generate the
    * Launch function, pointer to which can be passed to pthread utils to execute
    * DFG. The Launch function has just one input: i8* data.addr
    * This is the address of the all the input data that needs to be passed to
@@ -324,10 +344,15 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
    * passed to pthread_exit call.
    */
   // Create Launch Function of type i8*(i8*) which calls the root function 
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
+                                ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                false);
   Function* AppFunc = Function::Create(AppFuncTy,
                                        Root->getFuncPointer()->getLinkage(),
                                        "LaunchDataflowGraph",
                                        &M);
+  DEBUG(errs() << "Generating Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
   Value* data = AppFunc->arg_begin();
   data->setName("data.addr");
@@ -336,6 +361,8 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
   ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
                                       Constant::getNullValue(AppFunc->getReturnType()),
                                       BB);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and 
   Function* RootF_X86 = FMap[Root->getFuncPointer()];
   // Generate a call to RootF_X86 with null parameters for now 
@@ -364,7 +391,7 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
     // as no more arguments left
     // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, i64 1
     // This essentially takes us to the next argument in memory
-    Constant* IntOne = ConstantInt::get(i64Ty, 1);
+    Constant* IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1);
     GetElementPtrInst* GEP = GetElementPtrInst::Create(BI,
                                                       ArrayRef<Value*>(IntOne),
                                                       "nextArg",
@@ -374,9 +401,8 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
     argNum++;
     data = GEP;
   }
-
+  
   // Code for returning the output
-  Constant* SizeOf = ConstantExpr::getSizeOf(CI->getType());
   CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
                                                         CI->getType()->getPointerTo(),
                                                         CI->getName()+".addr",
@@ -387,21 +413,20 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
   DEBUG(errs() << *AppFunc << "\n");
  
   // Substitute launch intrinsic main
-  Value* LaunchInstArgs[] = {LI->getArgOperand(0),
-                            AppFunc,
-                            LI->getArgOperand(2)};
-  CallInst* LaunchInst = CallInst::Create(Launch,
-                                          ArrayRef<Value*>(LaunchInstArgs,3),
-                                          "", LI);
+  Value* LaunchInstArgs[] = {AppFunc,
+                            LI->getArgOperand(1)};
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
   //ReplaceInstWithInst(LI, LaunchInst);
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  std::vector<CallInst*>* WaitList = getWaitList(LaunchInst);
+  std::vector<IntrinsicInst*>* WaitList = getWaitList(LI);
   for(unsigned i=0; i < WaitList->size(); ++i) {
-    CallInst* waitI = WaitList->at(i);
-    CallInst* waitI_X86 = CallInst::Create(Wait,
-                                          ArrayRef<Value*>(waitI->getArgOperand(0)),
+    IntrinsicInst* waitI = WaitList->at(i);
+    CallInst* waitI_X86 = CallInst::Create(llvm_visc_x86_wait,
+                                          ArrayRef<Value*>(LaunchInst),
                                           "");
     ReplaceInstWithInst(waitI, waitI_X86);
     DEBUG(errs() << *waitI_X86 << "\n");
@@ -409,6 +434,171 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
 
 }
 
+Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+  // TODO: Assumption is that each input port of a node has just one
+  // incoming edge. May change later on.
+
+  // Find the incoming edge at the requested input port
+  DFEdge* E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode* SrcDF = E->getSourceDF();
+
+  // If Source DFNode is a dummyNode, edge is from parent. Get the
+  // argument from argument list of this internal node
+  Value* inputVal;
+  if(SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_X86, i);
+    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
+  }
+  else {
+    // edge is from a sibling
+    // Check - code should already be generated for this source dfnode
+    assert(OutputMap.count(SrcDF)
+           && "Source node call not found. Dependency violation!");
+
+    // Find CallInst associated with the Source DFNode using FMap
+    Value* CI = OutputMap[SrcDF];
+
+    // Extract element at source position from this call instruction
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                           "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+void CodeGenTraversal::invokeChild_X86(DFNode* C, Function* F_X86,
+    ValueToValueMapTy &VMap,Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+  Function* CF_X86 = C->getGenFunc();
+  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
+  assert(CF_X86 != NULL
+      && "Found leaf node for which code generation has not happened yet!");
+
+  std::vector<Value*> Args;
+  // Create argument list to pass to call instruction
+  // First find the correct values using the edges
+  // The remaing six values are inserted as constants for now.
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_X86, IB));
+  }
+
+  Value* I32Zero = ConstantInt::get(Type::getInt32Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
+    Args.push_back(I32Zero);
+
+  // Call the F_X86 function associated with this node
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
+  DEBUG(errs() << *CI << "\n");
+  OutputMap[C] = CI;
+
+  // Find num of dimensions this node is replicated in.
+  // Based on number of dimensions, insert loop instructions
+  std::string varNames[3] = {"x", "y", "z"};
+  for(unsigned j=0; j < C->getNumOfDim(); j++) {
+    Value* indexLimit;
+    // Limit can either be a constant or an arguement of the internal node.
+    // In case of constant we can use that constant value directly in the
+    // new F_X86 function. In case of an argument, we need to get the mapped
+    // value using VMap
+    if(isa<Constant>(C->getDimLimits()[j]))
+      indexLimit = C->getDimLimits()[j];
+    else
+      indexLimit = VMap[C->getDimLimits()[j]];
+    assert(indexLimit && "Invalid dimension limit!");
+    // Insert loop
+    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    unsigned numArgs = CI->getNumArgOperands();
+    // Insert index variable and limit arguments
+    CI->setArgOperand(numArgs-6+j, indexVar);
+    CI->setArgOperand(numArgs-3+j, indexLimit);
+  }
+
+}
+
+void CodeGenTraversal::invokeChild_PTX(DFNode* C, Function* F_X86,
+    ValueToValueMapTy &VMap, Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+  //FIXME: A way to check if PTX code has been generated for this child node
+  /*assert(FMap.count(CF)
+         && "Found leaf node for which code generation has not happened yet!");
+  */
+  //assert(C->getTag() == DFNode::PTX && "Cannot generate GPU call for non PTX nodes");
+
+  // Initialize context
+  CallInst::Create(llvm_visc_ptx_initContext, None, "", IB);
+
+  // Initialize command queue
+  // Filename = <DFNode function name>.nvptx.ll
+  Twine file = CF->getName() + ".nvptx.ll";
+  DEBUG(errs() << file << "\n");
+  Constant* filename = ConstantDataArray::get(M.getContext(),
+        ArrayRef<uint8_t>((uint8_t*)file.str().c_str(), file.str().length()));
+
+  CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
+                                      ArrayRef<Value*>(filename),
+                                      "graph"+CF->getName(),
+                                      IB);
+
+  // Iterate over the required input edges of the node and use the visc-rt API
+  // to set inputs
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+
+    Value* inputVal = getInValueAt(C, i, F_X86, IB);
+    // input value has been obtained.
+    // Check if input is a scalar value or a pointer operand
+    // For scalar values such as int, float, etc. the size is simply the size of
+    // type on target machine, but for pointers, the size of data would be the
+    // next integer argument
+    Value* inputSize;
+    if(inputVal->getType()->isPointerTy()) {
+      // Pointer Input
+      inputSize = getInValueAt(C, i+1, F_X86, IB);
+      assert(inputSize->getType()->isIntegerTy()
+          && "Pointer type input must always be followed by size (integer type)");
+    }
+    else { // Scalar Input
+      inputSize = ConstantExpr::getSizeOf(inputVal->getType());
+    }
+
+    Value* setInputArgs[] = {GraphID,
+                            inputVal,
+                            ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                            inputSize
+                            };
+    CallInst::Create(llvm_visc_ptx_input_ptr,
+          ArrayRef<Value*>(setInputArgs, 4), "", IB);
+
+  }
+  // Setup output
+  // FIXME: Note - There is a tricky question. In X86 we do not need to care
+  // about pointer inputs which modify data in memory implicitly (without
+  // showing it as output). There is no extra cost needed to handle such inputs
+  // For PTX, we need to read back such data from device memory to host memory.
+  // The cost is huge and hence we need to differentiate between readonly
+  // pointer inputs vs read/write pointer inputs. Currently supporting only a
+  // simple model in which all input edges are readonly and output is
+  // writeonly.
+  StructType* OutputTy = C->getOutputType();
+  for(unsigned i=0; OutputTy->getNumElements(); i++) {
+    Type* elemTy = OutputTy->getElementType(i);
+  }
+
+  // Enqueue kernel
+  // Read Output
+  // return output
+  // free data structures
+
+}
+
 void CodeGenTraversal::codeGen(DFInternalNode* N) {
   Function* F = N->getFuncPointer();
 
@@ -443,7 +633,10 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
                                       UndefValue::get(F_X86->getReturnType()), BB);
 
   //Add old func: new func pair to the FMap
+  // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap
+  // is required
   FMap[F] = F_X86;
+  N->setGenFunc(F_X86, DFNode::X86);
 
   // Add Index and Dim arguments except for the root node
   if(!N->isRoot())
@@ -459,86 +652,12 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     if (C->isDummyNode())
       continue;
 
-    Function* CF = C->getFuncPointer();
-
-    assert(FMap.count(CF)
-           && "Found leaf node for which code generation has not happened yet!");
-    Function* CF_X86 = FMap[CF];
-    std::vector<Value*> Args;
-    // Create argument list to pass to call instruction
-    // First find the correct values using the edges
-    // The remaing six values are inserted as constants for now.
-    for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
-
-      // TODO: Assumption is that each input port of a node has just one
-      // incoming edge. May change later on.
-
-      // Find the incoming edge at the requested input port
-      DFEdge* E = C->getInDFEdgeAt(i);
-      assert(E && "No incoming edge or binding for input element!");
-      // Find the Source DFNode associated with the incoming edge
-      DFNode* SrcDF = E->getSourceDF();
-
-      // If Source DFNode is a dummyNode, edge is from parent. Get the
-      // argument from argument list of this internal node
-      Value* inputVal;
-      if(SrcDF->isEntryNode()) {
-        inputVal = getArgumentAt(F_X86, i);
-        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-      }
-      else {
-        // edge is from a sibling
-        // Check - code should already be generated for this source dfnode
-        assert(CallMap.count(SrcDF)
-               && "Source node call not found. Dependency violation!");
-
-        // Find CallInst associated with the Source DFNode using FMap
-        CallInst* CI = CallMap[SrcDF];
-
-        // Extract element at source position from this call instruction
-        std::vector<unsigned> IndexList;
-        IndexList.push_back(E->getSourcePosition());
-        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                               "", RI);
-        inputVal = EI;
-      }
-      // input value has been obtained.
-      Args.push_back(inputVal);
-    }
-
-    Value* I32Zero = ConstantInt::get(Type::getInt32Ty(F_X86->getContext()), 0);
-    for(unsigned j=0; j<6; j++)
-      Args.push_back(I32Zero);
-
-    // Call the F_X86 function associated with this node
-    CallInst* CI = CallInst::Create(CF_X86, Args,
-                                    CF_X86->getName()+"_output",
-                                    RI);
-    DEBUG(errs() << *CI << "\n");
-    CallMap[C] = CI;
-
-    // Find num of dimensions this node is replicated in.
-    // Based on number of dimensions, insert loop instructions
-    std::string varNames[3] = {"x", "y", "z"};
-    for(unsigned j=0; j < C->getNumOfDim(); j++) {
-      Value* indexLimit;
-      // Limit can either be a constant or an arguement of the internal node.
-      // In case of constant we can use that constant value directly in the
-      // new F_X86 function. In case of an argument, we need to get the mapped
-      // value using VMap
-      if(isa<Constant>(C->getDimLimits()[j]))
-        indexLimit = C->getDimLimits()[j];
-      else
-        indexLimit = VMap[C->getDimLimits()[j]];
-      assert(indexLimit && "Invalid dimension limit!");
-      // Insert loop
-      Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
-      unsigned numArgs = CI->getNumArgOperands();
-      // Insert index variable and limit arguments
-      CI->setArgOperand(numArgs-6+j, indexVar);
-      CI->setArgOperand(numArgs-3+j, indexLimit);
-    }
+    // Check if Child Node has PTX tag or X86 tag
+    invokeChild_PTX(C, F_X86, VMap, RI);
+    if (C->getTag() == DFNode::PTX) 
+      invokeChild_PTX(C, F_X86, VMap, RI);
+    else 
+      invokeChild_X86(C, F_X86, VMap, RI);
   }
 
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
@@ -550,6 +669,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
   Value *retVal = UndefValue::get(F_X86->getReturnType());
   // Find all the input edges to exit node
   for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+    DEBUG(errs() << "Output Edge " << i << "\n");
     // Find the incoming edge at the requested input port
     DFEdge* E = C->getInDFEdgeAt(i);
 
@@ -557,6 +677,8 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     // Find the Source DFNode associated with the incoming edge
     DFNode* SrcDF = E->getSourceDF();
 
+    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+
     // If Source DFNode is a dummyNode, edge is from parent. Get the
     // argument from argument list of this internal node
     Value* inputVal;
@@ -567,11 +689,11 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     else {
       // edge is from a internal node
       // Check - code should already be generated for this source dfnode
-      assert(CallMap.count(SrcDF)
+      assert(OutputMap.count(SrcDF)
              && "Source node call not found. Dependency violation!");
 
-      // Find CallInst associated with the Source DFNode using FMap
-      CallInst* CI = CallMap[SrcDF];
+      // Find Output Value associated with the Source DFNode using OutputMap
+      Value* CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
       std::vector<unsigned> IndexList;
@@ -585,6 +707,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     IdxList.push_back(i);
     retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
   }
+  DEBUG(errs() << "Extracted all\n");
   retVal->setName("output");
   ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
   ReplaceInstWithInst(RI, newRI);
@@ -628,7 +751,10 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
   // Insert the cloned function into the module
   M.getFunctionList().push_back(F_X86);
   // Add old func: new func pair to the FMap
+  // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap
+  // is required
   FMap[F] = F_X86;
+  N->setGenFunc(F_X86, DFNode::X86);
 
   // Add the new argument to the argument list
   addIdxDimArgs(F_X86);
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 9080a31b4a..4b05c8178b 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -1,6 +1,7 @@
 #include <pthread.h>
 #include <cstdlib>
 #include <cstdio>
+#include <string>
 #include <CL/cl.h>
 
 typedef struct {
@@ -14,89 +15,175 @@ typedef struct {
   cl_kernel clKernel;
 } DFNodeContext_PTX;
 
+cl_context globalGPUContext;
+
+static inline void checkErr(cl_int err, cl_int success, const char * name) {
+  if (err != success) {
+    printf("ERROR: %s\n", name);
+    exit(EXIT_FAILURE);
+  }
+}
+
 extern "C"
-__int32_t llvm_visc_launch_x86(void** graphID, void* (*rootFunc)(void*), void* arguments) {
+void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) {
   DFNodeContext_X86 *Context = (DFNodeContext_X86 *) malloc(sizeof(DFNodeContext_X86));
-  *graphID = Context;
-  return pthread_create(&Context->threadID, NULL, rootFunc, arguments);
+  int err;
+  if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0)
+    printf("Failed to create pthread. Error code = %d\n", err);
+  return Context;
 }
 
 extern "C"
-__int32_t llvm_visc_wait_x86(void* graphID) {
+void llvm_visc_x86_wait(void* graphID) {
   DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  return pthread_join(Context->threadID, NULL);
+  pthread_join(Context->threadID, NULL);
 }
 
-static inline void checkErr(cl_int err, cl_int success, const char * name) {
-    if (err != success) {
-          printf("ERROR: %s\n", name);
-              exit(EXIT_FAILURE);
-                }
+extern "C"
+void* llvm_visc_ptx_initContext() {
+  cl_uint numPlatforms;
+  cl_int errcode;
+  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
+  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
+
+  // now get all the platform IDs
+  cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
+  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
+
+  for(unsigned i=0; i < numPlatforms; i++) {
+    char buffer[10240];
+    printf("  -- %d --\n", i);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
+    printf("  PROFILE = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
+    printf("  VERSION = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
+    printf("  NAME = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
+    printf("  VENDOR = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+    printf("  EXTENSIONS = %s\n", buffer);
+  }
+  // set platform property - just pick the first one
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (long) platforms[0],
+                                        0};
+  globalGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
+                                         NULL, NULL, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
+  return globalGPUContext;
+}
+
+extern "C"
+void llvm_visc_ptx_clearContext() {
+  clReleaseContext(globalGPUContext);
 }
 
+extern "C"
+void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
+  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+}
+
+extern "C"
+void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode;
+  cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE |
+      CL_MEM_COPY_HOST_PTR, size, input, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
+  errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
+  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+  return d_input;
+}
+
+extern "C"
+void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode;
+  cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE,
+      size, NULL, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
+  errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
+  checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
+  return d_output;
+}
+
+extern "C"
+void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
+  void* h_output = malloc(size);
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
+                                h_output, 0, NULL, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to read output");
+  return h_output;
+}
+
+extern "C"
+void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) {
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  cl_event* event;
+  cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
+      Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event);
+  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
+  return event;
+}
+
+
 //////////////////////////////////////////////////////////////////////////////
 //! Loads a Program binary file.
 //!
 //! @return the source string if succeeded, 0 otherwise
-//! @param cFilename        program filename
+//! @param Filename        program filename
 //! @param szFinalLength    returned length of the code string
 //////////////////////////////////////////////////////////////////////////////
-static char* LoadProgSource(const char* cFilename, size_t* szFinalLength)
+static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
 {
-    // locals
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0)
-        {
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0)
-        {
-            return NULL;
-        }
-    #endif
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END);
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET);
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + 1);
-    if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength;
-    }
-    cSourceString[szSourceLength] = '\0';
-
-    return cSourceString;
+  // locals
+  FILE* pFileStream = NULL;
+  size_t szSourceLength;
+
+  // open the OpenCL source code file
+  pFileStream = fopen(Filename, "rb");
+  if(pFileStream == 0)
+  {
+    return NULL;
+  }
+
+  // get the length of the source code
+  fseek(pFileStream, 0, SEEK_END);
+  szSourceLength = ftell(pFileStream);
+  fseek(pFileStream, 0, SEEK_SET);
+
+  // allocate a buffer for the source code string and read it in
+  char* cSourceString = (char *)malloc(szSourceLength + 1);
+  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
+  {
+      fclose(pFileStream);
+      free(cSourceString);
+      return 0;
+  }
+
+  // close the file and return the total length of the combined (preamble + source) string
+  fclose(pFileStream);
+  if(szFinalLength != 0)
+  {
+      *szFinalLength = szSourceLength;
+  }
+  cSourceString[szSourceLength] = '\0';
+
+  return cSourceString;
 }
 
 extern "C"
-__int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* arguments) {
+void* llvm_visc_ptx_launch(const char* Filename) {
   // Initialize OpenCL
 
   // OpenCL specific variables
   DFNodeContext_PTX *Context = (DFNodeContext_PTX *) malloc(sizeof(DFNodeContext_PTX));
   
-  // Return Context pointer as grpahID;
-  *graphID = Context;
-
-
   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;
@@ -112,7 +199,7 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   /* Initialize OpenCL */
   /*****************************************/
   // query the number of platforms
-  cl_uint numPlatforms;
+  /*cl_uint numPlatforms;
   errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
   checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
 
@@ -142,6 +229,9 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   Context->clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
                                          NULL, NULL, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
+  */
+  // For a single context for all kernels
+  Context->clGPUContext = globalGPUContext;
 
   // get the list of GPU devices associated with context
   errcode = clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, 0,
@@ -155,23 +245,13 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   Context->clCommandQue = clCreateCommandQueue(Context->clGPUContext, clDevices[0], 0, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
 
-  /* Application specific code
-  // Setup device memory
-  d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
-                       &errcode);
-  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                       bytes_A, h_A, &errcode);
-  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                       bytes_B, h_B, &errcode);
-  */
-
-  char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &kernelLength);
-  checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+  char *programSource = LoadProgSource(Filename, &kernelLength);
+  checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
   
   cl_int binaryStatus;
   Context->clProgram = clCreateProgramWithBinary(Context->clGPUContext, 1, &clDevices[0],
                                         &kernelLength,
-                                        (const unsigned char **)&clMatrixMul,
+                                        (const unsigned char **)&programSource,
                                         &binaryStatus, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
@@ -184,37 +264,7 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   
 
   // Invoke the callback function to put memory allocations in place
-  rootFunc(graphID);
-
-  /* Application Specific Code
-  // Launch OpenCL kernel
-  size_t localWorkSize[2], globalWorkSize[2];
-
-  int wA = WA;
-  int wC = WC;
-  errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
-  errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
-  errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
-  errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
-  errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
-  checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments");
- 
-  localWorkSize[0] = BLOCK_SIZE;
-  localWorkSize[1] = BLOCK_SIZE;
-  globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
-  globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
- 
-  errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
-                                   globalWorkSize, localWorkSize,
-                                   0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
-  
- 
-  // Retrieve result from device
-  errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
-                                h_C, 0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to read buffer");
-  */ 
+  //rootFunc(Context);
 
   /* App specific code
   // Deallocate memory
@@ -228,28 +278,19 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void*
   */
 
   free(clDevices);
-  free(clMatrixMul);
+  free(programSource);
 
-  /*
-  // Free in wait implementation
-  clReleaseContext(Context->clGPUContext);
-  clReleaseKernel(Context->clKernel);
-  clReleaseProgram(Context->clProgram);
-  */
-
-  return 0;
+  return Context;
 }
 
 
 extern "C" 
-__int32_t llvm_visc_wait_ptx(void* graphID) {
+void llvm_visc_ptx_wait(void* graphID) {
   DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID;
   clFinish(Context->clCommandQue);
 
   // Release
-  clReleaseContext(Context->clGPUContext);
+  // clReleaseContext(Context->clGPUContext);
   clReleaseKernel(Context->clKernel);
   clReleaseProgram(Context->clProgram);
-
-  return 0;
 }
diff --git a/llvm/test/VISC/unitTests/3level.ll b/llvm/test/VISC/unitTests/3level.ll
index c884acf32c..72248b3b95 100644
--- a/llvm/test/VISC/unitTests/3level.ll
+++ b/llvm/test/VISC/unitTests/3level.ll
@@ -17,10 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0
 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0
+declare i8* @llvm.visc.launch(i8*, i8*) #0
 
 ; Function Attrs: nounwind
-declare i32 @llvm.visc.wait(i8*) #0
+declare void @llvm.visc.wait(i8*) #0
 
 ; Function Attrs: nounwind
 declare i8* @llvm.visc.getNode() #0
@@ -42,11 +42,9 @@ entry:
   %1 = bitcast { i32, %rtype }* %in.addr to i32*
   store i32 %conv.i, i32* %1
   %args = bitcast { i32, %rtype }* %in.addr to i8*
-  %graphIDloc = alloca i8*
-  %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
+  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args)
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0
-  %graphID = load i8** %graphIDloc
-  %wait = call i32 @llvm.visc.wait(i8* %graphID)
+  call void @llvm.visc.wait(i8* %graphID)
   %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1
   %outputstruct = load %rtype* %2
   %output1 = extractvalue %rtype %outputstruct, 0
-- 
GitLab