diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 0670018dd8997d18b3d4a397c886d19099b81e37..a37d9c152504c9f4271daceff81ff5be83ec292c 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -17,8 +17,12 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/InstIterator.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/BuildDFG/BuildDFG.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker.h"
+#include "llvm/Support/SourceMgr.h"
 
 #include <sstream>
 
@@ -69,9 +73,28 @@ namespace {
     // we already have an index and dim extended function copy or not (i.e.,
     // "Have we visited this function before?")
     ValueMap<Function*, Function*> FMap; 
+    DenseMap<DFNode*, Value*> OutputMap;
+
+    // VISC Runtime API
+    Module* runtimeModule;
+    Constant* llvm_visc_ptx_launch;
+    Constant* llvm_visc_ptx_wait;
+    Constant* llvm_visc_ptx_initContext;
+    Constant* llvm_visc_ptx_input_scalar;
+    Constant* llvm_visc_ptx_input_ptr;
+    Constant* llvm_visc_ptx_output_ptr;
+    Constant* llvm_visc_ptx_getOutput;
+    Constant* llvm_visc_ptx_executeNode;
+
 
     //Functions
     void transformFunctionToVoid(Function* F);
+    void initRuntimeAPI();
+    void addIdxDimArgs(Function* F);
+    Argument* getArgumentAt(Function* F, unsigned offset);
+    Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                        Instruction* InsertBefore);
+    void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& Kernel);
 
     void codeGen(DFInternalNode* N);
     void codeGen(DFLeafNode* N);
@@ -101,6 +124,358 @@ namespace {
 
   };
 
+  // Initialize the VISC runtime API. This makes it easier to insert these calls
+  void CodeGenTraversal::initRuntimeAPI() {
+    // FIXME: the runtime module path below is absolute and machine-specific.
+    // Load Runtime API Module
+    SMDiagnostic Err;
+    runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext());
+    if(runtimeModule == NULL)
+      errs() << Err.getMessage() << "\n";
+    assert(runtimeModule && "Failed to load visc-rt API module");
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+    // Declare each visc-rt PTX entry point in M with the type it has in the runtime module
+    llvm_visc_ptx_launch = M.getOrInsertFunction("llvm_visc_ptx_launch",
+        runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_launch);
+
+    llvm_visc_ptx_wait = M.getOrInsertFunction("llvm_visc_ptx_wait",
+        runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_wait);
+
+    llvm_visc_ptx_initContext = M.getOrInsertFunction("llvm_visc_ptx_initContext"  ,
+        runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_initContext);
+
+    llvm_visc_ptx_input_scalar = M.getOrInsertFunction("llvm_visc_ptx_input_scalar",
+        runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_input_scalar);
+
+    llvm_visc_ptx_input_ptr = M.getOrInsertFunction("llvm_visc_ptx_input_ptr",
+        runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_input_ptr);
+
+    llvm_visc_ptx_output_ptr = M.getOrInsertFunction("llvm_visc_ptx_output_ptr",
+        runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_output_ptr);
+
+    llvm_visc_ptx_getOutput = M.getOrInsertFunction("llvm_visc_ptx_getOutput",
+      runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_getOutput);
+
+    llvm_visc_ptx_executeNode = M.getOrInsertFunction("llvm_visc_ptx_executeNode",
+        runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_executeNode);
+
+  }
+  void CodeGenTraversal::addIdxDimArgs(Function* F) {
+    // Append six i32 arguments (idx_x..idx_z, dim_x..dim_z) to F's argument list
+    std::string ArgNames[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
+    Type* Int32Ty = Type::getInt32Ty(F->getContext());
+    for (int n = 0; n < 6; ++n)
+      new Argument(Int32Ty, ArgNames[n], F);
+
+    // Collect the type of every argument, including the six just appended
+    std::vector<Type*> ParamTys;
+    Function::const_arg_iterator AI = F->arg_begin();
+    for (Function::const_arg_iterator AE = F->arg_end(); AI != AE; ++AI)
+      ParamTys.push_back(AI->getType());
+
+    // Appending Argument objects does not update the function's type, so
+    // build the matching FunctionType (same return type and vararg-ness) and
+    // a pointer to it in the function's current address space.
+    unsigned AddrSpace = cast<PointerType>(F->getType())->getAddressSpace();
+    FunctionType* NewFTy = FunctionType::get(F->getReturnType(), ParamTys, F->isVarArg());
+    PointerType* NewPTy = PointerType::get(NewFTy, AddrSpace);
+    // Install the new type on F
+    F->mutateType(NewPTy);
+  }
+
+  /* Return the argument of F at position offset (0-based), walking the list */
+  Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) {
+    // offset is unsigned, so only the upper bound needs checking
+    assert(offset < F->getFunctionType()->getNumParams()
+           && "Invalid offset to access arguments!");
+
+    // Step the argument iterator forward offset times
+    Function::arg_iterator i = F->arg_begin(), e = F->arg_end();
+    for(; offset != 0 && i != e; ++i)
+      --offset;
+    Argument* arg = i;
+    DEBUG(errs() << *F);
+    DEBUG(errs() << *arg <<"\n");
+    return arg;
+  }
+
+
+  Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                                        Instruction* InsertBefore) {
+    // Returns the value feeding input port i of Child, emitting any needed
+    // extractvalue before InsertBefore. TODO: Assumption is that each input
+    // port of a node has just one incoming edge. May change later on.
+    // Find the incoming edge at the requested input port
+    DFEdge* E = Child->getInDFEdgeAt(i);
+    assert(E && "No incoming edge or binding for input element!");
+    // Find the Source DFNode associated with the incoming edge
+    DFNode* SrcDF = E->getSourceDF();
+
+    // If Source DFNode is the entry dummy node, the edge comes from the parent:
+    // read the parent argument at the edge's source position (not at port i)
+    Value* inputVal;
+    if(SrcDF->isEntryNode()) {
+      inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
+      DEBUG(errs() << "Argument "<< E->getSourcePosition() << " = "  << *inputVal << "\n");
+    }
+    else {
+      // edge is from a sibling
+      // Check - code should already be generated for this source dfnode
+      assert(OutputMap.count(SrcDF)
+             && "Source node call not found. Dependency violation!");
+
+      // Find the output value recorded for the Source DFNode in OutputMap
+      Value* CI = OutputMap[SrcDF];
+
+      // Extract element at source position from this output value
+      std::vector<unsigned> IndexList;
+      IndexList.push_back(E->getSourcePosition());
+      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                             "", InsertBefore);
+      inputVal = EI;
+    }
+    return inputVal;
+  }
+
+  // Generate Code to call the kernel
+  // The plan is to replace the internal node with a leaf node. This method is
+  // used to generate a function to associate with this leaf node. The function
+  // is responsible for all the memory allocation/transfer and invoking the
+  // kernel call on the device
+  void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& KernelName) {
+    // This node must not have a generated function yet; otherwise the host
+    // wrapper would be emitted for it a second time.
+    assert(N->getGenFunc() == NULL && "Code already generated for this node");
+
+    Function* F = N->getFuncPointer();
+
+
+    // Create a clone of F with no instructions. Only the type is the same as F
+    // without the extra arguments.
+    Function* F_X86;
+
+    // Clone the function, if we are seeing this function for the first time. We
+    // only need a clone in terms of type.
+    ValueToValueMapTy VMap;
+
+    // Create new function with the same type
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+    // Loop over the arguments, copying the names of arguments over.
+    Function::arg_iterator dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+      // Add mapping to VMap and increment dest iterator
+      VMap[i] = dest_iterator++;
+    }
+
+    // Add a basic block to this empty function
+    BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
+    ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
+
+    // Record F_X86 as the generated X86 function of this node
+    N->setGenFunc(F_X86, DFNode::X86);
+
+    // FIXME: Adding Index and Dim arguments are probably not required except
+    // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
+    // have those arguments)
+
+    // Add Index and Dim arguments except for the root node
+    if(!N->isRoot())
+      addIdxDimArgs(F_X86);
+
+    // Sort children in topological order before code generation for kernel call
+    N->getChildGraph()->sortChildren();
+
+    // The DFNode N has the property that it has only one child (leaving Entry
+    // and Exit dummy nodes). This child is the PTX kernel. This simplifies code
+    // generation for kernel calls significantly. All the inputs to this child
+    // node would either be constants or from the parent node N.
+
+    assert(N->getChildGraph()->size() == 3
+        && "Node expected to have just one non-dummy node!");
+
+    DFNode* C = NULL;
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      C = *ci;
+      // Skip dummy node call
+      if (!C->isDummyNode())
+        break;
+    }
+
+    assert(C && C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+
+    Function* CF = C->getFuncPointer();
+    // Initialize context
+    CallInst::Create(llvm_visc_ptx_initContext, None, "", RI);
+
+    // Initialize command queue
+    Constant* file = ConstantDataArray::get(M.getContext(),
+          ArrayRef<uint8_t>((uint8_t*)FileName.str().c_str(), FileName.str().length()));
+
+    Constant* kernel = ConstantDataArray::get(M.getContext(),
+          ArrayRef<uint8_t>((uint8_t*)KernelName.str().c_str(), KernelName.str().length()));
+
+    Value* LaunchInstArgs[] = {file, kernel};
+    CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
+                                        ArrayRef<Value*>(LaunchInstArgs, 2),
+                                        "graph"+CF->getName(),
+                                        RI);
+    // Iterate over the required input edges of the node and use the visc-rt API
+    // to set inputs
+    for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+
+      Value* inputVal = getInValueAt(C, i, F_X86, RI);
+      // input value has been obtained.
+      // Check if input is a scalar value or a pointer operand
+      // For scalar values such as int, float, etc. the size is simply the size of
+      // type on target machine, but for pointers, the size of data would be the
+      // next integer argument
+      if(inputVal->getType()->isPointerTy()) {
+        // Pointer Input
+        Value* inputSize = getInValueAt(C, i+1, F_X86, RI);
+        assert(inputSize->getType()->isIntegerTy()
+            && "Pointer type input must always be followed by size (integer type)");
+        Value* setInputArgs[] = {GraphID,
+                                inputVal,
+                                ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                inputSize
+                                };
+        CallInst::Create(llvm_visc_ptx_input_ptr,
+              ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+      else { // Scalar Input
+        Value* setInputArgs[] = {GraphID,
+                                inputVal,
+                                ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                ConstantExpr::getSizeOf(inputVal->getType())
+                                };
+        CallInst::Create(llvm_visc_ptx_input_scalar,
+              ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+
+    }
+    // Setup output
+    // FIXME: Note - There is a tricky question. In X86 we do not need to care
+    // about pointer inputs which modify data in memory implicitly (without
+    // showing it as output). There is no extra cost needed to handle such inputs
+    // For PTX, we need to read back such data from device memory to host memory.
+    // The cost is huge and hence we need to differentiate between readonly
+    // pointer inputs vs read/write pointer inputs. Currently supporting only a
+    // simple model in which all input edges are readonly and output is
+    // writeonly.
+
+    // Set output
+    StructType* OutputTy = C->getOutputType();
+    unsigned outputIndex = CF->getFunctionType()->getNumParams();
+    Value* outputSize = ConstantExpr::getSizeOf(OutputTy);
+    Value* setOutputArgs[] = {GraphID,
+                              ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                              outputSize};
+
+    CallInst* d_Output = CallInst::Create(llvm_visc_ptx_output_ptr,
+                                        ArrayRef<Value*>(setOutputArgs,3),
+                                        "d_output."+CF->getName(),
+                                        RI);
+
+    // Enqueue kernel
+    // Need work dim, localworksize, globalworksize
+    // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, local work
+    // size and global work size
+    Value* ExecNodeArgs[] = {GraphID,
+                            ConstantInt::get(Type::getInt32Ty(M.getContext()), C->getNumOfDim()),
+                            Constant::getNullValue(Type::getInt64PtrTy(M.getContext())),
+                            Constant::getNullValue(Type::getInt64PtrTy(M.getContext()))
+                            };
+    CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode,
+                                       ArrayRef<Value*>(ExecNodeArgs, 4),
+                                       "event."+CF->getName(),
+                                       RI);
+    // Wait for Kernel to Finish
+    CallInst::Create(llvm_visc_ptx_wait,
+                     ArrayRef<Value*>(GraphID),
+                     "",
+                     RI);
+    // Read Output
+    Value* GetOutputArgs[] = {GraphID,
+                              d_Output,
+                              outputSize};
+    CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput,
+                                          ArrayRef<Value*>(GetOutputArgs, 3),
+                                          "h_output."+CF->getName(),
+                                          RI);
+    // Prepare output: view the host buffer as a pointer to the kernel's return type and load it
+    CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI);
+    LoadInst* KernelOutput = new LoadInst(BI, "output."+CF->getName(), RI);
+    OutputMap[C] = KernelOutput;
+
+    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+    // Generate code for output bindings
+    // Get Exit node
+    C = N->getChildGraph()->getExit();
+    // Get OutputType of this node
+    StructType* OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_X86->getReturnType());
+    // Find all the input edges to exit node
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge* E = C->getInDFEdgeAt(i);
+
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode* SrcDF = E->getSourceDF();
+
+      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+
+      // If Source DFNode is the entry dummy node, the edge comes from the parent:
+      // read the parent argument at the edge's source position (not at port i)
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_X86, E->getSourcePosition());
+        DEBUG(errs() << "Argument "<< E->getSourcePosition() << " = "  << *inputVal << "\n");
+      }
+      else {
+        // edge is from a internal node
+        // Check - code should already be generated for this source dfnode
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
+
+        // Find Output Value associated with the Source DFNode using OutputMap
+        Value* CI = OutputMap[SrcDF];
+
+        // Extract element at source position from this output value
+        std::vector<unsigned> IndexList;
+        IndexList.push_back(E->getSourcePosition());
+        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                               "",RI);
+        inputVal = EI;
+      }
+      std::vector<unsigned> IdxList;
+      IdxList.push_back(i);
+      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    }
+    DEBUG(errs() << "Extracted all\n");
+    retVal->setName("output");
+    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReplaceInstWithInst(RI, newRI);
+  }
+
+
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
   void CodeGenTraversal::codeGen(DFInternalNode* N) {
@@ -144,7 +519,7 @@ namespace {
     std::vector<IntrinsicInst *> IItoRemove;
     BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
 
-    // Get the function associated with the dataflow node      
+    // Get the function associated with the dataflow node
     Function *F = N->getFuncPointer();
 
     // Look up if we have visited this function before. If we have, then just
@@ -381,7 +756,7 @@ namespace {
       } else {
         //TODO: how to handle address space qualifiers in load/store
       }
-        
+
     }
 
     // We need to do this explicitly: DCE pass will not remove them because we
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 71de15d38546b00fa46c0dc11888a564ee5bef76..8b61d42af9cf2987db155cb88f246315519dc597 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -67,15 +67,6 @@ private:
   Module* runtimeModule;
   Constant* llvm_visc_x86_launch;
   Constant* llvm_visc_x86_wait;
-  Constant* llvm_visc_ptx_launch;
-  Constant* llvm_visc_ptx_wait;
-  Constant* llvm_visc_ptx_initContext;
-  Constant* llvm_visc_ptx_input_scalar;
-  Constant* llvm_visc_ptx_input_ptr;
-  Constant* llvm_visc_ptx_output_ptr;
-  Constant* llvm_visc_ptx_getOutput;
-  Constant* llvm_visc_ptx_executeNode;
-  FunctionType* AppFuncTy;
 
 
   //Functions
@@ -162,38 +153,6 @@ void CodeGenTraversal::initRuntimeAPI() {
   llvm_visc_x86_wait = M.getOrInsertFunction("llvm_visc_x86_wait",
       runtimeModule->getFunction("llvm_visc_x86_wait")->getFunctionType());
   DEBUG(errs() << *llvm_visc_x86_wait);
- 
-  llvm_visc_ptx_launch = M.getOrInsertFunction("llvm_visc_ptx_launch",
-      runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_launch);
-
-  llvm_visc_ptx_wait = M.getOrInsertFunction("llvm_visc_ptx_wait",
-      runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_wait);
-
-  llvm_visc_ptx_initContext = M.getOrInsertFunction("llvm_visc_ptx_initContext"  ,
-      runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_initContext);
-
-  llvm_visc_ptx_input_scalar = M.getOrInsertFunction("llvm_visc_ptx_input_scalar",
-      runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_input_scalar);
-  
-  llvm_visc_ptx_input_ptr = M.getOrInsertFunction("llvm_visc_ptx_input_ptr",
-      runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_input_ptr);
-
-  llvm_visc_ptx_output_ptr = M.getOrInsertFunction("llvm_visc_ptx_output_ptr",
-      runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_output_ptr);
-
-  llvm_visc_ptx_getOutput = M.getOrInsertFunction("llvm_visc_ptx_getOutput",
-    runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_getOutput);
-
-  llvm_visc_ptx_executeNode = M.getOrInsertFunction("llvm_visc_ptx_executeNode",
-      runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType());
-  DEBUG(errs() << *llvm_visc_ptx_executeNode);
 
 }
 
@@ -345,7 +304,7 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) {
    */
   // Create Launch Function of type i8*(i8*) which calls the root function 
   Type* i8Ty = Type::getInt8Ty(M.getContext());
-  AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
+  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
                                 ArrayRef<Type*>(i8Ty->getPointerTo()),
                                 false);
   Function* AppFunc = Function::Create(AppFuncTy,
@@ -523,82 +482,6 @@ void CodeGenTraversal::invokeChild_X86(DFNode* C, Function* F_X86,
 
 }
 
-void CodeGenTraversal::invokeChild_PTX(DFNode* C, Function* F_X86,
-    ValueToValueMapTy &VMap, Instruction* IB) {
-  Function* CF = C->getFuncPointer();
-
-  //FIXME: A way to check if PTX code has been generated for this child node
-  /*assert(FMap.count(CF)
-         && "Found leaf node for which code generation has not happened yet!");
-  */
-  //assert(C->getTag() == DFNode::PTX && "Cannot generate GPU call for non PTX nodes");
-
-  // Initialize context
-  CallInst::Create(llvm_visc_ptx_initContext, None, "", IB);
-
-  // Initialize command queue
-  // Filename = <DFNode function name>.nvptx.ll
-  Twine file = CF->getName() + ".nvptx.ll";
-  DEBUG(errs() << file << "\n");
-  Constant* filename = ConstantDataArray::get(M.getContext(),
-        ArrayRef<uint8_t>((uint8_t*)file.str().c_str(), file.str().length()));
-
-  CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
-                                      ArrayRef<Value*>(filename),
-                                      "graph"+CF->getName(),
-                                      IB);
-
-  // Iterate over the required input edges of the node and use the visc-rt API
-  // to set inputs
-  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
-
-    Value* inputVal = getInValueAt(C, i, F_X86, IB);
-    // input value has been obtained.
-    // Check if input is a scalar value or a pointer operand
-    // For scalar values such as int, float, etc. the size is simply the size of
-    // type on target machine, but for pointers, the size of data would be the
-    // next integer argument
-    Value* inputSize;
-    if(inputVal->getType()->isPointerTy()) {
-      // Pointer Input
-      inputSize = getInValueAt(C, i+1, F_X86, IB);
-      assert(inputSize->getType()->isIntegerTy()
-          && "Pointer type input must always be followed by size (integer type)");
-    }
-    else { // Scalar Input
-      inputSize = ConstantExpr::getSizeOf(inputVal->getType());
-    }
-
-    Value* setInputArgs[] = {GraphID,
-                            inputVal,
-                            ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                            inputSize
-                            };
-    CallInst::Create(llvm_visc_ptx_input_ptr,
-          ArrayRef<Value*>(setInputArgs, 4), "", IB);
-
-  }
-  // Setup output
-  // FIXME: Note - There is a tricky question. In X86 we do not need to care
-  // about pointer inputs which modify data in memory implicitly (without
-  // showing it as output). There is no extra cost needed to handle such inputs
-  // For PTX, we need to read back such data from device memory to host memory.
-  // The cost is huge and hence we need to differentiate between readonly
-  // pointer inputs vs read/write pointer inputs. Currently supporting only a
-  // simple model in which all input edges are readonly and output is
-  // writeonly.
-  StructType* OutputTy = C->getOutputType();
-  for(unsigned i=0; OutputTy->getNumElements(); i++) {
-    Type* elemTy = OutputTy->getElementType(i);
-  }
-
-  // Enqueue kernel
-  // Read Output
-  // return output
-  // free data structures
-
-}
-
 void CodeGenTraversal::codeGen(DFInternalNode* N) {
   Function* F = N->getFuncPointer();
 
@@ -653,11 +536,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
       continue;
 
     // Check if Child Node has PTX tag or X86 tag
-    invokeChild_PTX(C, F_X86, VMap, RI);
-    if (C->getTag() == DFNode::PTX) 
-      invokeChild_PTX(C, F_X86, VMap, RI);
-    else 
-      invokeChild_X86(C, F_X86, VMap, RI);
+    invokeChild_X86(C, F_X86, VMap, RI);
   }
 
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 4b05c8178bfedd860728ebdb97546b398e4fa33c..f753d9ad1d38d3178d3e3a4f27ff4f1006a18987 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -15,6 +15,11 @@ typedef struct {
   cl_kernel clKernel;
 } DFNodeContext_PTX;
 
+typedef struct {
+  cl_mem d_elem;  // device buffer; llvm_visc_ptx_getOutput overwrites it with a host copy
+  size_t size;    // number of bytes read back from d_elem
+} OutputTy;
+
 cl_context globalGPUContext;
 
 static inline void checkErr(cl_int err, cl_int success, const char * name) {
@@ -118,6 +123,18 @@ void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
   cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
                                 h_output, 0, NULL, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to read output");
+  // Assume the output buffer is an array of OutputTy records (device pointer
+  // followed by size in bytes); copy each element back and store its host pointer
+  OutputTy* Output = (OutputTy*) h_output;
+  unsigned numElems = size/sizeof(OutputTy);
+  for(unsigned i = 0; i < numElems; i++) {
+    OutputTy& outputElem = Output[i];
+    void* h_outputElem = malloc(outputElem.size);
+    errcode = clEnqueueReadBuffer(Context->clCommandQue, outputElem.d_elem, CL_TRUE, 0,
+                                  outputElem.size, h_outputElem, 0, NULL, NULL);
+    checkErr(errcode, CL_SUCCESS, "Failure to read output");
+    Output[i].d_elem = (cl_mem) h_outputElem;
+  }
   return h_output;
 }