diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 6d04074b96b1d5043f25c58f6be6ef018cc1edb7..07d7667fa044732293833382a9f57b05c8297509 100644
--- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -74,6 +74,7 @@ public:
     // Generate code for this internal node now. This way all the cloned
     // functions for children exist.
     deleteNode(N);
+    DEBUG(errs() << "\tDone - " << "\n");
     //errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n";
   }
 
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 6926e6cfed65679b455152cffc311ae946dab314..59bdb622b93ef45a4d2b01ed415ff5e86b524470 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -147,6 +147,7 @@ private:
   Constant* llvm_visc_ocl_clearContext;
   Constant* llvm_visc_ocl_argument_scalar;
   Constant* llvm_visc_ocl_argument_ptr;
+  Constant* llvm_visc_ocl_output_ptr;
   Constant* llvm_visc_ocl_free;
   Constant* llvm_visc_ocl_getOutput;
   Constant* llvm_visc_ocl_executeNode;
@@ -245,6 +246,7 @@ void CGT_NVPTX::initRuntimeAPI() {
   DECLARE(llvm_visc_ocl_clearContext);
   DECLARE(llvm_visc_ocl_argument_scalar);
   DECLARE(llvm_visc_ocl_argument_ptr);
+  DECLARE(llvm_visc_ocl_output_ptr);
   DECLARE(llvm_visc_ocl_free);
   DECLARE(llvm_visc_ocl_getOutput);
   DECLARE(llvm_visc_ocl_executeNode);
@@ -332,7 +334,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   // have those arguments)
 
   // Add Index and Dim arguments except for the root node
-  if(!N->isRoot())
+  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
     addIdxDimArgs(F_X86);
 
   /* TODO: Use this code to verufy if this is a good pattern for PTX kernel
@@ -480,24 +482,23 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 
   // Set output if struct is not an empty struct
   StructType* OutputTy = K->KernelLeafNode->getOutputType();
-  Value *outputSize, *d_Output;
+  std::vector<Value*> d_Outputs;
   if(!OutputTy->isEmptyTy()) {
     switchToTimer(visc_TimerID_COPY_PTR, RI);
     // Not an empty struct
-    unsigned outputIndex = KF->getFunctionType()->getNumParams();
-    outputSize = ConstantExpr::getSizeOf(OutputTy);
-    Value* setOutputArgs[] = {GraphID,
-                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                              ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                              ConstantExpr::getSizeOf(OutputTy),
-                              False,
-                              True
-                             };
-
-    d_Output = CallInst::Create(llvm_visc_ocl_argument_ptr,
-                                ArrayRef<Value*>(setOutputArgs, 6),
+    // For each struct element, call llvm_visc_ocl_output_ptr at argument index NumParams+i
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
+      Value* setOutputArgs[] = {GraphID,
+                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+
+      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
+                                ArrayRef<Value*>(setOutputArgs, 3),
                                 "d_output."+KF->getName(),
                                 RI);
+      d_Outputs.push_back(d_Output);
+    }
   }
 
   // Enqueue kernel
@@ -529,22 +530,30 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   switchToTimer(visc_TimerID_READ_OUTPUT, RI);
   // Read Output Struct if not empty
   if(!OutputTy->isEmptyTy()) {
-    Value* GetOutputArgs[] = {GraphID,
+    // Rebuild the output struct element by element, starting from an undef value
+    Value* KernelOutput = UndefValue::get(OutputTy);
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      Value* GetOutputArgs[] = {GraphID,
                               Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                              d_Output,
-                              outputSize
+                              d_Outputs[i],
+                              ConstantExpr::getSizeOf(OutputTy->getElementType(i))
                              };
-    CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
+      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
                                           ArrayRef<Value*>(GetOutputArgs, 4),
                                           "h_output."+KF->getName()+".addr",
                                           RI);
-    // Read each device pointer listed in output struct
-    // Load the output struct
-    CastInst* BI = BitCastInst::CreatePointerCast(h_Output, KF->getReturnType()->getPointerTo(), "output.ptr", RI);
-    Value* KernelOutput = new LoadInst(BI, "output."+KF->getName(), RI);
+      // Cast the buffer returned by llvm_visc_ocl_getOutput to a pointer to
+      // the i-th element type and load that element of the output struct
+      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
+          OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
+
+      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
+                                            KF->getName()+"output", RI);
+    }
     OutputMap[K->KernelLeafNode] = KernelOutput;
   }
-
+  
   // Read all the pointer arguments which had side effects i.e., had out
   // attribute
   DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n");
@@ -642,7 +651,14 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
 void CGT_NVPTX::codeGen(DFInternalNode* N) {
-
+  // Debug-only trace of the visited node and the current kernel launch node
+  DEBUG(errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n");
+  if(KernelLaunchNode == NULL) {
+    DEBUG(errs () << "No kernel launch node\n");
+  } else {
+    DEBUG(errs() << "KernelLaunchNode is not null: "<< KernelLaunchNode<<"\n");
+    DEBUG(errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n");
+  }
   if (!KernelLaunchNode) {
     DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
     return;
@@ -721,6 +737,13 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
+ 
+  // Generate code only if checkPreferredTarget accepts this node for GPU_TARGET
+  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
+    DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n");
+    return;
+  }
+ 
 
   // Checking which node is the kernel launch
   DFNode* PNode = N->getParent();
@@ -1185,23 +1208,16 @@ void CGT_NVPTX::transformFunctionToVoid(Function* F) {
   else {
     // The struct has return values, thus needs to be converted to parameter
 
-    int initialNumParams = F->arg_size();
-
-    Type* ArgType = FRetTy->getPointerTo(GENERIC_ADDRSPACE);
-    new Argument(ArgType, "ret_struct_ptr", F);
-    DEBUG(errs() << "\tCreated parameter\n");
-
-    // Find where the new parameter is in the header
-    Function::arg_iterator ai, ae;
-    int check = 0;
-    for (ai = F->arg_begin(), ae = F->arg_end();
-         ai != ae; ++ai) {
-      if (ai->getName().equals("ret_struct_ptr")) break;
-      check++;
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    std::vector<Argument*> Args;
+    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      Args.push_back(RetArg);
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
     }
 
-    // DEBUG(errs() << "\tcheck = " << check << "\tinitialNumParams = " << initialNumParams << "\n");
-    assert(check == initialNumParams);
+    Function::arg_iterator ai, ae;
 
     DEBUG(errs() << "\tReplacing Return statements\n");
     // Replace return statements with extractValue and store instructions
@@ -1209,11 +1225,15 @@ void CGT_NVPTX::transformFunctionToVoid(Function* F) {
          rie = RItoRemove.end(); rii != rie; ++rii) {
       ReturnInst* RI = (*rii);
       Value* RetVal = RI->getReturnValue();
+      for(unsigned i = 0; i < Args.size(); i++) {
+        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+                                    Args[i]->getName()+".val", RI);
+        new StoreInst(EI, Args[i], RI);
+      }
       // assert(RetVal && "Return value should not be null at this point");
       // StructType* RetType = cast<StructType>(RetVal->getType());
       // assert(RetType && "Return type is not a struct");
 
-      new StoreInst(RetVal, &(*ai), RI);
       ReturnInst::Create((F->getContext()), 0, RI);
       RI->eraseFromParent();
 
@@ -1237,7 +1257,6 @@ void CGT_NVPTX::transformFunctionToVoid(Function* F) {
 
   // Change the function type
   F->mutateType(PTy);
-
 }
 
 /******************************************************************************
diff --git a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
index a05fe6080262ccf4657650c6169900ac4e35d6f9..51a40baa94bae48dc4fe4b5994d3bd43f07e8e4c 100644
--- a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
@@ -151,6 +151,7 @@ private:
   Constant* llvm_visc_ocl_clearContext;
   Constant* llvm_visc_ocl_argument_scalar;
   Constant* llvm_visc_ocl_argument_ptr;
+  Constant* llvm_visc_ocl_output_ptr;
   Constant* llvm_visc_ocl_free;
   Constant* llvm_visc_ocl_getOutput;
   Constant* llvm_visc_ocl_executeNode;
@@ -177,6 +178,7 @@ public:
 
   // Constructor
   CGT_SPIR(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(*CloneModule(&_M)) {
+    KernelLaunchNode = NULL;
     init();
     initRuntimeAPI();
 
@@ -250,6 +252,7 @@ void CGT_SPIR::initRuntimeAPI() {
   DECLARE(llvm_visc_ocl_clearContext);
   DECLARE(llvm_visc_ocl_argument_scalar);
   DECLARE(llvm_visc_ocl_argument_ptr);
+  DECLARE(llvm_visc_ocl_output_ptr);
   DECLARE(llvm_visc_ocl_free);
   DECLARE(llvm_visc_ocl_getOutput);
   DECLARE(llvm_visc_ocl_executeNode);
@@ -337,7 +340,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
   // have those arguments)
 
   // Add Index and Dim arguments except for the root node
-  if(!N->isRoot())
+  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
     addIdxDimArgs(F_X86);
 
   /* TODO: Use this code to verufy if this is a good pattern for OCL kernel
@@ -485,24 +488,23 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
 
   // Set output if struct is not an empty struct
   StructType* OutputTy = K->KernelLeafNode->getOutputType();
-  Value *outputSize, *d_Output;
+  std::vector<Value*> d_Outputs;
   if(!OutputTy->isEmptyTy()) {
     switchToTimer(visc_TimerID_COPY_PTR, RI);
     // Not an empty struct
-    unsigned outputIndex = KF->getFunctionType()->getNumParams();
-    outputSize = ConstantExpr::getSizeOf(OutputTy);
-    Value* setOutputArgs[] = {GraphID,
-                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                              ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                              ConstantExpr::getSizeOf(OutputTy),
-                              False,
-                              True
-                             };
-
-    d_Output = CallInst::Create(llvm_visc_ocl_argument_ptr,
-                                ArrayRef<Value*>(setOutputArgs, 6),
+    // For each struct element, call llvm_visc_ocl_output_ptr at argument index NumParams+i
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
+      Value* setOutputArgs[] = {GraphID,
+                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+
+      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
+                                ArrayRef<Value*>(setOutputArgs, 3),
                                 "d_output."+KF->getName(),
                                 RI);
+      d_Outputs.push_back(d_Output);
+    }
   }
 
   // Enqueue kernel
@@ -534,19 +536,27 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
   switchToTimer(visc_TimerID_READ_OUTPUT, RI);
   // Read Output Struct if not empty
   if(!OutputTy->isEmptyTy()) {
-    Value* GetOutputArgs[] = {GraphID,
+    // Rebuild the output struct element by element, starting from an undef value
+    Value* KernelOutput = UndefValue::get(OutputTy);
+    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+      Value* GetOutputArgs[] = {GraphID,
                               Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                              d_Output,
-                              outputSize
+                              d_Outputs[i],
+                              ConstantExpr::getSizeOf(OutputTy->getElementType(i))
                              };
-    CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
+      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
                                           ArrayRef<Value*>(GetOutputArgs, 4),
                                           "h_output."+KF->getName()+".addr",
                                           RI);
-    // Read each device pointer listed in output struct
-    // Load the output struct
-    CastInst* BI = BitCastInst::CreatePointerCast(h_Output, KF->getReturnType()->getPointerTo(), "output.ptr", RI);
-    Value* KernelOutput = new LoadInst(BI, "output."+KF->getName(), RI);
+      // Cast the buffer returned by llvm_visc_ocl_getOutput to a pointer to
+      // the i-th element type and load that element of the output struct
+      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
+          OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
+
+      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
+                                            KF->getName()+"output", RI);
+    }
     OutputMap[K->KernelLeafNode] = KernelOutput;
   }
 
@@ -647,6 +657,14 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
 void CGT_SPIR::codeGen(DFInternalNode* N) {
+  // Debug-only trace of the visited node and the current kernel launch node
+  DEBUG(errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n");
+  if(KernelLaunchNode == NULL) {
+    DEBUG(errs () << "No kernel launch node\n");
+  } else {
+    DEBUG(errs() << "KernelLaunchNode is not null: "<< KernelLaunchNode<<"\n");
+    DEBUG(errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n");
+  }
 
   if (!KernelLaunchNode) {
     DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
@@ -720,6 +738,31 @@ void CGT_SPIR::codeGen(DFInternalNode* N) {
 
 }
 
+//static bool checkPreferredTarget(DFNode* N, visc::Target T) {
+  //Function* F = N->getFuncPointer();
+  //Module* M = F->getParent();
+  //NamedMDNode* HintNode;
+  //switch (T) {
+    //case visc::GPU_TARGET:
+      //HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+      //break;
+    //case visc::SPIR_TARGET:
+      //HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+      //break;
+    //case visc::CPU_TARGET:
+      //HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+      //break;
+    //default:
+      //llvm_unreachable("Target Not supported yet!");
+  //}
+  //for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+    //MDNode* MetaNode = HintNode->getOperand(i);
+    //if(F == MetaNode->getOperand(0))
+      //return true;
+  //}
+  //return false;
+//}
+
 void CGT_SPIR::codeGen(DFLeafNode* N) {
 
   // Skip code generation if it is a dummy node
@@ -728,6 +771,12 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
     return;
   }
 
+  // Generate code only if checkPreferredTarget accepts this node for SPIR_TARGET
+  if(!checkPreferredTarget(N, visc::SPIR_TARGET)) {
+    DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n");
+    return;
+  }
+
   // Checking which node is the kernel launch
   DFNode* PNode = N->getParent();
   int pLevel = PNode->getLevel();
@@ -739,6 +788,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
   // (2) Parent does not have multiple instances
   if (!pLevel || !pReplFactor) {
     KernelLaunchNode = PNode;
+    DEBUG(errs() << "Setting Kernel Launch Node\n");
     kernel = new Kernel(NULL,
                         N,
                         N->getInArgMap(),
@@ -1203,23 +1253,16 @@ void CGT_SPIR::transformFunctionToVoid(Function* F) {
   else {
     // The struct has return values, thus needs to be converted to parameter
 
-    int initialNumParams = F->arg_size();
-
-    Type* ArgType = FRetTy->getPointerTo(GENERIC_ADDRSPACE);
-    new Argument(ArgType, "ret_struct_ptr", F);
-    DEBUG(errs() << "\tCreated parameter\n");
-
-    // Find where the new parameter is in the header
-    Function::arg_iterator ai, ae;
-    int check = 0;
-    for (ai = F->arg_begin(), ae = F->arg_end();
-         ai != ae; ++ai) {
-      if (ai->getName().equals("ret_struct_ptr")) break;
-      check++;
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    std::vector<Argument*> Args;
+    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
+      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      Args.push_back(RetArg);
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
     }
 
-    // DEBUG(errs() << "\tcheck = " << check << "\tinitialNumParams = " << initialNumParams << "\n");
-    assert(check == initialNumParams);
+    Function::arg_iterator ai, ae;
 
     DEBUG(errs() << "\tReplacing Return statements\n");
     // Replace return statements with extractValue and store instructions
@@ -1227,11 +1270,15 @@ void CGT_SPIR::transformFunctionToVoid(Function* F) {
          rie = RItoRemove.end(); rii != rie; ++rii) {
       ReturnInst* RI = (*rii);
       Value* RetVal = RI->getReturnValue();
+      for(unsigned i = 0; i < Args.size(); i++) {
+        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
+                                    Args[i]->getName()+".val", RI);
+        new StoreInst(EI, Args[i], RI);
+      }
       // assert(RetVal && "Return value should not be null at this point");
       // StructType* RetType = cast<StructType>(RetVal->getType());
       // assert(RetType && "Return type is not a struct");
 
-      new StoreInst(RetVal, &(*ai), RI);
       ReturnInst::Create((F->getContext()), 0, RI);
       RI->eraseFromParent();
 
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 98f654d3da748e1250a9a36ac18d8be6a3d1cb0e..5f9f6cb12fc28f64ca1b2ba72c54b27035cdf3f5 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -837,7 +837,7 @@ Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, I
   // argument from argument list of this internal node
   Value* inputVal;
   if(SrcDF->isEntryNode()) {
-    inputVal = getArgumentAt(ParentF_X86, i);
+    inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
     DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
   }
   else {
@@ -881,6 +881,12 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
   for(unsigned j=0; j<6; j++)
     Args.push_back(I32Zero);
 
+  DEBUG(errs() << "Function type: " << *CF_X86->getType() << "\n");
+  DEBUG(errs() << "Function type: " << *CF->getType() << "\n");
+  DEBUG(errs() << "Arguments: " << Args.size() << "\n");
+  for(unsigned i=0; i < Args.size(); i++)
+    DEBUG(errs() << *Args[i]->getType() << " ");
+  DEBUG(errs() << "\n");
   // Call the F_X86 function associated with this node
   CallInst* CI = CallInst::Create(CF_X86, Args,
                                   CF_X86->getName()+"_output",
diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
index 8ad0602703745e0a2c8b78fbda3a6b410d493408..3aa7735ee36720574b589d578cdd303abbf4b194 100644
--- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -120,7 +120,7 @@ static void addHint(Function* F, visc::Target T) {
     HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
   }
   else {
-    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
   }
 
   // Create a node for the function and add it to the hint node
@@ -826,11 +826,14 @@ bool GenVISC::runOnModule(Module &M) {
         Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
         DEBUG(errs() << *LaunchF << "\n");
         // Get i8* cast to function pointer
-        Function* graphFunc = cast<Function>(CI->getArgOperand(0));
+        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
         Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
 
+        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0));
+        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
+                             : ConstantInt::getTrue(Ctx);
 
-        Value* LaunchArgs[] = {F, CI->getArgOperand(1), ConstantInt::getTrue(Ctx)};
+        Value* LaunchArgs[] = {F, CI->getArgOperand(2), isStreaming};
         CallInst* LaunchInst = CallInst::Create(LaunchF,
                                                 ArrayRef<Value*>(LaunchArgs, 3),
                                                 "graphID", CI);
@@ -1038,9 +1041,12 @@ bool GenVISC::runOnModule(Module &M) {
   }
 
   // Erase the __visc__node calls
-  DEBUG(errs() << "Erase Statements:\n");
+  DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
   for(auto I: toBeErased) {
     DEBUG(errs() << *I << "\n");
+  }
+  for(auto I: toBeErased) {
+    DEBUG(errs() << "\tErasing " << *I << "\n");
     I->eraseFromParent();
   }
 
@@ -1125,7 +1131,7 @@ void GenVISC::genKernel(Function* KernelF, CallInst* CI, StructType* RetTy) {
     Instruction *I = &(*i);
     if(isVISCattributesCall(I)) {
       handleVISCAttributes(KernelF, cast<CallInst>(I));
-      I->eraseFromParent();
+      //I->eraseFromParent(); // NOTE(review): erase disabled here - confirm the attributes call is removed elsewhere
       break;
     }
   }