diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index d7cdfa2e40cc09daf0987900d4619ac923d6e1dc..425cabd0b485db37c9ea1481e05905f43aef8523 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -443,8 +443,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   DEBUG(errs() << "Initializing commandQ" << "\n");
   // Initialize command queue
   Value* fileStr = getStringPointer(FileName, RI, "Filename");
-  errs() << *fileStr << "\n";
-  errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n";
+  DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
+  DEBUG(errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n");
   Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName");
 
   Value* LaunchInstArgs[] = {fileStr, kernelStr};
@@ -458,7 +458,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   // Iterate over the required input edges of the node and use the visc-rt API
   // to set inputs
   DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
-
   std::vector<OutputPtr> OutputPointers;
   for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
 
@@ -469,6 +468,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
     // type on target machine, but for pointers, the size of data would be the
     // next integer argument
     if(inputVal->getType()->isPointerTy()) {
+      // Pointer Input
       // CheckAttribute
       Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False;
       Value* isInput = ((hasAttribute(CF, i, Attribute::Out))
@@ -482,11 +482,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
         errs() << *A << " is an INPUT argument\n";
       }
 
+
       Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
                              Type::getInt8PtrTy(M.getContext()),
                              inputVal->getName()+".i8ptr",
                              RI);
-      // Pointer Input
       Value* inputSize = getInValueAt(C, i+1, F_X86, RI);
       assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
              && "Pointer type input must always be followed by size (integer type)");
@@ -503,7 +503,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
       // memory to read device memory later
       if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
     }
-    else { // Scalar Input
+    else {
+      // Scalar Input
       // Store the scalar value on stack and then pass the pointer to its
       // location
       AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI);
@@ -525,15 +526,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
 
   }
   DEBUG(errs() << "Setup output edges of node and insert visc api\n");
-  // Setup output
-  // FIXME: Note - There is a tricky question. In X86 we do not need to care
-  // about pointer inputs which modify data in memory implicitly (without
-  // showing it as output). There is no extra cost needed to handle such inputs
-  // For PTX, we need to read back such data from device memory to host memory.
-  // The cost is huge and hence we need to differentiate between readonly
-  // pointer inputs vs read/write pointer inputs. Currently supporting only a
-  // simple model in which all input edges are readonly and output is
-  // writeonly.
 
   // Set output
   StructType* OutputTy = C->getOutputType();
@@ -554,8 +546,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
 
   // Enqueue kernel
   // Need work dim, localworksize, globalworksize
-  // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work
-  // size and global work size
   // Allocate size_t[numDims] space on stack. Store the work group sizes and
   // pass it as an argument to ExecNode
 
@@ -571,11 +561,13 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
                                      "event."+CF->getName(),
                                      RI);
   DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
+
   // Wait for Kernel to Finish
   CallInst::Create(llvm_visc_ptx_wait,
                    ArrayRef<Value*>(GraphID),
                    "",
                    RI);
+
   // Read Output Struct
   Value* GetOutputArgs[] = {GraphID,
                             Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
@@ -589,7 +581,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   // Read each device pointer listed in output struct
   // Load the output struct
   CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI);
-  Value* KernelOutput = new LoadInst(BI, "", RI);
+  Value* KernelOutput = new LoadInst(BI, "output."+CF->getName(), RI);
+  OutputMap[C] = KernelOutput;
 
   // Read all the pointer arguments which had side effects i.e., had out
   // attribute
@@ -602,37 +595,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
                                     ArrayRef<Value*>(GetOutputArgs, 4),
                                     "", RI);
   }
-  /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-    Type* elemTy = OutputTy->getElementType(i);
-    if(elemTy->isPointerTy()) {
-      // Pointer type
-      assert(OutputTy->getElementType(i+1) == Type::getInt64Ty(M.getContext())
-          && "Every Pointer type must be followed by an integer");
-      ExtractValueInst* d_ptr = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i), "", RI);
-      // Change d_ptr to i8*
-      CastInst* d_ptr_i8 = BitCastInst::CreatePointerCast(d_ptr, Type::getInt8PtrTy(M.getContext()), "", RI);
-      ExtractValueInst* len = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i+1), "", RI);
-      // GetOutputPtr call
-      Value* GetOutputArgs[] = {GraphID,
-                                d_ptr_i8,
-                                len};
-      CallInst* h_ptr_i8 = CallInst::Create(llvm_visc_ptx_getOutput,
-                                            ArrayRef<Value*>(GetOutputArgs, 3),
-                                            "",
-                                            RI);
-      // Change h_ptr to correct type
-      CastInst* h_ptr = CastInst::CreatePointerCast(h_ptr_i8,
-                                               cast<StructType>(KernelOutput->getType())->getElementType(i),
-                                               "",
-                                               RI);
-      KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI);
-
-    }
-  }*/
-
-  // Prepare output
-  KernelOutput->setName("output."+CF->getName());
-  OutputMap[C] = KernelOutput;
 
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
   // Generate code for output bindings
@@ -681,6 +643,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
     IdxList.push_back(i);
     retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
   }
+
   DEBUG(errs() << "Extracted all\n");
   retVal->setName("output");
   ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);