diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index b6471a4cc1420458be65467df814780ea28196e4..a1ac2cb45e216acb643cab0cd29b0b5e26d2d5e3 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -335,7 +335,9 @@ public:
   bool isEntryNode();
   bool isExitNode();
   DFEdge* getInDFEdgeAt(unsigned inPort);
+  DFEdge* getOutDFEdgeAt(unsigned outPort);
   std::vector<unsigned> getInArgMap();
+  std::vector<unsigned> getOutArgMap();
   int getAncestorHops(DFNode* N);
 
   virtual void applyDFNodeVisitor(DFNodeVisitor &V) = 0;
@@ -567,6 +569,19 @@ DFEdge* DFNode::getInDFEdgeAt(unsigned inPort) {
   return NULL;
 }
 
+// Returns the first outgoing edge whose source position equals outPort,
+// or NULL when no outgoing edge starts at that port.
+DFEdge* DFNode::getOutDFEdgeAt(unsigned outPort) {
+  // Cannot perform check for the number of outputs here,
+  // it depends on the node's return type
+  outdfedge_iterator it = outdfedge_begin();
+  for (outdfedge_iterator last = outdfedge_end(); it != last; ++it) {
+    if ((*it)->getSourcePosition() == outPort)
+      return *it;
+  }
+  return NULL;
+}
+
 std::vector<unsigned> DFNode::getInArgMap() {
   std::vector<unsigned> map(InDFEdges.size());
   for (unsigned i = 0; i < InDFEdges.size(); i++) {
@@ -577,6 +592,16 @@ std::vector<unsigned> DFNode::getInArgMap() {
   return map;
 }
 
+// map[dest position] = source position per out-edge; walks edges directly (no NULL port lookup) and grows for sparse destinations.
+std::vector<unsigned> DFNode::getOutArgMap() {
+  std::vector<unsigned> map(OutDFEdges.size());
+  for (outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e; ++i) {
+    if ((*i)->getDestPosition() >= map.size()) map.resize((*i)->getDestPosition() + 1);
+    map[(*i)->getDestPosition()] = (*i)->getSourcePosition();
+  }
+  return map;
+}
+
 int DFNode::getAncestorHops(DFNode* N) {
   DFNode* temp = this->getParent();
   int hops = 1;
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 47b7e18856406a3f1853550587fd662a267a3e47..e8c027686aba7fa19dfb0d3344f208e5ba2302b0 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -54,13 +54,14 @@ public:
 // calls
 class Kernel {
 public:
-  Kernel(Function* _KF, DFLeafNode* _KLeafNode,std::vector<unsigned> _inArgMap =
+  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::vector<unsigned> _inArgMap =
+           std::vector<unsigned>(), std::vector<unsigned> _outArgMap =
            std::vector<unsigned>(), unsigned _gridDim = 0, std::vector<Value*>
          _globalWGSize = std::vector<Value*>(),
          unsigned _blockDim = 0,
          std::vector<Value*> _localWGSize = std::vector<Value*>())
     : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-      gridDim(_gridDim), globalWGSize(_globalWGSize),
+      outArgMap(_outArgMap), gridDim(_gridDim), globalWGSize(_globalWGSize),
       blockDim(_blockDim), localWGSize(_localWGSize) {
 
     assert(gridDim == globalWGSize.size()
@@ -72,6 +73,7 @@ public:
   Function* KernelFunction;
   DFLeafNode* KernelLeafNode;
   std::vector<unsigned> inArgMap;
+  std::vector<unsigned> outArgMap;
   unsigned gridDim;
   unsigned blockDim;
   std::vector<Value*> globalWGSize;
@@ -85,6 +87,13 @@ public:
     inArgMap = map;
   }
 
+  std::vector<unsigned> getOutArgMap() {
+    return this->outArgMap; // returned by value: the caller gets its own copy
+  }
+  void setOutArgMap(std::vector<unsigned> map) {
+    this->outArgMap = map; // overwrite the stored output argument map
+  }
+
   void setLocalWGSize(std::vector<Value*> V) {
     localWGSize = V;
   }
@@ -648,6 +657,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Tw
   // Get OutputType of this node
   StructType* OutTy = N->getOutputType();
   Value *retVal = UndefValue::get(F_X86->getReturnType());
+  // Find the kernel's output arg map, to use instead of the bindings
+  std::vector<unsigned> outArgMap = kernel->getOutArgMap();
   // Find all the input edges to exit node
   for (unsigned i=0; i < OutTy->getNumElements(); i++) {
     DEBUG(errs() << "Output Edge " << i << "\n");
@@ -682,7 +693,10 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Tw
 
       // Extract element at source position from this call instruction
       std::vector<unsigned> IndexList;
-      IndexList.push_back(E->getSourcePosition());
+      // i is the destination of DFEdge E
+      // Use the mapping instead of the bindings
+//      IndexList.push_back(E->getSourcePosition());
+      IndexList.push_back(outArgMap[i]);
       DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
       ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
                              "",RI);
@@ -722,14 +736,32 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
   } else {
     DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
     // Keep track of the arguments order.
-    std::vector<unsigned> map1 = N->getInArgMap();
-    std::vector<unsigned> map2 = kernel->getInArgMap();
+    std::vector<unsigned> inmap1 = N->getInArgMap();
+    std::vector<unsigned> inmap2 = kernel->getInArgMap();
 
-    // The limit is the size of map2, because this is the number of kernel arguments
-    for (unsigned i = 0; i < map2.size(); i++) {
-      map2[i] = map1[map2[i]];
+    // TODO: Verify when we have incoming edges from more than one node
+    // The limit is the size of inmap2, because this is the number of kernel arguments
+    for (unsigned i = 0; i < inmap2.size(); i++) {
+      inmap2[i] = inmap1[inmap2[i]];
+    }
+    kernel->setInArgMap(inmap2);
+
+    // Keep track of the output arguments order.
+    std::vector<unsigned> outmap1 = N->getOutArgMap();
+    std::vector<unsigned> outmap2 = kernel->getOutArgMap();
+
+    // TODO: Change when we have incoming edges to the dummy exit node from more
+    // than one node. In this case, the number of bindings is the same, but
+    // their destination position, thus the index in outmap1, is not
+    // 0 ... outmap2.size()-1
+    // The limit is the size of outmap2, because this is the number of kernel
+    // output arguments for which the mapping matters
+    // For now, it is reasonable to assume that all the kernel arguments are returned,
+    // maybe plus some others from other nodes, thus outmap2.size() <= outmap1.size()
+    for (unsigned i = 0; i < outmap2.size(); i++) {
+      outmap1[i] = outmap2[outmap1[i]];
     }
-    kernel->setInArgMap(map2);
+    kernel->setOutArgMap(outmap1);
 
     // Track the source of local dimlimits for the kernel
     // Dimension limit can either be a constant or an argument of parent
@@ -779,7 +811,12 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
   // (2) Parent does not have multiple instances
   if (!pLevel || !pReplFactor) {
     KernelLaunchNode = PNode;
-    kernel = new Kernel(NULL, N, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits());
+    kernel = new Kernel(NULL,
+                        N,
+                        N->getInArgMap(),
+                        N->getOutArgMap(),
+                        N->getNumOfDim(),
+                        N->getDimLimits());
   }
   else {
     // Converting a 2-level DFG to opencl kernel
@@ -790,6 +827,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
     kernel = new Kernel(NULL,                 // kernel function
                         N,                    // kernel leaf node
                         N->getInArgMap(),     // kenel argument mapping
+                        N->getOutArgMap(),    // kernel output mapping from the leaf to the intermediate node
                         PNode->getNumOfDim(), // gridDim
                         PNode->getDimLimits(),// grid size
                         N->getNumOfDim(),     // blockDim