diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 05869fa0dfca5b742d828f330e2f17fd21b28aa6..4b392b2c966d802664598c3f2d09331910d6e555 100644
--- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -65,12 +65,12 @@ public:
   virtual void visit(DFInternalNode* N) {
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
-    DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n");
     for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
         e = N->getChildGraph()->end(); i != e; ++i) {
       DFNode* child = *i;
       child->applyDFNodeVisitor(*this);
     }
+    DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n");
     // Generate code for this internal node now. This way all the cloned
     // functions for children exist.
     deleteNode(N);
@@ -80,7 +80,7 @@ public:
   virtual void visit(DFLeafNode* N) {
     DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n");
     deleteNode(N);
-    //errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "DONE" << "\n"); // debug-only trace, like the "Erasing Node" message; N is not touched after deleteNode
   }
 
 };
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 190b9af288bbd77f11e720fd979dc6eb0832bb82..869ca5d5927ffb59db62f1056f77ea357daa77d0 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -54,12 +54,12 @@ public:
 // calls
 class Kernel {
 public:
-  Kernel(Function* _KF, std::vector<unsigned> _inArgMap =
+  Kernel(Function* _KF, DFLeafNode* _KLeafNode,std::vector<unsigned> _inArgMap =
            std::vector<unsigned>(), unsigned _gridDim = 0, std::vector<Value*>
          _globalWGSize = std::vector<Value*>(),
          unsigned _blockDim = 0,
          std::vector<Value*> _localWGSize = std::vector<Value*>())
-    : KernelFunction(_KF), inArgMap(_inArgMap),
+    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
       gridDim(_gridDim), globalWGSize(_globalWGSize),
       blockDim(_blockDim), localWGSize(_localWGSize) {
 
@@ -70,6 +70,7 @@ public:
   }
 
   Function* KernelFunction;
+  DFLeafNode* KernelLeafNode;
   std::vector<unsigned> inArgMap;
   unsigned gridDim;
   unsigned blockDim;
@@ -169,7 +170,7 @@ private:
   Argument* getArgumentAt(Function* F, unsigned offset);
   Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
                       Instruction* InsertBefore);
-  void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName);
+  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
 
   void codeGen(DFInternalNode* N);
   void codeGen(DFLeafNode* N);
@@ -318,16 +319,16 @@ void CodeGenTraversal::addIdxDimArgs(Function* F) {
 
 /* Traverse the function F argument list to get argument at offset*/
 Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) {
+  DEBUG(errs() << "Finding argument " << offset << ":\n");
   assert((F->getFunctionType()->getNumParams() > offset && offset >= 0)
          && "Invalid offset to access arguments!");
-
   Argument* arg;
   Function::arg_iterator i = F->arg_begin(), e = F->arg_end();
   for(; offset != 0 && i!=e; i++) {
     offset--;
   }
   arg = i;
-  DEBUG(errs() << *arg <<"\n");
+  DEBUG(errs() << "\t" << *arg <<"\n");
   return arg;
 }
 
@@ -388,7 +389,7 @@ Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const
 // used to generate a function to associate with this leaf node. The function
 // is responsible for all the memory allocation/transfer and invoking the
 // kernel call on the device
-void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) {
+void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
   // Check if clone already exists. If it does, it means we have visited this
   // function before.
   assert(N->getGenFunc() == NULL && "Code already generated for this node");
@@ -398,7 +399,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
 
   // If kernel struct has not been initialized with kernel function, then fail
-  assert(kernel != NULL && "No kernel found!!");
+  assert(K != NULL && "No kernel found!!");
 
   DEBUG(errs() << "Generating kernel call code\n");
 
@@ -441,6 +442,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   if(!N->isRoot())
     addIdxDimArgs(F_X86);
 
+  /* TODO: Use this code to verify if this is a good pattern for PTX kernel
+
   // Sort children in topological order before code generation for kernel call
   N->getChildGraph()->sortChildren();
 
@@ -464,6 +467,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
 
   Function* CF = C->getFuncPointer();
+  */
+  Function* KF = K->KernelLeafNode->getFuncPointer();
   // Initialize context
   DEBUG(errs() << "Initializing context" << "\n");
   CallInst::Create(llvm_visc_ptx_initContext, None, "", RI);
@@ -472,26 +477,26 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   // Initialize command queue
   Value* fileStr = getStringPointer(FileName, RI, "Filename");
   DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
-  DEBUG(errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n");
-  Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName");
+  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
+  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), RI,"KernelName");
 
   Value* LaunchInstArgs[] = {fileStr, kernelStr};
 
   DEBUG(errs() << "Inserting launch call" << "\n");
   CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
                                        ArrayRef<Value*>(LaunchInstArgs, 2),
-                                       "graph"+CF->getName(),
+                                       "graph"+KF->getName(),
                                        RI);
   DEBUG(errs() << *GraphID << "\n");
   // Iterate over the required input edges of the node and use the visc-rt API
   // to set inputs
   DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
   std::vector<OutputPtr> OutputPointers;
-  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+  for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
 
     // The kernel object gives us the mapping of arguments from kernel launch
     // node function (F_X86) to kernel (kernel->KF)
-    Value* inputVal = getArgumentAt(F_X86, kernel->getInArgMap()[i]);
+    Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]);
     DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
 
     // input value has been obtained.
@@ -502,11 +507,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
     if(inputVal->getType()->isPointerTy()) {
       // Pointer Input
       // CheckAttribute
-      Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False;
-      Value* isInput = ((hasAttribute(CF, i, Attribute::Out))
-                        && !(hasAttribute(CF, i, Attribute::In)))? False : True;
+      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
+      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
+                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
 
-      Argument* A = getArgumentAt(CF, i);
+      Argument* A = getArgumentAt(KF, i);
       if(isOutput == True) {
         DEBUG(errs() << *A << " is an OUTPUT argument\n");
       }
@@ -519,7 +524,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
                              Type::getInt8PtrTy(M.getContext()),
                              inputVal->getName()+".i8ptr",
                              RI);
-      Value* inputSize = getArgumentAt(F_X86, kernel->getInArgMap()[i+1]);
+      Value* inputSize = getArgumentAt(F_X86, K->getInArgMap()[i+1]);
       assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
              && "Pointer type input must always be followed by size (integer type)");
       Value* setInputArgs[] = {GraphID,
@@ -560,11 +565,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   DEBUG(errs() << "Setup output edges of node and insert visc api\n");
 
   // Set output if struct is not an empty struct
-  StructType* OutputTy = C->getOutputType();
+  StructType* OutputTy = K->KernelLeafNode->getOutputType();
   Value *outputSize, *d_Output;
   if(!OutputTy->isEmptyTy()) {
     // Not an empty struct
-    unsigned outputIndex = CF->getFunctionType()->getNumParams();
+    unsigned outputIndex = KF->getFunctionType()->getNumParams();
     outputSize = ConstantExpr::getSizeOf(OutputTy);
     Value* setOutputArgs[] = {GraphID,
                               Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
@@ -576,7 +581,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
 
     d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr,
                                 ArrayRef<Value*>(setOutputArgs, 6),
-                                "d_output."+CF->getName(),
+                                "d_output."+KF->getName(),
                                 RI);
   }
 
@@ -586,7 +591,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   // pass it as an argument to ExecNode
 
   Value *workDim, *LocalWGPtr, *GlobalWGPtr;
-  getExecuteNodeParams(workDim, LocalWGPtr, GlobalWGPtr, kernel, VMap, RI);
+  getExecuteNodeParams(workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
   Value* ExecNodeArgs[] = {GraphID,
                            workDim,
                            LocalWGPtr,
@@ -594,7 +599,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
                           };
   CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode,
                                      ArrayRef<Value*>(ExecNodeArgs, 4),
-                                     "event."+CF->getName(),
+                                     "event."+KF->getName(),
                                      RI);
   DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
 
@@ -613,13 +618,13 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
                              };
     CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput,
                                           ArrayRef<Value*>(GetOutputArgs, 4),
-                                          "h_output."+CF->getName()+".addr",
+                                          "h_output."+KF->getName()+".addr",
                                           RI);
     // Read each device pointer listed in output struct
     // Load the output struct
-    CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI);
-    Value* KernelOutput = new LoadInst(BI, "output."+CF->getName(), RI);
-    OutputMap[C] = KernelOutput;
+    CastInst* BI = BitCastInst::CreatePointerCast(h_Output, KF->getReturnType()->getPointerTo(), "output.ptr", RI);
+    Value* KernelOutput = new LoadInst(BI, "output."+KF->getName(), RI);
+    OutputMap[K->KernelLeafNode] = KernelOutput;
   }
 
   // Read all the pointer arguments which had side effects i.e., had out
@@ -639,7 +644,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
   // Generate code for output bindings
   // Get Exit node
-  C = N->getChildGraph()->getExit();
+  DFNode* C = N->getChildGraph()->getExit();
   // Get OutputType of this node
   StructType* OutTy = N->getOutputType();
   Value *retVal = UndefValue::get(F_X86->getReturnType());
@@ -665,6 +670,10 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
     else {
       // edge is from a internal node
       // Check - code should already be generated for this source dfnode
+      // FIXME: Since the 2-level kernel code gen has a specific structure, we
+      // can assume the SrcDF is same as Kernel Leaf node.
+      // Use outArgMap to get correct mapping
+      SrcDF = K->KernelLeafNode;
       assert(OutputMap.count(SrcDF)
              && "Source node call not found. Dependency violation!");
 
@@ -707,7 +716,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     // Now the remaining nodes to be visited should be ignored
     KernelLaunchNode = NULL;
     errs() << "Insert Runtime calls\n";
-    insertRuntimeCalls(N, getPTXFilename(M));
+    insertRuntimeCalls(N, kernel, getPTXFilename(M));
     writeKernelsModule();
 
   } else {
@@ -770,7 +779,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
   // (2) Parent does not have multiple instances
   if (!pLevel || !pReplFactor) {
     KernelLaunchNode = PNode;
-    kernel = new Kernel(NULL, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits());
+    kernel = new Kernel(NULL, N, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits());
   }
   else {
     // Converting a 2-level DFG to opencl kernel
@@ -779,6 +788,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
     assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
     kernel = new Kernel(NULL,                 // kernel function
+                        N,                    // kernel leaf node
                         N->getInArgMap(),     // kenel argument mapping
                         PNode->getNumOfDim(), // gridDim
                         PNode->getDimLimits(),// grid size
@@ -1316,9 +1326,17 @@ static void getExecuteNodeParams(Value* &workDim, Value* &LocalWGPtr, Value*
     LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(getGlobalContext()));
   }
   else {
+    for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if(isa<Argument>(kernel->localWGSize[i]))
+        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+    }
     LocalWGPtr = genWorkGroupPtr(kernel->localWGSize, VMap, IB, "LocalWGSize");
   }
 
+  for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+    if(isa<Argument>(kernel->globalWGSize[i]))
+      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+  }
   // For OpenCL, global work group size is the total bumber of instances in each
   // dimension. So, multiply local and global dim limits.
   std::vector<Value*> globalWGSizeInsts;
@@ -1350,16 +1368,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa
   // size in that dimension
   for(unsigned i=0; i < WGSize.size(); i++) {
     assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-    // If WGSize[i] is not a constant or a instruction, use mapped value in the new function
-    Value* WGSizeMapped;
-    if(isa<Argument>(WGSize[i]))
-      WGSizeMapped = VMap[WGSize[i]];
-    else {
-      WGSizeMapped = WGSize[i];
-      errs() << "Mapping value is not required: ";
-      errs() << *WGSize[i] << "\n";
-    }
+ 
     if(WGSize[i]->getType() != Int64Ty) {
       // If number of dimensions are mentioned in any other integer format,
       // generate code to extend it to i64. We need to use the mapped value in
@@ -1367,8 +1376,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa
       // FIXME: Why are we changing the kernel WGSize vector here?
       errs() << "Not i64. Zero extend required.\n";
       errs() << *WGSize[i] << "\n";
-      errs() << *WGSizeMapped << "\n";
-      CastInst* CI = BitCastInst::CreateIntegerCast(WGSizeMapped, Int64Ty, true, "", IB);
+      CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
       errs() << "Bitcast done.\n";
       StoreInst* SI = new StoreInst(CI, nextDim, IB);
       errs() << "Zero extend done.\n";
@@ -1376,7 +1384,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa
     } else {
       // Store the value representing work group size in ith dimension on
       // stack
-      StoreInst* SI = new StoreInst(WGSizeMapped, nextDim, IB);
+      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
 
       DEBUG(errs() << "\t Work group size: " << *SI << "\n");
     }