diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 7df93bd099e837335656eb6f5f30aa34e4cb0f41..190b9af288bbd77f11e720fd979dc6eb0832bb82 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -75,20 +75,29 @@ public:
   unsigned blockDim;
   std::vector<Value*> globalWGSize;
   std::vector<Value*> localWGSize;
+  std::vector<int> localDimMap;
 
   std::vector<unsigned> getInArgMap() {
     return inArgMap;
   }
+  void setInArgMap(std::vector<unsigned> map) {
+    inArgMap = map;
+  }
+
+  void setLocalWGSize(std::vector<Value*> V) {
+    localWGSize = V;
+  }
+
 
   bool hasLocalWG() {
-    return blockDim == 0;
+    return blockDim != 0;
   }
 };
 
 // Helper function declarations
 static void getExecuteNodeParams(Value* &, Value* &, Value* &, Kernel*,
                                  ValueToValueMapTy&, Instruction*);
-static Value* genWorkGroupPtr(std::vector<Value*>&, ValueToValueMapTy&,
+static Value* genWorkGroupPtr(std::vector<Value*>, ValueToValueMapTy&,
                               Instruction*, const Twine& WGName = "WGSize");
 static bool hasAttribute(Function*, unsigned, Attribute::AttrKind);
 static std::string getPTXFilename(const Module&);
@@ -615,10 +624,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa
 
   // Read all the pointer arguments which had side effects i.e., had out
   // attribute
+  errs() << "Output Pointers : " << OutputPointers.size() << "\n";
   for(auto output: OutputPointers) {
-    DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
-    DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
-    DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
+    errs() << "Read: " << *output.d_ptr << "\n";
+    errs() << "\tTo: " << *output.h_ptr << "\n";
+    errs() << "\t#bytes: " << *output.bytes << "\n";
 
     Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
     CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput,
@@ -703,7 +713,6 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
   } else {
     DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
     // Keep track of the arguments order.
-
     std::vector<unsigned> map1 = N->getInArgMap();
     std::vector<unsigned> map2 = kernel->getInArgMap();
 
@@ -711,7 +720,33 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) {
     for (unsigned i = 0; i < map2.size(); i++) {
       map2[i] = map1[map2[i]];
     }
-
+    kernel->setInArgMap(map2);
+
+    // Track the source of local dimlimits for the kernel
+    // Dimension limit can either be a constant or an argument of parent
+    // function. Since Internal node would no longer exist, we need to insert the
+    // localWGSize with values from the parent of N.
+    std::vector<Value*> localWGSizeMapped;
+    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if (isa<Constant>(kernel->localWGSize[i])) {
+        // if constant, use as it is
+        localWGSizeMapped.push_back(kernel->localWGSize[i]);
+      }
+      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+        // if argument, find the argument location in N. Use InArgMap of N to
+        // find the source location in Parent of N. Retrieve the argument from
+        // parent to insert in the vector.
+        unsigned argNum = Arg->getArgNo();
+        unsigned parentArgNum = N->getInArgMap()[argNum];
+        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        localWGSizeMapped.push_back(A);
+      }
+      else {
+        assert(false && "LocalWGsize using value which is neither argument nor constant!");
+      }
+    }
+    // Update localWGSize vector of kernel
+    kernel->setLocalWGSize(localWGSizeMapped);
   }
 
 }
@@ -736,20 +771,6 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
   if (!pLevel || !pReplFactor) {
     KernelLaunchNode = PNode;
     kernel = new Kernel(NULL, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits());
-    // TODO: Find a good way of choosing parameters - is this required?
-    //kernel->gridDim = N->getNumOfDim();
-    //kernel->blockDim = N->getNumOfDim();
-    //kernel->globalWGSize = N->getDimLimits();
-    //kernel->localWGSize = N->getDimLimits();
-    //FIXME: Comment this out as we can provide localWGSize as null
-    //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
-    // TODO: How to choose the div factor;
-    //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
-    //std::vector<Value*> tmp(kernel->gridDim, divFactor);
-    //for (unsigned i = 0; i < kernel->gridDim; i++) {
-    //  BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]);
-    //  kernel->localWGSize.push_back(SDivInst);
-    //}
   }
   else {
     // Converting a 2-level DFG to opencl kernel
@@ -757,15 +778,10 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
     KernelLaunchNode = PNode->getParent();
     assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
-    std::vector<Value*> globalWGSizeInsts;
-    for (unsigned i = 0; i < PNode->getNumOfDim(); i++) {
-      BinaryOperator* MulInst = BinaryOperator::CreateMul(PNode->getDimLimits()[i],N->getDimLimits()[i]);
-      globalWGSizeInsts.push_back(MulInst);
-    }
     kernel = new Kernel(NULL,                 // kernel function
                         N->getInArgMap(),     // kenel argument mapping
                         PNode->getNumOfDim(), // gridDim
-                        globalWGSizeInsts,    // grid size
+                        PNode->getDimLimits(),// grid size
                         N->getNumOfDim(),     // blockDim
                         N->getDimLimits());   // block size
 
@@ -885,21 +901,26 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
 
         // The following is to find which function to call
         Function * OpenCLFunction;
-        int parentLevel = ParentDFNode->getLevel();
-        int parentReplFactor = ParentDFNode->getNumOfDim();
+        int parentLevel = N->getParent()->getLevel();
+        int parentReplFactor = N->getParent()->getNumOfDim();
+        errs() << "Parent Level = " << parentLevel << "\n";
+        errs() << "Parent Repl factor = " << parentReplFactor << "\n";
 
-        if (!parentLevel || !parentReplFactor) {
+        if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
-          // launch, so we need to specify a global id
-
+          // launch, so we need to specify a global id.
+          // We can translate this only if the argument is the current node
+          // itself
+          errs() << "Substitute with get_global_id()\n";
+          errs() << *II << "\n";
           FunctionType* FT =
             FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
                               std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                               false);
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's id with respect to its parent
           // this is a local id call
           FunctionType* FT =
@@ -918,7 +939,11 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT));
         } else {
-          assert(false && "Unable to translate this intrinsic");
+          errs() << N->getFuncPointer()->getName() << "\n";
+          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
+          errs() << *II << "\n";
+
+          assert(false && "Unable to translate getNodeInstanceID intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
@@ -973,7 +998,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
                               false);
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
+        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's instances
           // this is a local size (block dim) call
           FunctionType* FT =
@@ -992,7 +1017,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) {
           OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT));
         } else {
-          assert(false && "Unable to translate this intrinsic");
+          assert(false && "Unable to translate getNumNodeInstances intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
@@ -1294,11 +1319,20 @@ static void getExecuteNodeParams(Value* &workDim, Value* &LocalWGPtr, Value*
     LocalWGPtr = genWorkGroupPtr(kernel->localWGSize, VMap, IB, "LocalWGSize");
   }
 
-  GlobalWGPtr = genWorkGroupPtr(kernel->globalWGSize, VMap, IB, "GlobalWGSize");
+  // For OpenCL, global work group size is the total bumber of instances in each
+  // dimension. So, multiply local and global dim limits.
+  std::vector<Value*> globalWGSizeInsts;
+  for (unsigned i = 0; i < kernel->gridDim; i++) {
+    BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
+    globalWGSizeInsts.push_back(MulInst);
+  }
+  GlobalWGPtr = genWorkGroupPtr(globalWGSizeInsts, VMap, IB, "GlobalWGSize");
   DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
-static Value* genWorkGroupPtr(std::vector<Value*>& WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
+// CodeGen for allocating space for Work Group on stack and returning a pointer
+// to its address
+static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
   Value* WGPtr;
   // Get int64_t and or ease of use
   Type* Int64Ty = Type::getInt64Ty(getGlobalContext());
@@ -1316,18 +1350,34 @@ static Value* genWorkGroupPtr(std::vector<Value*>& WGSize, ValueToValueMapTy& VM
   // size in that dimension
   for(unsigned i=0; i < WGSize.size(); i++) {
     assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+
+    // If WGSize[i] is not a constant or a instruction, use mapped value in the new function
+    Value* WGSizeMapped;
+    if(isa<Argument>(WGSize[i]))
+      WGSizeMapped = VMap[WGSize[i]];
+    else {
+      WGSizeMapped = WGSize[i];
+      errs() << "Mapping value is not required: ";
+      errs() << *WGSize[i] << "\n";
+    }
     if(WGSize[i]->getType() != Int64Ty) {
       // If number of dimensions are mentioned in any other integer format,
       // generate code to extend it to i64. We need to use the mapped value in
       // the new generated function, hence the use of VMap
       // FIXME: Why are we changing the kernel WGSize vector here?
-      WGSize[i] = BitCastInst::CreateIntegerCast(VMap[WGSize[i]], Int64Ty, true, "", IB);
-      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
+      errs() << "Not i64. Zero extend required.\n";
+      errs() << *WGSize[i] << "\n";
+      errs() << *WGSizeMapped << "\n";
+      CastInst* CI = BitCastInst::CreateIntegerCast(WGSizeMapped, Int64Ty, true, "", IB);
+      errs() << "Bitcast done.\n";
+      StoreInst* SI = new StoreInst(CI, nextDim, IB);
+      errs() << "Zero extend done.\n";
       DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
     } else {
       // Store the value representing work group size in ith dimension on
       // stack
-      StoreInst* SI = new StoreInst(VMap[WGSize[i]], nextDim, IB);
+      StoreInst* SI = new StoreInst(WGSizeMapped, nextDim, IB);
+
       DEBUG(errs() << "\t Work group size: " << *SI << "\n");
     }
     if(i+1 < WGSize.size()) {
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 32ffaa13410a0321de1c1ad094effe6b6daadb0c..818be97f6ff22d73fe362feff59cb5062bcef987 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -90,9 +90,15 @@ public:
   void codeGenLaunch(DFInternalNode* Root);
 
   virtual void visit(DFInternalNode* N) {
+    // If code has already been generated for this internal node, skip the
+    // children
+    if(N->getGenFunc() != NULL)
+      return;
+
+    DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
+
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
-    DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
     for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
         e = N->getChildGraph()->end(); i != e; ++i) {
       DFNode* child = *i;