diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index d068301bc7d48e55fb78c2c10f7c10efbc797d4f..247154b6801abe7b7dffc1e90487ad872d832f9b 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -496,7 +496,6 @@ public:
   }
 
   void removeGenFuncForTarget(visc::Target T) {
-  errs() << "Target tag = " << T << "\n";
     switch (T) {
       case visc::CPU_TARGET:
         GenFuncs.CPUGenFunc = NULL;
diff --git a/llvm/include/llvm/SupportVISC/DFG2LLVM.h b/llvm/include/llvm/SupportVISC/DFG2LLVM.h
index a036d255c81aeec436b1add99ac0738ec3a46860..9a48405cd5d2631905e265a547167f79dc2eb681 100644
--- a/llvm/include/llvm/SupportVISC/DFG2LLVM.h
+++ b/llvm/include/llvm/SupportVISC/DFG2LLVM.h
@@ -17,6 +17,7 @@
 #include "llvm/BuildDFG/BuildDFG.h"
 #include "llvm/SupportVISC/VISCHint.h"
 #include "llvm/SupportVISC/VISCTimer.h"
+#include "llvm/SupportVISC/VISCUtils.h"
 
 using namespace llvm;
 using namespace builddfg;
@@ -84,8 +85,10 @@ protected:
 
   // Functions
   Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = "");
-  void addArgument(Function*, Type*, const Twine& Name = "");
-  void addIdxDimArgs(Function* F);
+//  void addArgument(Function*, Type*, const Twine& Name = "");
+  Function *addArgument(Function*, Type*, const Twine& Name = "");
+//  void addIdxDimArgs(Function* F);
+  Function *addIdxDimArgs(Function* F);
   std::vector<Value*> extractElements(Value*, std::vector<Type*>,
       std::vector<std::string>, Instruction*);
   Argument* getArgumentAt(Function* F, unsigned offset);
@@ -223,7 +226,29 @@ Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const
 }
 
 // Add an argument of type Ty to the given function F
-void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
+//void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
+//  // Add the argument to argument list
+//  new Argument(Ty, name, F);
+//
+//  // Create the argument type list with added argument types
+//  std::vector<Type*> ArgTypes;
+//  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+//      ai != ae; ++ai) {
+//    ArgTypes.push_back(ai->getType());
+//  }
+//  // Adding new arguments to the function argument list, would not change the
+//  // function type. We need to change the type of this function to reflect the
+//  // added arguments
+//  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+//  PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
+//
+//  // Change the function type
+//  F->mutateType(PTy);
+//}
+
+// Creates a clone of F with one more argument of the given type and name.
+// F is not deleted here; erasing it is up to the caller (see addIdxDimArgs).
+Function *CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
   // Add the argument to argument list
   new Argument(Ty, name, F);
 
@@ -235,21 +260,43 @@ void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
   }
   // Adding new arguments to the function argument list, would not change the
   // function type. We need to change the type of this function to reflect the
-  // added arguments
+  // added arguments. So, we create a clone of this function with the correct
+  // type.
   FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
-  PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
+  Function *newF = viscUtils::cloneFunction(F, FTy, false);
+
+  // Check if the function is used by a metadata node
+  if(F->isUsedByMetadata()) {
+    viscUtils::fixHintMetadata(*F->getParent(), F, newF);
+  }
 
-  // Change the function type
-  F->mutateType(PTy);
+  return newF;
 }
 
 // Change the argument list of function F to add index and limit arguments
-void CodeGenTraversal::addIdxDimArgs(Function* F) {
+//void CodeGenTraversal::addIdxDimArgs(Function* F) {
+//  // Add Index and Dim arguments
+//  std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
+//  for (int i = 0; i < 6; ++i) {
+//    addArgument(F, Type::getInt32Ty(F->getContext()), names[i]);
+//  }
+//}
+
+// Returns a clone of F extended with six i64 arguments (idx_x .. dim_z).
+// F and each intermediate clone are erased from the module; do not reuse F.
+Function *CodeGenTraversal::addIdxDimArgs(Function* F) {
+  errs() << "Function Type: " << *F->getFunctionType() << "\n";
   // Add Index and Dim arguments
   std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
+  Function *newF = F; // initialized so the returned pointer is never indeterminate
   for (int i = 0; i < 6; ++i) {
-    addArgument(F, Type::getInt32Ty(F->getContext()), names[i]);
+    newF = addArgument(F, Type::getInt64Ty(F->getContext()), names[i]);
+    F->replaceAllUsesWith(UndefValue::get(F->getType())); // drop uses of the old function
+    F->eraseFromParent(); // addArgument leaves the old function behind
+    F = newF; // the next argument is added to the fresh clone
   }
+  errs() << "Function Type after adding args: " << *newF->getFunctionType() << "\n";
+  return newF;
 }
 
 // Extract elements from an aggregate value. TyList contains the type of each
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index bb83544969c3fa88fb0dd10557cae734ced28786..845e5a7f9f50fcd9cf5eaa8455b5d913ab401a1c 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -354,8 +354,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
        i != e; ++i) {
     dest_iterator->setName(i->getName()); // Copy the name over...
-    // Add mapping to VMap and increment dest iterator
-    VMap[&*i] = &*dest_iterator;
+    // Increment dest iterator
     ++dest_iterator;
   }
 
@@ -364,17 +363,29 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   ReturnInst* RI = ReturnInst::Create(M.getContext(),
                                       UndefValue::get(F_X86->getReturnType()), BB);
 
-  //Add the generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::GPU_TARGET, true);
-
   // FIXME: Adding Index and Dim arguments are probably not required except
   // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
   // have those arguments)
 
   // Add Index and Dim arguments except for the root node
   if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
-    addIdxDimArgs(F_X86);
+    F_X86 = addIdxDimArgs(F_X86);
+
+  BB = &*F_X86->begin();
+  RI = cast<ReturnInst>(BB->getTerminator());
+
+  //Add the generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::GPU_TARGET, true);
+
+  // Loop over the arguments, to create the VMap
+  dest_iterator = F_X86->arg_begin();
+  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+       i != e; ++i) {
+    // Add mapping to VMap and increment dest iterator
+    VMap[&*i] = &*dest_iterator;
+    ++dest_iterator;
+  }
 
   /* TODO: Use this code to verufy if this is a good pattern for PTX kernel
 
diff --git a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
index f31e75931c4a997d8c43716e299faf323ac48477..d62d0561ac47c384c21835608a512d3c8246a022 100644
--- a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
@@ -369,7 +369,7 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
 
   // Add Index and Dim arguments except for the root node
   if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
-    addIdxDimArgs(F_X86);
+    F_X86 = addIdxDimArgs(F_X86);
 
   /* TODO: Use this code to verufy if this is a good pattern for OCL kernel
 
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 8794f423e03af521ab44562301924ba106454e1a..d55640dc6f875777ab3fb3fbe75763a4dd6d016d 100644
--- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -949,9 +949,9 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
     Args.push_back(getInValueAt(C, i, F_X86, IB));
   }
 
-  Value* I32Zero = ConstantInt::get(Type::getInt32Ty(F_X86->getContext()), 0);
+  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
   for(unsigned j=0; j<6; j++)
-    Args.push_back(I32Zero);
+    Args.push_back(I64Zero);
 
   errs() << "Function type: " << *CF_X86->getType() << "\n";
   errs() << "Function type: " << *CF->getType() << "\n";
@@ -971,7 +971,7 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
   std::string varNames[3] = {"x", "y", "z"};
   unsigned numArgs = CI->getNumArgOperands();
   for(unsigned j=0; j < C->getNumOfDim(); j++) {
-    Value* indexLimit;
+    Value* indexLimit = NULL;
     // Limit can either be a constant or an arguement of the internal node.
     // In case of constant we can use that constant value directly in the
     // new F_X86 function. In case of an argument, we need to get the mapped
@@ -1300,7 +1300,7 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
       errs() << "No CPU x86 version for child node "
              << C->getFuncPointer()->getName()
-             << " . Skip code gen for parent node "
+             << "\n  Skip code gen for parent node "
              << N->getFuncPointer()->getName() << "\n";
       codeGen = false;
     }
@@ -1318,34 +1318,42 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   
     // Create new function with the same type
     F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
-    errs() << "--------------" << F->getName() << "\n";
+
     // Loop over the arguments, copying the names of arguments over.
     Function::arg_iterator dest_iterator = F_X86->arg_begin();
-    assert(false && "Got here\n");
     for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
          i != e; ++i) {
       dest_iterator->setName(i->getName()); // Copy the name over...
-      // Add mapping to VMap and increment dest iterator
-      VMap[&*i] = &*dest_iterator;
+      // Increment dest iterator
       ++dest_iterator;
     }
-  
-    assert(false && "Got here\n");
 
     // Add a basic block to this empty function
     BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
     ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
                                         UndefValue::get(F_X86->getReturnType()), BB);
 
-    //Add generated function info to DFNode
-//    N->setGenFunc(F_X86, visc::CPU_TARGET);
-    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
-
     // Add Index and Dim arguments except for the root node and the child graph of
     // parent node is not streaming
     if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
-      addIdxDimArgs(F_X86);
+      F_X86 = addIdxDimArgs(F_X86);
+
+    BB = &*F_X86->begin();
+    RI = cast<ReturnInst>(BB->getTerminator());
   
+    //Add generated function info to DFNode
+//    N->setGenFunc(F_X86, visc::CPU_TARGET);
+    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+    // Loop over the arguments, to create the VMap.
+    dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      // Add mapping and increment dest iterator
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+
     // Iterate over children in topological order
     for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
         ce = N->getChildGraph()->end(); ci != ce; ++ci) {
@@ -1675,14 +1683,14 @@ void CGT_X86::codeGen(DFLeafNode* N) {
   // Insert the cloned function into the module
   M.getFunctionList().push_back(F_X86);
 
-  // Add generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
-
   // Add the new argument to the argument list. Add arguments only if the cild
   // graph of parent node is not streaming
   if(!N->getParent()->isChildGraphStreaming())
-    addIdxDimArgs(F_X86);
+    F_X86 = addIdxDimArgs(F_X86);
+
+  // Add generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
 
   // Go through the arguments, and any pointer arguments with in attribute need
   // to have x86_argument_ptr call to get the x86 ptr of the argument
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index e61802d25e2b5ccd4affa7044deb8712feaf9c03..8d52e471652ab5ca17e30ca9328f067ae1ab9942 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -99,7 +99,7 @@ void llvm_visc_x86_dstack_pop() {
   //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
 }
 
-unsigned llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
+uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
   //DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n");
   //unsigned size = DStack.size();
   //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n");
@@ -107,7 +107,7 @@ unsigned llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
   return 0;
 }
 
-unsigned llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
+uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
   //DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n");
   //unsigned size = DStack.size();
   //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n");
diff --git a/llvm/projects/visc-rt/visc-rt.h b/llvm/projects/visc-rt/visc-rt.h
index 20cc6e35a6f0b7802d4e662b61f5c6e2bf086149..69392671fb6a244922c75a6c11c82b2405787732 100644
--- a/llvm/projects/visc-rt/visc-rt.h
+++ b/llvm/projects/visc-rt/visc-rt.h
@@ -71,8 +71,8 @@ class DFGDepth {
 void llvm_visc_x86_dstack_push(unsigned n, unsigned limitX = 0, unsigned iX = 0,
     unsigned limitY = 0, unsigned iY = 0, unsigned limitZ = 0, unsigned iZ = 0);
 void llvm_visc_x86_dstack_pop();
-unsigned llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
-unsigned llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
+uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
+uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
 
 
 /********************* Memory Tracker **********************************/