diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 97e0f3411b7d74e97b1b6aca3d9f7d09f0650d72..642202bf055adcf534b7f1b44b3ab71858347ecb 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -953,8 +953,13 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // (2) Parent does not have multiple instances
   errs() << "pLevel = " << pLevel << "\n";
   errs() << "pReplFactor = " << pReplFactor << "\n";
-  if (!pLevel || !pReplFactor) {
+  assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node.");
+
+  // Only these options are supported
+  enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy;
+  if(pLevel == 1 || !pReplFactor) {
     errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n";
+    SelectedHierarchy = ONE_LEVEL;
     KernelLaunchNode = PNode;
     kernel = new Kernel(NULL,
                         N,
@@ -967,6 +972,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   else {
     // Converting a 2-level DFG to opencl kernel
     errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n";
+    assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node.");
+    SelectedHierarchy = TWO_LEVEL;
     KernelLaunchNode = PNode->getParent();
     assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
@@ -982,7 +989,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
   }
 
-  std::vector<IntrinsicInst *> IItoRemove;
+  std::vector<Instruction *> IItoRemove;
   BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
 
   // Get the function associated with the dataflow node
@@ -1127,6 +1134,33 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
 
+  // Helper: redirect a call instruction to a clone of its callee (one clone per callee)
+  std::map<Function *, Function *> OrgToClonedFuncMap;
+  std::vector<Function *> FuncToBeRemoved;
+  auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) {
+    // Reuse the clone made for an earlier call to OrgFunc when there is one;
+    // otherwise clone it now and queue the clone in FuncToBeRemoved.
+    Function *Cloned = nullptr;
+    auto Found = OrgToClonedFuncMap.find(OrgFunc);
+    if(Found != OrgToClonedFuncMap.end()) {
+      Cloned = Found->second;
+    } else {
+      ValueToValueMapTy VMap;
+      Cloned = CloneFunction(OrgFunc, VMap);
+      OrgToClonedFuncMap[OrgFunc] = Cloned;
+      FuncToBeRemoved.push_back(Cloned);
+    }
+    // Build an equivalent call to the clone, carrying over CI's arguments.
+    std::vector<Value*> CallArgs;
+    for(unsigned Idx = 0, NumArgs = CI->getNumArgOperands(); Idx != NumArgs; ++Idx)
+      CallArgs.push_back(CI->getArgOperand(Idx));
+    const bool ReturnsVoid = OrgFunc->getReturnType()->isVoidTy();
+    CallInst *NewCall = CallInst::Create(Cloned, CallArgs, ReturnsVoid ? "" : CI->getName(), CI);
+    // Redirect users to the new call; the old call is only queued for erasure.
+    CI->replaceAllUsesWith(NewCall);
+    IItoRemove.push_back(CI);
+    return Cloned;
+  };
+
 
   // Go through all the instructions
   for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
@@ -1212,16 +1246,12 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
         // The following is to find which function to call
         Function * OpenCLFunction;
-        int parentLevel = N->getParent()->getLevel();
-        int parentReplFactor = N->getParent()->getNumOfDim();
-        DEBUG(errs() << "Parent Level = " << parentLevel << "\n");
-        DEBUG(errs() << "Parent Repl factor = " << parentReplFactor << "\n");
 
         FunctionType* FT =
           FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
                             Type::getInt32Ty(KernelM->getContext()),
                             false);
-        if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) {
+        if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
           // launch, so we need to specify a global id.
@@ -1296,14 +1326,12 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
         // The following is to find which function to call
         Function * OpenCLFunction;
-        int parentLevel = ParentDFNode->getLevel();
-        int parentReplFactor = ParentDFNode->getNumOfDim();
         FunctionType* FT =
             FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
                               Type::getInt32Ty(KernelM->getContext()),
                               false);
 
-        if ((N == ArgDFNode) && (!parentLevel || !parentReplFactor)) {
+        if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
           // launch, so the instances are global_size (gridDim x blockDim)
@@ -1412,9 +1440,9 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
                    && "Only handling sin(float) and cos(float)!");
             std::string name;
             if(II->getIntrinsicID() == Intrinsic::sin)
-              name = "_Z3sinf";
+              name = "sin";
             else
-              name = "_Z3cosf";
+              name = "cos";
 
             FunctionType* SinCosFT = FunctionType::get(II->getType(),
                                      Type::getFloatTy(KernelM->getContext()),
@@ -1465,11 +1493,16 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
       }
       else {
-        // Clone the function
-        ValueToValueMapTy VMap;
-        Function* newCalleeF = CloneFunction(calleeF, VMap);
-        newCalleeF->removeFromParent(); //TODO: MARIA check
-        KernelM->getFunctionList().push_back(newCalleeF);
+        // Scan a callee's body only the first time it is cloned: rescanning a
+        // cached clone would replace its calls again and queue them for erasure twice.
+        const bool FirstClone = OrgToClonedFuncMap.count(calleeF) == 0;
+        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+        if(FirstClone) {
+          for(inst_iterator CalleeI = inst_begin(NewFunc), CalleeE = inst_end(NewFunc); CalleeI != CalleeE; ++CalleeI) {
+            if(auto *Call = dyn_cast<CallInst>(&*CalleeI))
+              CloneAndReplaceCall(Call, cast<Function>(Call->getCalledValue()->stripPointerCasts()));
+          }
+        }
       }
       //TODO: how to handle address space qualifiers in load/store
     }
@@ -1480,10 +1513,15 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // have assumed theworst memory behaviour for these function calls
   // Traverse the vector backwards, otherwise definitions are deleted while
   // their subsequent uses are still around
-  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
-       re = IItoRemove.rend(); ri != re; ++ri) {
-    DEBUG(errs() << "Erasing: " << **ri << "\n");
-    (*ri)->eraseFromParent();
+  for (auto *I : IItoRemove) {
+    DEBUG(errs() << "Erasing: " << *I << "\n");
+    I->eraseFromParent();
+  }
+
+  // Move the cloned functions out of their parent module and into the kernel module
+  for(auto *F : FuncToBeRemoved) {
+    F->removeFromParent(); //TODO: MARIA check
+    KernelM->getFunctionList().push_back(F);
   }
 
   addCLMetadata(F_nvptx);
@@ -1936,7 +1974,7 @@ static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValu
 // Get generated PTX binary name
 static std::string getPTXFilename(const Module& M) {
   std::string moduleID = M.getModuleIdentifier();
-  moduleID.append(".nvptx.s");
+  moduleID.append(".kernels.cl");
   return moduleID;
 }