diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 48d65ec3b4113146ac4e79be681d4e1fc06c221a..681249231e630d09d0212a689ce61350fbc94b90 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -65,6 +65,7 @@ namespace {
   private:
     //Member variables
     Module &M;
+    Module &KernelM;
     BuildDFG &DFG;
     DFNode * KernelLaunchNode;
 
@@ -88,6 +89,8 @@ namespace {
 
 
     //Functions
+    std::string getKernelsModuleName(Module &M);
+    void writeKernelsModule();
     void transformFunctionToVoid(Function* F);
     void initRuntimeAPI();
     void addIdxDimArgs(Function* F);
@@ -100,8 +103,44 @@ namespace {
     void codeGen(DFLeafNode* N);
 
   public:
+
     // Constructor
-    CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { }
+    // Builds the traversal state and the kernels module (KernelM).
+    // KernelM starts as a clone of _M rather than a fresh module so that
+    // required module-level info (metadata) is preserved; every global
+    // variable, function and alias is then removed from the clone.
+    // The clone returned by CloneModule is held by reference; in the code
+    // visible here it is only released by writeKernelsModule().
+    //   _M   - module being compiled; cloned to seed KernelM
+    //   _DFG - BuildDFG instance stored in the DFG member
+    // Initializers follow member declaration order (M, KernelM, DFG).
+    CodeGenTraversal(Module &_M, BuildDFG &_DFG)
+      : M(_M), KernelM(*CloneModule(&_M)), DFG(_DFG) {
+
+      // The cloned functions, initializers and aliases still reference one
+      // another. Sever those references first so nothing is destroyed
+      // while it still has users.
+      KernelM.dropAllReferences();
+
+      // Advance the iterator before erasing: eraseFromParent() invalidates
+      // only the iterator of the erased element.
+      for (Module::global_iterator gi = KernelM.global_begin(),
+           ge = KernelM.global_end(); gi != ge; )
+        (gi++)->eraseFromParent();
+
+      for (Module::iterator fi = KernelM.begin(),
+           fe = KernelM.end(); fi != fe; )
+        (fi++)->eraseFromParent();
+
+      for (Module::alias_iterator ai = KernelM.alias_begin(),
+           ae = KernelM.alias_end(); ai != ae; )
+        (ai++)->eraseFromParent();
+
+      // Adjust the emptied module's data layout and target triple.
+      changeDataLayout(KernelM);
+      changeTargetTriple(KernelM);
+
+    }
 
     virtual void visit(DFInternalNode* N) {
       for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
@@ -557,8 +596,8 @@ namespace {
       ValueToValueMapTy VMap;
       F_nvptx = CloneFunction(F, VMap, true);
 
-      // Insert the cloned function into the module
-      M.getFunctionList().push_back(F_nvptx);
+      // Insert the cloned function into the kernels module
+      KernelM.getFunctionList().push_back(F_nvptx);
 
       DEBUG(errs() << *F_nvptx->getType());
       DEBUG(errs() << *F_nvptx);
@@ -617,6 +656,7 @@ namespace {
             ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
             int numOfDim = ArgDFNode->getNumOfDim();
             DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
+//            IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
             IntegerType* IntTy = Type::getInt32Ty(getGlobalContext());
             ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
 
@@ -648,7 +688,7 @@ namespace {
 
             // Argument of the function to be called
             ConstantInt * DimConstant =
-              ConstantInt::get(Type::getInt32Ty(getGlobalContext()), dim);
+              ConstantInt::get(Type::getInt32Ty(getGlobalContext()) /*KernelM.getContext()*/ , dim);
             ArrayRef<Value *> Args(DimConstant);
 
             // The following is to find which function to call
@@ -662,29 +702,29 @@ namespace {
             // launch, so we need to specify a global id
 
               FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext()),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext())),
+                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                                   false);
               OpenCLFunction = cast<Function>
-                (M.getOrInsertFunction(StringRef("get_global_id"), FT));
+                (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT));
             } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
               // We are asking for this node's id with respect to its parent
               // this is a local id call
               FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext()),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext())),
+                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                                   false);
               OpenCLFunction = cast<Function>
-                (M.getOrInsertFunction(StringRef("get_local_id"), FT));
+                (KernelM.getOrInsertFunction(StringRef("get_local_id"), FT));
             } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
               // We are asking for this node's parent's id with respect to its
               // parent: this is a group id call
               FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext()),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext())),
+                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                                   false);
               OpenCLFunction = cast<Function>
-                (M.getOrInsertFunction(StringRef("get_group_id"), FT));
+                (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT));
             } else {
               assert(false && "Unable to translate this intrinsic");
             }
@@ -723,7 +763,7 @@ namespace {
 
             // Argument of the function to be called
             ConstantInt * DimConstant =
-              ConstantInt::get(Type::getInt32Ty(getGlobalContext()), dim);
+              ConstantInt::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), dim);
             ArrayRef<Value *> Args(DimConstant);
 
             // The following is to find which function to call
@@ -736,29 +776,29 @@ namespace {
             // replicated. This indicates that the parent node is the kernel
             // launch, so the instances are global_size (gridDim x blockDim)
               FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext()),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext())),
+                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                                   false);
               OpenCLFunction = cast<Function>
-                (M.getOrInsertFunction(StringRef("get_global_size"), FT));
+                (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT));
             } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) {
               // We are asking for this node's instances
               // this is a local size (block dim) call
               FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext()),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext())),
+                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                                   false);
               OpenCLFunction = cast<Function>
-                (M.getOrInsertFunction(StringRef("get_local_size"), FT));
+                (KernelM.getOrInsertFunction(StringRef("get_local_size"), FT));
             } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
               // We are asking for this node's parent's instances
               // this is a (global_size/local_size) (grid dim) call
               FunctionType* FT =
-                FunctionType::get(Type::getInt32Ty(getGlobalContext()),
-                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext())),
+                FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/),
+                                  std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                                   false);
               OpenCLFunction = cast<Function>
-                (M.getOrInsertFunction(StringRef("get_num_groups"), FT));
+                (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT));
             } else {
               assert(false && "Unable to translate this intrinsic");
             }
@@ -815,6 +855,26 @@ namespace {
     return true;
   }
 
+  // Returns the kernels file name: M's module identifier + "_kernels.ll".
+  std::string CodeGenTraversal::getKernelsModuleName(Module &M) {
+    return M.getModuleIdentifier() + "_kernels.ll";
+  }
+
+  // Prints KernelM as textual IR to the file named by getKernelsModuleName(M),
+  // reports a failure on errs(), then disposes KernelM: do not use it afterwards.
+  void CodeGenTraversal::writeKernelsModule() {
+    //TODO: Add metadata
+    char* ErrorMessage = NULL;
+    LLVMModuleRef KernelMRef = wrap(&KernelM);
+    const std::string FileName = getKernelsModuleName(M);
+    if (LLVMPrintModuleToFile(KernelMRef, FileName.c_str(), &ErrorMessage))
+      errs() << "Could not write kernels module to " << FileName << ": "
+             << (ErrorMessage ? ErrorMessage : "unknown error") << "\n";
+    if (ErrorMessage)
+      LLVMDisposeMessage(ErrorMessage);
+    LLVMDisposeModule(KernelMRef);
+  }
+
   void CodeGenTraversal::transformFunctionToVoid(Function* F) {
 
     // FIXME: Maybe do that using the Node?