diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 4fc3934246fedde551eaa09e47493b1e0ebf6084..2961a966afb63e48fa80e241e853d31264abe6b2 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -25,6 +25,7 @@ #include "llvm/IRReader/IRReader.h" #include "llvm/Linker.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" #include <sstream> @@ -36,11 +37,13 @@ using namespace builddfg; namespace { // Helper function declarations - void changeDataLayout(Module &); - void changeTargetTriple(Module &); - std::string printType(Type*); - std::string convertInt(int); - void findReturnInst(Function *, std::vector<ReturnInst *> &); + static std::string getPTXFilename(const Module&); + static std::string getFilenameFromModule(const Module& M); + static void changeDataLayout(Module &); + static void changeTargetTriple(Module &); + static std::string printType(Type*); + static std::string convertInt(int); + static void findReturnInst(Function *, std::vector<ReturnInst *> &); // DFG2LLVM_NVPTX - The first implementation. struct DFG2LLVM_NVPTX : public ModulePass { @@ -64,6 +67,26 @@ namespace { }; // Visitor for Code generation traversal (tree traversal for now) + class Kernel { + public: + Kernel(Function* _KF, unsigned _gridDim = 0, std::vector<Value*> + _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0, + std::vector<Value*> _localWGSize = std::vector<Value*>()) : KernelFunction(_KF), + gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), + localWGSize(_localWGSize) { + assert(gridDim == globalWGSize.size() + && "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() + && "blockDim should be same as the size of vector localWGSize"); + } + + Function* KernelFunction; + unsigned gridDim; + unsigned blockDim; + std::vector<Value*> globalWGSize; + std::vector<Value*> localWGSize; + }; + class CodeGenTraversal : public DFNodeVisitor { private: @@ -71,14 +94,8 @@ namespace { Module &M; Module &KernelM; BuildDFG &DFG; - DFNode * KernelLaunchNode; - struct { Function * KF; - unsigned gridDim; - unsigned blockDim; - std::vector<Value*> localWGSize; - std::vector<Value*> globalWGSize; - } kernel; - + DFNode* KernelLaunchNode; + Kernel* kernel; // Map from Old function associated with DFNode to new cloned function with // extra index and dimension arguments. This map also serves to find out if // we already have an index and dim extended function copy or not (i.e., @@ -110,7 +127,7 @@ namespace { Argument* getArgumentAt(Function* F, unsigned offset); Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore); - void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& Kernel); + void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName); void codeGen(DFInternalNode* N); void codeGen(DFLeafNode* N); @@ -326,11 +343,14 @@ namespace { // used to generate a function to associate with this leaf node. The function // is responsible for all the memory allocation/transfer and invoking the // kernel call on the device - void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& KernelName) { + void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) { // Check if clone already exists. If it does, it means we have visited this - // function before and nothing else needs to be done for this leaf node. + // function before. assert(N->getGenFunc() == NULL && "Code already generated for this node"); + // If kernel struct has not been initialized with kernel function, then fail + assert(kernel != NULL && "No kernel found!!"); + DEBUG(errs() << "Generating kernel call code\n"); Function* F = N->getFuncPointer(); @@ -401,10 +421,12 @@ namespace { DEBUG(errs() << "Initializing commandQ" << "\n"); // Initialize command queue - Value* file = getStringPointer(FileName, RI, "Filename"); - Value* kernel = getStringPointer(KernelName, RI,"KernelName"); + Value* fileStr = getStringPointer(FileName, RI, "Filename"); + errs() << *fileStr << "\n"; + errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n"; + Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName"); - Value* LaunchInstArgs[] = {file, kernel}; + Value* LaunchInstArgs[] = {fileStr, kernelStr}; DEBUG(errs() << "Inserting launch call" << "\n"); CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch, @@ -489,15 +511,46 @@ namespace { // Need work dim, localworksize, globalworksize // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work // size and global work size + // Allocate size_t[numDims] space on stack. Store the work group sizes and + // pass it as an argument to ExecNode + Type* Int64Ty = Type::getInt64Ty(M.getContext()); + Type* GlobalWGTy = ArrayType::get(Int64Ty, kernel->gridDim); + AllocaInst* GlobalWG = new AllocaInst(GlobalWGTy, "GlobalWGSize", RI); + Value* GlobalWGPtr = BitCastInst::CreatePointerCast(GlobalWG, Int64Ty->getPointerTo(), GlobalWG->getName()+".0", RI); + Value* nextDim = GlobalWGPtr; + errs() << *GlobalWGPtr << "\n"; + Constant* IntOne = ConstantInt::get(Int64Ty, 1); + errs() << *IntOne << "\n"; + for(unsigned i=0; i < kernel->gridDim; i++) { + errs() << *kernel->globalWGSize[i]->getType() << "\n"; + errs() << *nextDim->getType() << "\n"; + assert(kernel->globalWGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + if(kernel->globalWGSize[i]->getType() != Int64Ty) { + kernel->globalWGSize[i] = BitCastInst::CreateIntegerCast(VMap[kernel->globalWGSize[i]], Int64Ty, true, "", RI); + StoreInst* SI = new StoreInst(kernel->globalWGSize[i], nextDim, RI); + errs() << *SI << "\n"; + } else { + StoreInst* SI = new StoreInst(VMap[kernel->globalWGSize[i]], nextDim, RI); + errs() << *SI << "\n"; + } + if(i+1 < kernel->gridDim) { + GetElementPtrInst* GEP = GetElementPtrInst::Create(nextDim, ArrayRef<Value*>(IntOne), GlobalWG->getName()+"."+Twine(i+1), RI); + errs() << *GEP << "\n"; + nextDim = GEP; + } + } + errs() << *llvm_visc_ptx_executeNode << "\n"; + errs() << *GlobalWGPtr << "\n"; Value* ExecNodeArgs[] = {GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), C->getNumOfDim()), Constant::getNullValue(Type::getInt64PtrTy(M.getContext())), - Constant::getNullValue(Type::getInt64PtrTy(M.getContext())) + GlobalWGPtr }; CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode, ArrayRef<Value*>(ExecNodeArgs, 4), "event."+CF->getName(), RI); + errs() << *Event << "\n"; // Wait for Kernel to Finish CallInst::Create(llvm_visc_ptx_wait, ArrayRef<Value*>(GraphID), @@ -615,9 +668,9 @@ namespace { // Now the remaining nodes to be visited should be ignored KernelLaunchNode = NULL; - writeKernelsModule(); errs() << "Insert Runtime calls\n"; - insertRuntimeCalls(N, getKernelsModuleName(M), "matrixMul"); + insertRuntimeCalls(N, getPTXFilename(M)); + writeKernelsModule(); } else { DEBUG(errs() << "Found intermediate node. Getting size parameters.\n"); @@ -643,31 +696,36 @@ namespace { if (!pLevel || !pReplFactor) { KernelLaunchNode = PNode; + kernel = new Kernel(NULL, N->getNumOfDim(), N->getDimLimits()); // TODO: Find a better way of choosing parameters - kernel.gridDim = N->getNumOfDim(); - kernel.blockDim = N->getNumOfDim(); - kernel.globalWGSize = N->getDimLimits(); - IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext()); + //kernel->gridDim = N->getNumOfDim(); + //kernel->blockDim = N->getNumOfDim(); + //kernel->globalWGSize = N->getDimLimits(); + //kernel->localWGSize = N->getDimLimits(); + //FIXME: Comment this out as we can provide localWGSize as null + //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext()); // TODO: How to choose the div factor; - ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16); - std::vector<Value*> tmp(kernel.gridDim, divFactor); - for (unsigned i = 0; i < kernel.gridDim; i++) { - BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel.globalWGSize[i],tmp[i]); - kernel.localWGSize.push_back(SDivInst); - } + //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16); + //std::vector<Value*> tmp(kernel->gridDim, divFactor); + //for (unsigned i = 0; i < kernel->gridDim; i++) { + // BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]); + // kernel->localWGSize.push_back(SDivInst); + //} } else { + errs() << "*************** Entering else part **************\n"; + /* KernelLaunchNode = PNode->getParent(); - kernel.gridDim = PNode->getNumOfDim(); - kernel.blockDim = N->getNumOfDim(); + kernel->gridDim = PNode->getNumOfDim(); + kernel->blockDim = N->getNumOfDim(); // TODO: Handle different number of dimensions - assert((kernel.gridDim == kernel.blockDim) && "Dimension number must match"); + assert((kernel->gridDim == kernel->blockDim) && "Dimension number must match"); std::vector<Value*> numOfBlocks = PNode->getDimLimits(); - kernel.localWGSize = N->getDimLimits(); - for (unsigned i = 0; i < kernel.gridDim; i++) { - BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel.localWGSize[i],numOfBlocks[i]); - kernel.globalWGSize.push_back(MulInst); - } + kernel->localWGSize = N->getDimLimits(); + for (unsigned i = 0; i < kernel->gridDim; i++) { + //BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel->localWGSize[i],numOfBlocks[i]); + //kernel->globalWGSize.push_back(MulInst); + }*/ } std::vector<IntrinsicInst *> IItoRemove; @@ -922,7 +980,8 @@ namespace { (*ri)->eraseFromParent(); addCLMetadata(F_nvptx); - kernel.KF = F_nvptx; + kernel->KernelFunction = F_nvptx; + errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; DEBUG(errs() << KernelM); return; @@ -945,13 +1004,19 @@ namespace { // Initiate code generation for root DFNode CGTVisitor->visit(Root); //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; return true; } std::string CodeGenTraversal::getKernelsModuleName(Module &M) { + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ std::string mid = M.getModuleIdentifier(); - return mid.append("_kernels.ll"); + return mid.append(".kernels.ll"); } void CodeGenTraversal::fixValueAddrspace(Value* V, unsigned addrspace) { @@ -1174,9 +1239,22 @@ namespace { * Helper functions * ******************************************************************************/ + // Get generated PTX binary name + static std::string getPTXFilename(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".nvptx.s"); + return moduleID; + } + + // Get the name of the input file from module ID + static std::string getFilenameFromModule(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/")+1); + } + // Changes the data layout of the Module to be compiled with NVPTX backend // TODO: Figure out when to call it, probably after duplicating the modules - void changeDataLayout(Module &M) { + static void changeDataLayout(Module &M) { std::string nvptx32_layoutStr = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; std::string nvptx64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; @@ -1189,7 +1267,7 @@ namespace { return; } - void changeTargetTriple(Module &M) { + static void changeTargetTriple(Module &M) { std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; @@ -1203,7 +1281,7 @@ namespace { } // Helper function, generate a string representation of a type - std::string printType(Type* ty) { + static std::string printType(Type* ty) { std::string type_str; raw_string_ostream rso(type_str); ty->print(rso); @@ -1211,14 +1289,14 @@ namespace { } // Helper function, convert int to string - std::string convertInt(int number) { + static std::string convertInt(int number) { std::stringstream ss;//create a stringstream ss << number;//add number to the stream return ss.str();//return a string with the contents of the stream } // Helper function, populate a vector with all return statements in a function - void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { + static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { Instruction *I = &(*i); ReturnInst* RI = dyn_cast<ReturnInst>(I);