From a098504f823bc45a613b0620609066b445e5cb40 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava <psrivas2@illinois.edu> Date: Mon, 17 Nov 2014 20:01:54 +0000 Subject: [PATCH] Refactored code to make insert runtime call function more readable --- .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp | 2375 +++++++++-------- 1 file changed, 1207 insertions(+), 1168 deletions(-) diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index fafc892ea7..d7cdfa2e40 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -36,291 +36,637 @@ using namespace builddfg; //STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted"); namespace { +// Helper class declarations + +// Class to maintain the tuple of host pointer, device pointer and size +// in bytes. Would have preferred to use tuple but support not yet available +class OutputPtr { +public: + OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + + Value* h_ptr; + Value* d_ptr; + Value* bytes; +}; + +// Class to maintain important kernel info required for generating runtime +// calls +class Kernel { +public: + Kernel(Function* _KF, unsigned _gridDim = 0, std::vector<Value*> + _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0, + std::vector<Value*> _localWGSize = std::vector<Value*>()) + : KernelFunction(_KF), + gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), + localWGSize(_localWGSize) { + + assert(gridDim == globalWGSize.size() + && "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() + && "blockDim should be same as the size of vector localWGSize"); + } + + Function* KernelFunction; + unsigned gridDim; + unsigned blockDim; + std::vector<Value*> globalWGSize; + std::vector<Value*> localWGSize; +}; + // Helper function declarations - static bool 
hasAttribute(Function*, unsigned, Attribute::AttrKind); - static std::string getPTXFilename(const Module&); - static std::string getFilenameFromModule(const Module& M); - static void changeDataLayout(Module &); - static void changeTargetTriple(Module &); - static std::string printType(Type*); - static std::string convertInt(int); - static void findReturnInst(Function *, std::vector<ReturnInst *> &); - - // DFG2LLVM_NVPTX - The first implementation. - struct DFG2LLVM_NVPTX : public ModulePass { - static char ID; // Pass identification, replacement for typeid - DFG2LLVM_NVPTX() : ModulePass(ID) {} - - private: - // Member variables - - // Functions - - public: - bool runOnModule(Module &M); - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<BuildDFG>(); - AU.addPreserved<BuildDFG>(); - } +static void getExecuteNodeParams(Value* &, Value* &, Value* &, Kernel*, + ValueToValueMapTy&, Instruction*); +static bool hasAttribute(Function*, unsigned, Attribute::AttrKind); +static std::string getPTXFilename(const Module&); +static std::string getFilenameFromModule(const Module& M); +static void changeDataLayout(Module &); +static void changeTargetTriple(Module &); +static std::string printType(Type*); +static std::string convertInt(int); +static void findReturnInst(Function *, std::vector<ReturnInst *> &); + +// DFG2LLVM_NVPTX - The first implementation. +struct DFG2LLVM_NVPTX : public ModulePass { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_NVPTX() : ModulePass(ID) {} + +private: + // Member variables + + // Functions + +public: + bool runOnModule(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); + } - }; - - // Helper class to maintain the tuple of host pointer, device pointer and size - // in bytes. 
Would have preferred to use tuple but support not yet available - class OutputPtr { - public: - OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) - : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} - - Value* h_ptr; - Value* d_ptr; - Value* bytes; - }; - - // Visitor for Code generation traversal (tree traversal for now) - class Kernel { - public: - Kernel(Function* _KF, unsigned _gridDim = 0, std::vector<Value*> - _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0, - std::vector<Value*> _localWGSize = std::vector<Value*>()) : KernelFunction(_KF), - gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), - localWGSize(_localWGSize) { - assert(gridDim == globalWGSize.size() - && "gridDim should be same as the size of vector globalWGSize"); - assert(blockDim == localWGSize.size() - && "blockDim should be same as the size of vector localWGSize"); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CodeGenTraversal : public DFNodeVisitor { + +private: + //Member variables + Module &M; + Module &KernelM; + BuildDFG &DFG; + DFNode* KernelLaunchNode; + Kernel* kernel; + // Map from Old function associated with DFNode to new cloned function with + // extra index and dimension arguments. 
This map also serves to find out if + // we already have an index and dim extended function copy or not (i.e., + // "Have we visited this function before?") + DenseMap<DFNode*, Value*> OutputMap; + + // VISC Runtime API + Module* runtimeModule; + Function* llvm_visc_ptx_launch; + Function* llvm_visc_ptx_wait; + Function* llvm_visc_ptx_initContext; + Function* llvm_visc_ptx_argument_scalar; + Function* llvm_visc_ptx_argument_ptr; + Function* llvm_visc_ptx_getOutput; + Function* llvm_visc_ptx_executeNode; + + + //Functions + std::string getKernelsModuleName(Module &M); + void fixValueAddrspace(Value* V, unsigned addrspace); + Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = ""); + void changeArgAddrspace(Function* F, unsigned i); + void addCLMetadata(Function* F); + void writeKernelsModule(); + void transformFunctionToVoid(Function* F); + void initRuntimeAPI(); + void addIdxDimArgs(Function* F); + Argument* getArgumentAt(Function* F, unsigned offset); + Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, + Instruction* InsertBefore); + void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName); + + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + + // Constructor + CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG), KernelM(*CloneModule(&_M)) { + // Initialize Runtime API + initRuntimeAPI(); + + // Copying instead of creating new, in order to preserve required info (metadata) + + // Remove functions, global variables and aliases + std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>(); + for (Module::global_iterator mi = KernelM.global_begin(), + me = KernelM.global_end(); (mi != me); ++mi) { + GlobalVariable* gv = &*mi; + gvv.push_back(gv); + } + for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); } - Function* KernelFunction; - 
unsigned gridDim; - unsigned blockDim; - std::vector<Value*> globalWGSize; - std::vector<Value*> localWGSize; - }; - - class CodeGenTraversal : public DFNodeVisitor { - - private: - //Member variables - Module &M; - Module &KernelM; - BuildDFG &DFG; - DFNode* KernelLaunchNode; - Kernel* kernel; - // Map from Old function associated with DFNode to new cloned function with - // extra index and dimension arguments. This map also serves to find out if - // we already have an index and dim extended function copy or not (i.e., - // "Have we visited this function before?") - DenseMap<DFNode*, Value*> OutputMap; - - // VISC Runtime API - Module* runtimeModule; - Function* llvm_visc_ptx_launch; - Function* llvm_visc_ptx_wait; - Function* llvm_visc_ptx_initContext; - Function* llvm_visc_ptx_argument_scalar; - Function* llvm_visc_ptx_argument_ptr; - Function* llvm_visc_ptx_getOutput; - Function* llvm_visc_ptx_executeNode; - - - //Functions - std::string getKernelsModuleName(Module &M); - void fixValueAddrspace(Value* V, unsigned addrspace); - Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = ""); - void changeArgAddrspace(Function* F, unsigned i); - void addCLMetadata(Function* F); - void writeKernelsModule(); - void transformFunctionToVoid(Function* F); - void initRuntimeAPI(); - void addIdxDimArgs(Function* F); - Argument* getArgumentAt(Function* F, unsigned offset); - Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, - Instruction* InsertBefore); - void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName); - - void codeGen(DFInternalNode* N); - void codeGen(DFLeafNode* N); - - public: - - // Constructor - CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG), KernelM(*CloneModule(&_M)) { - // Initialize Runtime API - initRuntimeAPI(); - - // Copying instead of creating new, in order to preserve required info (metadata) - - // Remove functions, global variables and aliases - 
std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>(); - for (Module::global_iterator mi = KernelM.global_begin(), - me = KernelM.global_end(); (mi != me); ++mi) { - GlobalVariable* gv = &*mi; - gvv.push_back(gv); - } - for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) { - (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); - (*vi)->eraseFromParent(); - } + std::vector<Function*> fv = std::vector<Function*>(); + for (Module::iterator mi = KernelM.begin(), + me = KernelM.end(); (mi != me); ++mi) { + Function* f = &*mi; + fv.push_back(f); + } + for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } - std::vector<Function*> fv = std::vector<Function*>(); - for (Module::iterator mi = KernelM.begin(), - me = KernelM.end(); (mi != me); ++mi) { - Function* f = &*mi; - fv.push_back(f); - } - for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) { - (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); - (*vi)->eraseFromParent(); - } + std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>(); + for (Module::alias_iterator mi = KernelM.alias_begin(), + me = KernelM.alias_end(); (mi != me); ++mi) { + GlobalAlias* a = &*mi; + av.push_back(a); + } + for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } - std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>(); - for (Module::alias_iterator mi = KernelM.alias_begin(), - me = KernelM.alias_end(); (mi != me); ++mi) { - GlobalAlias* a = &*mi; - av.push_back(a); - } - for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) { - (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); - (*vi)->eraseFromParent(); - } + changeDataLayout(KernelM); + changeTargetTriple(KernelM); 
- changeDataLayout(KernelM); - changeTargetTriple(KernelM); + DEBUG(errs() << KernelM); - DEBUG(errs() << KernelM); + } + virtual void visit(DFInternalNode* N) { + for(DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); i != e; ++i) { + DFNode* child = *i; + child->applyDFNodeVisitor(*this); } - virtual void visit(DFInternalNode* N) { - for(DFGraph::children_iterator i = N->getChildGraph()->begin(), - e = N->getChildGraph()->end(); i != e; ++i) { - DFNode* child = *i; - child->applyDFNodeVisitor(*this); - } - - DEBUG(errs() << "Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"); - codeGen(N); - DEBUG(errs() << "DONE" << "\n"); + DEBUG(errs() << "Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"); + codeGen(N); + DEBUG(errs() << "DONE" << "\n"); - } + } - virtual void visit(DFLeafNode* N) { - DEBUG(errs() << "Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"); - codeGen(N); - DEBUG(errs() << "DONE" << "\n"); - } + virtual void visit(DFLeafNode* N) { + DEBUG(errs() << "Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"); + codeGen(N); + DEBUG(errs() << "DONE" << "\n"); + } - }; +}; + +// Initialize the VISC runtime API. 
This makes it easier to insert these calls +void CodeGenTraversal::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext()); + if(runtimeModule == NULL) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded visc-rt API module\n"); + + // Get or insert the global declarations for launch/wait functions + llvm_visc_ptx_launch = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_launch", + runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_launch); + + llvm_visc_ptx_wait = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_wait", + runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_wait); + + llvm_visc_ptx_initContext = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_initContext" , + runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_initContext); + + llvm_visc_ptx_argument_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_scalar", + runtimeModule->getFunction("llvm_visc_ptx_argument_scalar")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_argument_scalar); + + llvm_visc_ptx_argument_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_ptr", + runtimeModule->getFunction("llvm_visc_ptx_argument_ptr")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_argument_ptr); + + llvm_visc_ptx_getOutput = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_getOutput", + runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_getOutput); + + llvm_visc_ptx_executeNode = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_executeNode", + runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_executeNode); + +} 
+void CodeGenTraversal::addIdxDimArgs(Function* F) { + // Add Index and Dim arguments + std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"}; + for (int i = 0; i < 6; ++i) { + new Argument(Type::getInt32Ty(F->getContext()), names[i], F); + } - // Initialize the VISC runtime API. This makes it easier to insert these calls - void CodeGenTraversal::initRuntimeAPI() { + // Create the argument type list with added argument types + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); + PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace()); + + // Change the function type + F->mutateType(PTy); +} + +/* Traverse the function F argument list to get argument at offset*/ +Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) { + assert((F->getFunctionType()->getNumParams() > offset && offset >= 0) + && "Invalid offset to access arguments!"); + + Argument* arg; + Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); + for(; offset != 0 && i!=e; i++) { + offset--; + } + arg = i; + DEBUG(errs() << *arg <<"\n"); + return arg; +} + + +Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, + Instruction* InsertBefore) { + // TODO: Assumption is that each input port of a node has just one + // incoming edge. May change later on. 
+ + // Find the incoming edge at the requested input port + DFEdge* E = Child->getInDFEdgeAt(i); + assert(E && "No incoming edge or binding for input element!"); + // Find the Source DFNode associated with the incoming edge + DFNode* SrcDF = E->getSourceDF(); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition()); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a sibling + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); + + // Find CallInst associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "", InsertBefore); + inputVal = EI; + } + return inputVal; +} + +// Generate Code for declaring a constant string [L x i8] and return a pointer +// to the start of it. +Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { + Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true); + Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true, + GlobalValue::InternalLinkage, SConstant, Name); + Value* Zero = ConstantInt::get(Type::getInt64Ty(getGlobalContext()), 0); + Value* GEPArgs[] = {Zero, Zero}; + GetElementPtrInst* SPtr = GetElementPtrInst::Create(SGlobal, + ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); + return SPtr; +} + +// Generate Code to call the kernel +// The plan is to replace the internal node with a leaf node. 
This method is +// used to generate a function to associate with this leaf node. The function +// is responsible for all the memory allocation/transfer and invoking the +// kernel call on the device +void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) { + // Check if clone already exists. If it does, it means we have visited this + // function before. + assert(N->getGenFunc() == NULL && "Code already generated for this node"); + + // Useful values + Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + + // If kernel struct has not been initialized with kernel function, then fail + assert(kernel != NULL && "No kernel found!!"); + + DEBUG(errs() << "Generating kernel call code\n"); + + Function* F = N->getFuncPointer(); + + + // Create of clone of F with no instructions. Only the type is the same as F + // without the extra arguments. + Function* F_X86; + + // Clone the function, if we are seeing this function for the first time. We + // only need a clone in terms of type. + ValueToValueMapTy VMap; + + // Create new function with the same type + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + + // Loop over the arguments, copying the names of arguments over. + Function::arg_iterator dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); // Copy the name over... 
+ // Add mapping to VMap and increment dest iterator + VMap[i] = dest_iterator++; + } - // Load Runtime API Module - SMDiagnostic Err; - runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext()); - if(runtimeModule == NULL) - DEBUG(errs() << Err.getMessage()); - else - DEBUG(errs() << "Successfully loaded visc-rt API module\n"); + // Add a basic block to this empty function + BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86); + ReturnInst* RI = ReturnInst::Create(M.getContext(), + UndefValue::get(F_X86->getReturnType()), BB); - // Get or insert the global declarations for launch/wait functions - llvm_visc_ptx_launch = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_launch", - runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_launch); + //Add the generated function info to DFNode + N->setGenFunc(F_X86, DFNode::X86); - llvm_visc_ptx_wait = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_wait", - runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_wait); + // FIXME: Adding Index and Dim arguments are probably not required except + // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do + // have those arguments) - llvm_visc_ptx_initContext = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_initContext" , - runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_initContext); + // Add Index and Dim arguments except for the root node + if(!N->isRoot()) + addIdxDimArgs(F_X86); - llvm_visc_ptx_argument_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_scalar", - runtimeModule->getFunction("llvm_visc_ptx_argument_scalar")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_argument_scalar); - - llvm_visc_ptx_argument_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_ptr", - 
runtimeModule->getFunction("llvm_visc_ptx_argument_ptr")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_argument_ptr); + // Sort children in topological order before code generation for kernel call + N->getChildGraph()->sortChildren(); - llvm_visc_ptx_getOutput = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_getOutput", - runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_getOutput); + // The DFNode N has the property that it has only one child (leaving Entry + // and Exit dummy nodes). This child is the PTX kernel. This simplifies code + // generation for kernel calls significantly. All the inputs to this child + // node would either be constants or from the parent node N. - llvm_visc_ptx_executeNode = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_executeNode", - runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_executeNode); + assert(N->getChildGraph()->size() == 3 + && "Node expected to have just one non-dummy node!"); + DFNode* C; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + C = *ci; + // Skip dummy node call + if (!C->isDummyNode()) + break; } - void CodeGenTraversal::addIdxDimArgs(Function* F) { - // Add Index and Dim arguments - std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"}; - for (int i = 0; i < 6; ++i) { - new Argument(Type::getInt32Ty(F->getContext()), names[i], F); - } - // Create the argument type list with added argument types - std::vector<Type*> ArgTypes; - for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - // Adding new arguments to the function argument list, would not change the - // function type. 
We need to change the type of this function to reflect the - // added arguments - FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); - PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace()); - - // Change the function type - F->mutateType(PTy); - } + assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); + + Function* CF = C->getFuncPointer(); + // Initialize context + DEBUG(errs() << "Initializing context" << "\n"); + CallInst::Create(llvm_visc_ptx_initContext, None, "", RI); - /* Traverse the function F argument list to get argument at offset*/ - Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) { - assert((F->getFunctionType()->getNumParams() > offset && offset >= 0) - && "Invalid offset to access arguments!"); + DEBUG(errs() << "Initializing commandQ" << "\n"); + // Initialize command queue + Value* fileStr = getStringPointer(FileName, RI, "Filename"); + errs() << *fileStr << "\n"; + errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n"; + Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName"); - Argument* arg; - Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); - for(; offset != 0 && i!=e; i++) { - offset--; + Value* LaunchInstArgs[] = {fileStr, kernelStr}; + + DEBUG(errs() << "Inserting launch call" << "\n"); + CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch, + ArrayRef<Value*>(LaunchInstArgs, 2), + "graph"+CF->getName(), + RI); + DEBUG(errs() << *GraphID << "\n"); + // Iterate over the required input edges of the node and use the visc-rt API + // to set inputs + DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); + + std::vector<OutputPtr> OutputPointers; + for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { + + Value* inputVal = getInValueAt(C, i, F_X86, RI); + // input value has been obtained. 
+ // Check if input is a scalar value or a pointer operand + // For scalar values such as int, float, etc. the size is simply the size of + // type on target machine, but for pointers, the size of data would be the + // next integer argument + if(inputVal->getType()->isPointerTy()) { + // CheckAttribute + Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False; + Value* isInput = ((hasAttribute(CF, i, Attribute::Out)) + && !(hasAttribute(CF, i, Attribute::In)))? False : True; + + Argument* A = getArgumentAt(CF, i); + if(isOutput == True) { + errs() << *A << " is an OUTPUT argument\n"; + } + if(isInput == True) { + errs() << *A << " is an INPUT argument\n"; + } + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + // Pointer Input + Value* inputSize = getInValueAt(C, i+1, F_X86, RI); + assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) + && "Pointer type input must always be followed by size (integer type)"); + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + inputSize, + isInput, + isOutput + }; + Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr, + ArrayRef<Value*>(setInputArgs, 6), "", RI); + // If this has out attribute, store the returned device pointer in + // memory to read device memory later + if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + } + else { // Scalar Input + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI); + StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + 
ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + ConstantExpr::getSizeOf(inputVal->getType()) + }; + CallInst::Create(llvm_visc_ptx_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); } - arg = i; - DEBUG(errs() << *arg <<"\n"); - return arg; - } + } + DEBUG(errs() << "Setup output edges of node and insert visc api\n"); + // Setup output + // FIXME: Note - There is a tricky question. In X86 we do not need to care + // about pointer inputs which modify data in memory implicitly (without + // showing it as output). There is no extra cost needed to handle such inputs + // For PTX, we need to read back such data from device memory to host memory. + // The cost is huge and hence we need to differentiate between readonly + // pointer inputs vs read/write pointer inputs. Currently supporting only a + // simple model in which all input edges are readonly and output is + // writeonly. + + // Set output + StructType* OutputTy = C->getOutputType(); + unsigned outputIndex = CF->getFunctionType()->getNumParams(); + Value* outputSize = ConstantExpr::getSizeOf(OutputTy); + Value* setOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), + ConstantExpr::getSizeOf(OutputTy), + False, + True + }; + + CallInst* d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr, + ArrayRef<Value*>(setOutputArgs, 6), + "d_output."+CF->getName(), + RI); - Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, - Instruction* InsertBefore) { - // TODO: Assumption is that each input port of a node has just one - // incoming edge. May change later on. + // Enqueue kernel + // Need work dim, localworksize, globalworksize + // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work + // size and global work size + // Allocate size_t[numDims] space on stack. 
Store the work group sizes and + // pass it as an argument to ExecNode + + Value *workDim, *LocalWGPtr, *GlobalWGPtr; + getExecuteNodeParams(workDim, LocalWGPtr, GlobalWGPtr, kernel, VMap, RI); + Value* ExecNodeArgs[] = {GraphID, + workDim, + LocalWGPtr, + GlobalWGPtr + }; + CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode, + ArrayRef<Value*>(ExecNodeArgs, 4), + "event."+CF->getName(), + RI); + DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); + // Wait for Kernel to Finish + CallInst::Create(llvm_visc_ptx_wait, + ArrayRef<Value*>(GraphID), + "", + RI); + // Read Output Struct + Value* GetOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + d_Output, + outputSize + }; + CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "h_output."+CF->getName()+".addr", + RI); + // Read each device pointer listed in output struct + // Load the output struct + CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI); + Value* KernelOutput = new LoadInst(BI, "", RI); + + // Read all the pointer arguments which had side effects i.e., had out + // attribute + for(auto output: OutputPointers) { + errs() << "Read: " << *output.d_ptr << "\n"; + errs() << "\t To: " << *output.h_ptr << "\n"; + errs() << "\t #bytes: " << *output.bytes << "\n"; + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; + CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "", RI); + } + /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + Type* elemTy = OutputTy->getElementType(i); + if(elemTy->isPointerTy()) { + // Pointer type + assert(OutputTy->getElementType(i+1) == Type::getInt64Ty(M.getContext()) + && "Every Pointer type must be followed by an integer"); + ExtractValueInst* d_ptr = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i), "", RI); + // Change d_ptr 
to i8* + CastInst* d_ptr_i8 = BitCastInst::CreatePointerCast(d_ptr, Type::getInt8PtrTy(M.getContext()), "", RI); + ExtractValueInst* len = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i+1), "", RI); + // GetOutputPtr call + Value* GetOutputArgs[] = {GraphID, + d_ptr_i8, + len}; + CallInst* h_ptr_i8 = CallInst::Create(llvm_visc_ptx_getOutput, + ArrayRef<Value*>(GetOutputArgs, 3), + "", + RI); + // Change h_ptr to correct type + CastInst* h_ptr = CastInst::CreatePointerCast(h_ptr_i8, + cast<StructType>(KernelOutput->getType())->getElementType(i), + "", + RI); + KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI); + } + }*/ + + // Prepare output + KernelOutput->setName("output."+CF->getName()); + OutputMap[C] = KernelOutput; + + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); + // Generate code for output bindings + // Get Exit node + C = N->getChildGraph()->getExit(); + // Get OutputType of this node + StructType* OutTy = N->getOutputType(); + Value *retVal = UndefValue::get(F_X86->getReturnType()); + // Find all the input edges to exit node + for (unsigned i=0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port - DFEdge* E = Child->getInDFEdgeAt(i); - assert(E && "No incoming edge or binding for input element!"); + DFEdge* E = C->getInDFEdgeAt(i); + + assert(E && "No Binding for output element!"); // Find the Source DFNode associated with the incoming edge DFNode* SrcDF = E->getSourceDF(); + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + // If Source DFNode is a dummyNode, edge is from parent. 
Get the // argument from argument list of this internal node Value* inputVal; if(SrcDF->isEntryNode()) { - inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition()); + inputVal = getArgumentAt(F_X86, i); DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); } else { - // edge is from a sibling + // edge is from a internal node // Check - code should already be generated for this source dfnode assert(OutputMap.count(SrcDF) && "Source node call not found. Dependency violation!"); - // Find CallInst associated with the Source DFNode using OutputMap + // Find Output Value associated with the Source DFNode using OutputMap Value* CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction @@ -328,1037 +674,730 @@ namespace { IndexList.push_back(E->getSourcePosition()); DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "", InsertBefore); + "",RI); inputVal = EI; } - return inputVal; - } - - // Generate Code for declaring a constant string [L x i8] and return a pointer - // to the start of it. - Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { - Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true); - Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true, - GlobalValue::InternalLinkage, SConstant, Name); - Value* Zero = ConstantInt::get(Type::getInt64Ty(getGlobalContext()), 0); - Value* GEPArgs[] = {Zero, Zero}; - GetElementPtrInst* SPtr = GetElementPtrInst::Create(SGlobal, - ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); - return SPtr; + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); } - - // Generate Code to call the kernel - // The plan is to replace the internal node with a leaf node. This method is - // used to generate a function to associate with this leaf node. 
The function - // is responsible for all the memory allocation/transfer and invoking the - // kernel call on the device - void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) { - // Check if clone already exists. If it does, it means we have visited this - // function before. - assert(N->getGenFunc() == NULL && "Code already generated for this node"); - - // Useful values - Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); - Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); - - // If kernel struct has not been initialized with kernel function, then fail - assert(kernel != NULL && "No kernel found!!"); - - DEBUG(errs() << "Generating kernel call code\n"); - - Function* F = N->getFuncPointer(); + DEBUG(errs() << "Extracted all\n"); + retVal->setName("output"); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); +} - // Create of clone of F with no instructions. Only the type is the same as F - // without the extra arguments. - Function* F_X86; +// Right now, only targeting the one level case. In general, device functions +// can return values so we don't need to change them +void CodeGenTraversal::codeGen(DFInternalNode* N) { - // Clone the function, if we are seeing this function for the first time. We - // only need a clone in terms of type. - ValueToValueMapTy VMap; + if (!KernelLaunchNode) { + DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); + return; + } - // Create new function with the same type - F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + if (N == KernelLaunchNode) { + DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); + //TODO - // Loop over the arguments, copying the names of arguments over. 
- Function::arg_iterator dest_iterator = F_X86->arg_begin(); - for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); - i != e; ++i) { - dest_iterator->setName(i->getName()); // Copy the name over... - // Add mapping to VMap and increment dest iterator - VMap[i] = dest_iterator++; - } + // Now the remaining nodes to be visited should be ignored + KernelLaunchNode = NULL; + errs() << "Insert Runtime calls\n"; + insertRuntimeCalls(N, getPTXFilename(M)); + writeKernelsModule(); - // Add a basic block to this empty function - BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86); - ReturnInst* RI = ReturnInst::Create(M.getContext(), - UndefValue::get(F_X86->getReturnType()), BB); + } else { + DEBUG(errs() << "Found intermediate node. Getting size parameters.\n"); + //TODO : Check that the arguments order of root to intermediate matches + // the intermediate to leaf. - //Add the generated function info to DFNode - N->setGenFunc(F_X86, DFNode::X86); - - // FIXME: Adding Index and Dim arguments are probably not required except - // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do - // have those arguments) + } - // Add Index and Dim arguments except for the root node - if(!N->isRoot()) - addIdxDimArgs(F_X86); +} - // Sort children in topological order before code generation for kernel call - N->getChildGraph()->sortChildren(); +void CodeGenTraversal::codeGen(DFLeafNode* N) { - // The DFNode N has the property that it has only one child (leaving Entry - // and Exit dummy nodes). This child is the PTX kernel. This simplifies code - // generation for kernel calls significantly. All the inputs to this child - // node would either be constants or from the parent node N. 
+ // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } - assert(N->getChildGraph()->size() == 3 - && "Node expected to have just one non-dummy node!"); + // Checking which node is the kernel launch + DFNode* PNode = N->getParent(); + int pLevel = PNode->getLevel(); + int pReplFactor = PNode->getNumOfDim(); + + if (!pLevel || !pReplFactor) { + KernelLaunchNode = PNode; + kernel = new Kernel(NULL, N->getNumOfDim(), N->getDimLimits()); + // TODO: Find a better way of choosing parameters + //kernel->gridDim = N->getNumOfDim(); + //kernel->blockDim = N->getNumOfDim(); + //kernel->globalWGSize = N->getDimLimits(); + //kernel->localWGSize = N->getDimLimits(); + //FIXME: Comment this out as we can provide localWGSize as null + //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext()); + // TODO: How to choose the div factor; + //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16); + //std::vector<Value*> tmp(kernel->gridDim, divFactor); + //for (unsigned i = 0; i < kernel->gridDim; i++) { + // BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]); + // kernel->localWGSize.push_back(SDivInst); + //} + } + else { + errs() << "*************** Entering else part **************\n"; + /* + KernelLaunchNode = PNode->getParent(); + kernel->gridDim = PNode->getNumOfDim(); + kernel->blockDim = N->getNumOfDim(); + // TODO: Handle different number of dimensions + assert((kernel->gridDim == kernel->blockDim) && "Dimension number must match"); + std::vector<Value*> numOfBlocks = PNode->getDimLimits(); + kernel->localWGSize = N->getDimLimits(); + for (unsigned i = 0; i < kernel->gridDim; i++) { + //BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel->localWGSize[i],numOfBlocks[i]); + //kernel->globalWGSize.push_back(MulInst); + }*/ + } - DFNode* C; - for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); ci != 
ce; ++ci) { - C = *ci; - // Skip dummy node call - if (!C->isDummyNode()) - break; - } + std::vector<IntrinsicInst *> IItoRemove; + BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; - assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); - Function* CF = C->getFuncPointer(); - // Initialize context - DEBUG(errs() << "Initializing context" << "\n"); - CallInst::Create(llvm_visc_ptx_initContext, None, "", RI); + // Look up if we have visited this function before. If we have, then just + // get the cloned function pointer from DFNode. Otherwise, create the cloned + // function and add it to the DFNode GenFunc. + Function *F_nvptx = N->getGenFunc(); + if(F_nvptx == NULL) { + // Clone the function + ValueToValueMapTy VMap; + F_nvptx = CloneFunction(F, VMap, true); - DEBUG(errs() << "Initializing commandQ" << "\n"); - // Initialize command queue - Value* fileStr = getStringPointer(FileName, RI, "Filename"); - errs() << *fileStr << "\n"; - errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n"; - Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName"); + // Insert the cloned function into the kernels module + KernelM.getFunctionList().push_back(F_nvptx); - Value* LaunchInstArgs[] = {fileStr, kernelStr}; + DEBUG(errs() << *F_nvptx->getType()); + DEBUG(errs() << *F_nvptx); - DEBUG(errs() << "Inserting launch call" << "\n"); - CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch, - ArrayRef<Value*>(LaunchInstArgs, 2), - "graph"+CF->getName(), - RI); - DEBUG(errs() << *GraphID << "\n"); - // Iterate over the required input edges of the node and use the visc-rt API - // to set inputs - DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); - - std::vector<OutputPtr> OutputPointers; - for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { - - Value* inputVal = 
getInValueAt(C, i, F_X86, RI); - // input value has been obtained. - // Check if input is a scalar value or a pointer operand - // For scalar values such as int, float, etc. the size is simply the size of - // type on target machine, but for pointers, the size of data would be the - // next integer argument - if(inputVal->getType()->isPointerTy()) { - // CheckAttribute - Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False; - Value* isInput = ((hasAttribute(CF, i, Attribute::Out)) - && !(hasAttribute(CF, i, Attribute::In)))? False : True; - - Argument* A = getArgumentAt(CF, i); - if(isOutput == True) { - errs() << *A << " is an OUTPUT argument\n"; - } - if(isInput == True) { - errs() << *A << " is an INPUT argument\n"; - } + //Add generated function info to DFNode + N->setGenFunc(F_nvptx, DFNode::PTX); + } else { + errs() << "WARNING: Visiting a node for which code already generated!\n"; + } - Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, - Type::getInt8PtrTy(M.getContext()), - inputVal->getName()+".i8ptr", - RI); - // Pointer Input - Value* inputSize = getInValueAt(C, i+1, F_X86, RI); - assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) - && "Pointer type input must always be followed by size (integer type)"); - Value* setInputArgs[] = {GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - inputSize, - isInput, - isOutput - }; - Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr, - ArrayRef<Value*>(setInputArgs, 6), "", RI); - // If this has out attribute, store the returned device pointer in - // memory to read device memory later - if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + transformFunctionToVoid(F_nvptx); + + // Go through all the instructions + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + Instruction *I = &(*i); + // Leaf nodes should not contain VISC graph intrinsics or launch + 
assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + + if (BuildDFG::isViscQueryIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; + + /************************ Handle VISC Query intrinsics ************************/ + + switch (II->getIntrinsicID()) { + /**************************** llvm.visc.getNode() *****************************/ + case Intrinsic::visc_getNode: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); + // add mapping <intrinsic, this node> to the node-specific map + Leaf_HandleToDFNodeMap[II] = N; + IItoRemove.push_back(II); } - else { // Scalar Input - // Store the scalar value on stack and then pass the pointer to its - // location - AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI); - StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); - - Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, - Type::getInt8PtrTy(M.getContext()), - inputVal->getName()+".i8ptr", - RI); - - Value* setInputArgs[] = {GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - ConstantExpr::getSizeOf(inputVal->getType()) - }; - CallInst::Create(llvm_visc_ptx_argument_scalar, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + break; + /************************* llvm.visc.getParentNode() **************************/ + case Intrinsic::visc_getParentNode: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); + // get the parent node of the arg node + // get argument node + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // get the parent node of the arg node + // Add mapping <intrinsic, parent node> to the node-specific map + // the argument node must have been added to the map, orelse the 
+ // code could not refer to it + Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); + + IItoRemove.push_back(II); } + break; + /*************************** llvm.visc.getNumDims() ***************************/ + case Intrinsic::visc_getNumDims: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); + // get node from map + // get the appropriate field + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + int numOfDim = ArgDFNode->getNumOfDim(); + DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); +// IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext()); + IntegerType* IntTy = Type::getInt32Ty(getGlobalContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); - } - DEBUG(errs() << "Setup output edges of node and insert visc api\n"); - // Setup output - // FIXME: Note - There is a tricky question. In X86 we do not need to care - // about pointer inputs which modify data in memory implicitly (without - // showing it as output). There is no extra cost needed to handle such inputs - // For PTX, we need to read back such data from device memory to host memory. - // The cost is huge and hence we need to differentiate between readonly - // pointer inputs vs read/write pointer inputs. Currently supporting only a - // simple model in which all input edges are readonly and output is - // writeonly. 
- - // Set output - StructType* OutputTy = C->getOutputType(); - unsigned outputIndex = CF->getFunctionType()->getNumParams(); - Value* outputSize = ConstantExpr::getSizeOf(OutputTy); - Value* setOutputArgs[] = {GraphID, - Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), - ConstantExpr::getSizeOf(OutputTy), - False, - True}; - - CallInst* d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr, - ArrayRef<Value*>(setOutputArgs, 6), - "d_output."+CF->getName(), - RI); + // Replace the result of the intrinsic with the computed value + II->replaceAllUsesWith(numOfDimConstant); - // Enqueue kernel - // Need work dim, localworksize, globalworksize - // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work - // size and global work size - // Allocate size_t[numDims] space on stack. Store the work group sizes and - // pass it as an argument to ExecNode - Type* Int64Ty = Type::getInt64Ty(M.getContext()); - Type* GlobalWGTy = ArrayType::get(Int64Ty, kernel->gridDim); - AllocaInst* GlobalWG = new AllocaInst(GlobalWGTy, "GlobalWGSize", RI); - Value* GlobalWGPtr = BitCastInst::CreatePointerCast(GlobalWG, Int64Ty->getPointerTo(), GlobalWG->getName()+".0", RI); - Value* nextDim = GlobalWGPtr; - errs() << *GlobalWGPtr << "\n"; - Constant* IntOne = ConstantInt::get(Int64Ty, 1); - errs() << *IntOne << "\n"; - for(unsigned i=0; i < kernel->gridDim; i++) { - errs() << *kernel->globalWGSize[i]->getType() << "\n"; - errs() << *nextDim->getType() << "\n"; - assert(kernel->globalWGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); - if(kernel->globalWGSize[i]->getType() != Int64Ty) { - kernel->globalWGSize[i] = BitCastInst::CreateIntegerCast(VMap[kernel->globalWGSize[i]], Int64Ty, true, "", RI); - StoreInst* SI = new StoreInst(kernel->globalWGSize[i], nextDim, RI); - errs() << *SI << "\n"; - } else { - StoreInst* SI = new StoreInst(VMap[kernel->globalWGSize[i]], nextDim, 
RI); - errs() << *SI << "\n"; + IItoRemove.push_back(II); } - if(i+1 < kernel->gridDim) { - GetElementPtrInst* GEP = GetElementPtrInst::Create(nextDim, ArrayRef<Value*>(IntOne), GlobalWG->getName()+"."+Twine(i+1), RI); - errs() << *GEP << "\n"; - nextDim = GEP; - } - } - errs() << *llvm_visc_ptx_executeNode << "\n"; - errs() << *GlobalWGPtr << "\n"; - Value* ExecNodeArgs[] = {GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()), C->getNumOfDim()), - Constant::getNullValue(Type::getInt64PtrTy(M.getContext())), - GlobalWGPtr - }; - CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode, - ArrayRef<Value*>(ExecNodeArgs, 4), - "event."+CF->getName(), - RI); - errs() << *Event << "\n"; - // Wait for Kernel to Finish - CallInst::Create(llvm_visc_ptx_wait, - ArrayRef<Value*>(GraphID), - "", - RI); - // Read Output Struct - Value* GetOutputArgs[] = {GraphID, - Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - d_Output, - outputSize}; - CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput, - ArrayRef<Value*>(GetOutputArgs, 4), - "h_output."+CF->getName()+".addr", - RI); - // Read each device pointer listed in output struct - // Load the output struct - CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI); - Value* KernelOutput = new LoadInst(BI, "", RI); - - // Read all the pointer arguments which had side effects i.e., had out - // attribute - for(auto output: OutputPointers) { - errs() << "Read: " << *output.d_ptr << "\n"; - errs() << "\t To: " << *output.h_ptr << "\n"; - errs() << "\t #bytes: " << *output.bytes << "\n"; - Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; - CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput, - ArrayRef<Value*>(GetOutputArgs, 4), - "", RI); - } - /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) { - Type* elemTy = OutputTy->getElementType(i); - if(elemTy->isPointerTy()) { - // Pointer type - 
assert(OutputTy->getElementType(i+1) == Type::getInt64Ty(M.getContext()) - && "Every Pointer type must be followed by an integer"); - ExtractValueInst* d_ptr = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i), "", RI); - // Change d_ptr to i8* - CastInst* d_ptr_i8 = BitCastInst::CreatePointerCast(d_ptr, Type::getInt8PtrTy(M.getContext()), "", RI); - ExtractValueInst* len = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i+1), "", RI); - // GetOutputPtr call - Value* GetOutputArgs[] = {GraphID, - d_ptr_i8, - len}; - CallInst* h_ptr_i8 = CallInst::Create(llvm_visc_ptx_getOutput, - ArrayRef<Value*>(GetOutputArgs, 3), - "", - RI); - // Change h_ptr to correct type - CastInst* h_ptr = CastInst::CreatePointerCast(h_ptr_i8, - cast<StructType>(KernelOutput->getType())->getElementType(i), - "", - RI); - KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI); + break; + /*********************** llvm.visc.getNodeInstanceID() ************************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + case Intrinsic::visc_getNodeInstanceID_z: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n"); + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + assert(ArgDFNode && "Arg node is NULL"); + // A leaf node always has a parent + DFNode* ParentDFNode = ArgDFNode->getParent(); + assert(ParentDFNode && "Parent node of a leaf is NULL"); + + // Get the number associated with the required dimension + // FIXME: The order is important! 
+ // These three intrinsics need to be consecutive x,y,z + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x; + assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); + DEBUG(errs() << "\t dimension = " << dim << "\n"); + + // Argument of the function to be called + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(getGlobalContext()) /*KernelM.getContext()*/ , dim); + ArrayRef<Value *> Args(DimConstant); + + // The following is to find which function to call + Function * OpenCLFunction; + int parentLevel = ParentDFNode->getLevel(); + int parentReplFactor = ParentDFNode->getNumOfDim(); + + if (!parentLevel || !parentReplFactor) { + // We only have one level in the hierarchy or the parent node is not + // replicated. This indicates that the parent node is the kernel + // launch, so we need to specify a global id + + FunctionType* FT = + FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), + std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), + false); + OpenCLFunction = cast<Function> + (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) { + // We are asking for this node's id with respect to its parent + // this is a local id call + FunctionType* FT = + FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), + std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), + false); + OpenCLFunction = cast<Function> + (KernelM.getOrInsertFunction(StringRef("get_local_id"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { + // We are asking for this node's parent's id with respect to its + // parent: this is a group id call + FunctionType* FT = + FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), + std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() 
/*KernelM.getContext()*/)), + false); + OpenCLFunction = cast<Function> + (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT)); + } else { + assert(false && "Unable to translate this intrinsic"); + } - } - }*/ + // Create call instruction, insert it before the intrinsic and + // replace the uses of the previous instruction with the new one + CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II); + II->replaceAllUsesWith(CI); - // Prepare output - KernelOutput->setName("output."+CF->getName()); - OutputMap[C] = KernelOutput; - - DEBUG(errs() << "*** Generating epilogue code for the function****\n"); - // Generate code for output bindings - // Get Exit node - C = N->getChildGraph()->getExit(); - // Get OutputType of this node - StructType* OutTy = N->getOutputType(); - Value *retVal = UndefValue::get(F_X86->getReturnType()); - // Find all the input edges to exit node - for (unsigned i=0; i < OutTy->getNumElements(); i++) { - DEBUG(errs() << "Output Edge " << i << "\n"); - // Find the incoming edge at the requested input port - DFEdge* E = C->getInDFEdgeAt(i); - - assert(E && "No Binding for output element!"); - // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); - - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); - - // If Source DFNode is a dummyNode, edge is from parent. Get the - // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { - inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { - // edge is from a internal node - // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) - && "Source node call not found. 
Dependency violation!"); - - // Find Output Value associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; - - // Extract element at source position from this call instruction - std::vector<unsigned> IndexList; - IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "",RI); - inputVal = EI; + IItoRemove.push_back(II); } - std::vector<unsigned> IdxList; - IdxList.push_back(i); - retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); - } - DEBUG(errs() << "Extracted all\n"); - retVal->setName("output"); - ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); - ReplaceInstWithInst(RI, newRI); - } - - -// Right now, only targeting the one level case. In general, device functions -// can return values so we don't need to change them - void CodeGenTraversal::codeGen(DFInternalNode* N) { - - if (!KernelLaunchNode) { - DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); - return; - } - - if (N == KernelLaunchNode) { - DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); - //TODO - - // Now the remaining nodes to be visited should be ignored - KernelLaunchNode = NULL; - errs() << "Insert Runtime calls\n"; - insertRuntimeCalls(N, getPTXFilename(M)); - writeKernelsModule(); - - } else { - DEBUG(errs() << "Found intermediate node. Getting size parameters.\n"); - //TODO : Check that the arguments order of root to intermediate matches - // the intermediate to leaf. 
- - } - - } - - void CodeGenTraversal::codeGen(DFLeafNode* N) { - - // Skip code generation if it is a dummy node - if(N->isDummyNode()) { - DEBUG(errs() << "Skipping dummy node\n"); - return; - } - - // Checking which node is the kernel launch - DFNode* PNode = N->getParent(); - int pLevel = PNode->getLevel(); - int pReplFactor = PNode->getNumOfDim(); - - if (!pLevel || !pReplFactor) { - KernelLaunchNode = PNode; - kernel = new Kernel(NULL, N->getNumOfDim(), N->getDimLimits()); - // TODO: Find a better way of choosing parameters - //kernel->gridDim = N->getNumOfDim(); - //kernel->blockDim = N->getNumOfDim(); - //kernel->globalWGSize = N->getDimLimits(); - //kernel->localWGSize = N->getDimLimits(); - //FIXME: Comment this out as we can provide localWGSize as null - //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext()); - // TODO: How to choose the div factor; - //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16); - //std::vector<Value*> tmp(kernel->gridDim, divFactor); - //for (unsigned i = 0; i < kernel->gridDim; i++) { - // BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]); - // kernel->localWGSize.push_back(SDivInst); - //} - } - else { - errs() << "*************** Entering else part **************\n"; - /* - KernelLaunchNode = PNode->getParent(); - kernel->gridDim = PNode->getNumOfDim(); - kernel->blockDim = N->getNumOfDim(); - // TODO: Handle different number of dimensions - assert((kernel->gridDim == kernel->blockDim) && "Dimension number must match"); - std::vector<Value*> numOfBlocks = PNode->getDimLimits(); - kernel->localWGSize = N->getDimLimits(); - for (unsigned i = 0; i < kernel->gridDim; i++) { - //BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel->localWGSize[i],numOfBlocks[i]); - //kernel->globalWGSize.push_back(MulInst); - }*/ - } - - std::vector<IntrinsicInst *> IItoRemove; - BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; - - // Get the function associated with the dataflow 
node - Function *F = N->getFuncPointer(); - - // Look up if we have visited this function before. If we have, then just - // get the cloned function pointer from DFNode. Otherwise, create the cloned - // function and add it to the DFNode GenFunc. - Function *F_nvptx = N->getGenFunc(); - if(F_nvptx == NULL) { - // Clone the function - ValueToValueMapTy VMap; - F_nvptx = CloneFunction(F, VMap, true); - - // Insert the cloned function into the kernels module - KernelM.getFunctionList().push_back(F_nvptx); - - DEBUG(errs() << *F_nvptx->getType()); - DEBUG(errs() << *F_nvptx); - - //Add generated function info to DFNode - N->setGenFunc(F_nvptx, DFNode::PTX); - } else { - errs() << "WARNING: Visiting a node for which code already generated!\n"; - } - - transformFunctionToVoid(F_nvptx); - - // Go through all the instructions - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { - Instruction *I = &(*i); - // Leaf nodes should not contain VISC graph intrinsics or launch - assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); - - if (BuildDFG::isViscQueryIntrinsic(I)) { - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - IntrinsicInst* ArgII; - DFNode* ArgDFNode; - -/************************ Handle VISC Query intrinsics ************************/ - - switch (II->getIntrinsicID()) { -/**************************** llvm.visc.getNode() *****************************/ - case Intrinsic::visc_getNode: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); - // add mapping <intrinsic, this node> to the node-specific map - Leaf_HandleToDFNodeMap[II] = N; - IItoRemove.push_back(II); - } - break; -/************************* llvm.visc.getParentNode() **************************/ - case Intrinsic::visc_getParentNode: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); - // get the parent 
node of the arg node - // get argument node - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; - // get the parent node of the arg node - // Add mapping <intrinsic, parent node> to the node-specific map - // the argument node must have been added to the map, orelse the - // code could not refer to it - Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); - - IItoRemove.push_back(II); - } - break; -/*************************** llvm.visc.getNumDims() ***************************/ - case Intrinsic::visc_getNumDims: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); - // get node from map - // get the appropriate field - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; - int numOfDim = ArgDFNode->getNumOfDim(); - DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); -// IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext()); - IntegerType* IntTy = Type::getInt32Ty(getGlobalContext()); - ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); - - // Replace the result of the intrinsic with the computed value - II->replaceAllUsesWith(numOfDimConstant); - - IItoRemove.push_back(II); - } - break; -/*********************** llvm.visc.getNodeInstanceID() ************************/ - case Intrinsic::visc_getNodeInstanceID_x: - case Intrinsic::visc_getNodeInstanceID_y: - case Intrinsic::visc_getNodeInstanceID_z: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n"); - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; - assert(ArgDFNode && "Arg node is NULL"); - // A leaf node always has a parent - DFNode* ParentDFNode = ArgDFNode->getParent(); - assert(ParentDFNode && "Parent node of a leaf is NULL"); - - // Get the number associated with the required dimension - // FIXME: The order is important! 
- // These three intrinsics need to be consecutive x,y,z - uint64_t dim = II->getIntrinsicID() - - Intrinsic::visc_getNodeInstanceID_x; - assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); - DEBUG(errs() << "\t dimension = " << dim << "\n"); - - // Argument of the function to be called - ConstantInt * DimConstant = - ConstantInt::get(Type::getInt32Ty(getGlobalContext()) /*KernelM.getContext()*/ , dim); - ArrayRef<Value *> Args(DimConstant); - - // The following is to find which function to call - Function * OpenCLFunction; - int parentLevel = ParentDFNode->getLevel(); - int parentReplFactor = ParentDFNode->getNumOfDim(); - - if (!parentLevel || !parentReplFactor) { - // We only have one level in the hierarchy or the parent node is not - // replicated. This indicates that the parent node is the kernel - // launch, so we need to specify a global id - - FunctionType* FT = - FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), - std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), - false); - OpenCLFunction = cast<Function> - (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT)); - } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) { - // We are asking for this node's id with respect to its parent - // this is a local id call - FunctionType* FT = - FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), - std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), - false); - OpenCLFunction = cast<Function> - (KernelM.getOrInsertFunction(StringRef("get_local_id"), FT)); - } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { - // We are asking for this node's parent's id with respect to its - // parent: this is a group id call - FunctionType* FT = - FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), - std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() 
/*KernelM.getContext()*/)), - false); - OpenCLFunction = cast<Function> - (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT)); - } else { - assert(false && "Unable to translate this intrinsic"); - } - - // Create call instruction, insert it before the intrinsic and - // replace the uses of the previous instruction with the new one - CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II); - II->replaceAllUsesWith(CI); - - IItoRemove.push_back(II); - } - break; -/********************** llvm.visc.getNumNodeInstances() ***********************/ - case Intrinsic::visc_getNumNodeInstances_x: - case Intrinsic::visc_getNumNodeInstances_y: - case Intrinsic::visc_getNumNodeInstances_z: { + break; + /********************** llvm.visc.getNumNodeInstances() ***********************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { //TODO: think about whether this is the best way to go // there are hw specific registers. therefore it is good to have the intrinsic // but then, why do we need to keep that info in the graph? // (only for the kernel configuration during the call) - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; - // A leaf node always has a parent - DFNode* ParentDFNode = ArgDFNode->getParent(); - assert(ParentDFNode && "Parent node of a leaf is NULL"); - - // Get the number associated with the required dimension - // FIXME: The order is important! 
- // These three intrinsics need to be consecutive x,y,z - uint64_t dim = II->getIntrinsicID() - - Intrinsic::visc_getNumNodeInstances_x; - assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); - DEBUG(errs() << "\t dimension = " << dim << "\n"); - - // Argument of the function to be called - ConstantInt * DimConstant = - ConstantInt::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), dim); - ArrayRef<Value *> Args(DimConstant); - - // The following is to find which function to call - Function * OpenCLFunction; - int parentLevel = ParentDFNode->getLevel(); - int parentReplFactor = ParentDFNode->getNumOfDim(); - - if (!parentLevel || !parentReplFactor) { - // We only have one level in the hierarchy or the parent node is not - // replicated. This indicates that the parent node is the kernel - // launch, so the instances are global_size (gridDim x blockDim) - FunctionType* FT = - FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), - std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), - false); - OpenCLFunction = cast<Function> - (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT)); - } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) { - // We are asking for this node's instances - // this is a local size (block dim) call - FunctionType* FT = - FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), - std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), - false); - OpenCLFunction = cast<Function> - (KernelM.getOrInsertFunction(StringRef("get_local_size"), FT)); - } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { - // We are asking for this node's parent's instances - // this is a (global_size/local_size) (grid dim) call - FunctionType* FT = - FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), - std::vector<Type*>(1, 
Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), - false); - OpenCLFunction = cast<Function> - (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT)); - } else { - assert(false && "Unable to translate this intrinsic"); - } - - // Create call instruction, insert it before the intrinsic and - // replace the uses of the previous instruction with the new one - CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II); - II->replaceAllUsesWith(CI); - - IItoRemove.push_back(II); - } - break; - default: - assert(false && "Unknown VISC Intrinsic!"); - break; + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // A leaf node always has a parent + DFNode* ParentDFNode = ArgDFNode->getParent(); + assert(ParentDFNode && "Parent node of a leaf is NULL"); + + // Get the number associated with the required dimension + // FIXME: The order is important! + // These three intrinsics need to be consecutive x,y,z + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x; + assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); + DEBUG(errs() << "\t dimension = " << dim << "\n"); + + // Argument of the function to be called + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), dim); + ArrayRef<Value *> Args(DimConstant); + + // The following is to find which function to call + Function * OpenCLFunction; + int parentLevel = ParentDFNode->getLevel(); + int parentReplFactor = ParentDFNode->getNumOfDim(); + + if (!parentLevel || !parentReplFactor) { + // We only have one level in the hierarchy or the parent node is not + // replicated. 
This indicates that the parent node is the kernel + // launch, so the instances are global_size (gridDim x blockDim) + FunctionType* FT = + FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), + std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), + false); + OpenCLFunction = cast<Function> + (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) { + // We are asking for this node's instances + // this is a local size (block dim) call + FunctionType* FT = + FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), + std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), + false); + OpenCLFunction = cast<Function> + (KernelM.getOrInsertFunction(StringRef("get_local_size"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { + // We are asking for this node's parent's instances + // this is a (global_size/local_size) (grid dim) call + FunctionType* FT = + FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), + std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), + false); + OpenCLFunction = cast<Function> + (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT)); + } else { + assert(false && "Unable to translate this intrinsic"); } - } else { - //TODO: how to handle address space qualifiers in load/store + // Create call instruction, insert it before the intrinsic and + // replace the uses of the previous instruction with the new one + CallInst* CI = CallInst::Create(OpenCLFunction, Args, "", II); + II->replaceAllUsesWith(CI); + + IItoRemove.push_back(II); + } + break; + default: + assert(false && "Unknown VISC Intrinsic!"); + break; } + } else { + //TODO: how to handle address space qualifiers in load/store } - // We need to do this explicitly: DCE pass will not remove them because we - 
// have assumed theworst memory behaviour for these function calls - // Traverse the vector backwards, otherwise definitions are deleted while - // their subsequent uses are still around - for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), - re = IItoRemove.rend(); ri != re; ++ri) - (*ri)->eraseFromParent(); - - addCLMetadata(F_nvptx); - kernel->KernelFunction = F_nvptx; - errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; - DEBUG(errs() << KernelM); - - return; } - bool DFG2LLVM_NVPTX::runOnModule(Module &M) { + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) + (*ri)->eraseFromParent(); - // Get the BuildDFG Analysis Results: - // - Dataflow graph - // - Maps from i8* hansles to DFNode and DFEdge - BuildDFG &DFG = getAnalysis<BuildDFG>(); + addCLMetadata(F_nvptx); + kernel->KernelFunction = F_nvptx; + errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; + DEBUG(errs() << KernelM); - DFInternalNode *Root = DFG.getRoot(); -// BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); -// BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + return; +} - // Visitor for Code Generation Graph Traversal - CodeGenTraversal *CGTVisitor = new CodeGenTraversal(M, DFG); +bool DFG2LLVM_NVPTX::runOnModule(Module &M) { - // Initiate code generation for root DFNode - CGTVisitor->visit(Root); - //TODO: Edit module epilogue to remove the VISC intrinsic declarations - delete CGTVisitor; + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = 
getAnalysis<BuildDFG>(); - return true; - } - - std::string CodeGenTraversal::getKernelsModuleName(Module &M) { - /*SmallString<128> currentDir; - llvm::sys::fs::current_path(currentDir); - std::string fileName = getFilenameFromModule(M); - Twine output = Twine(currentDir) + "/Output/" + fileName + ""; - return output.str().append(".kernels.ll");*/ - std::string mid = M.getModuleIdentifier(); - return mid.append(".kernels.ll"); - } + DFInternalNode *Root = DFG.getRoot(); +// BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); +// BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - void CodeGenTraversal::fixValueAddrspace(Value* V, unsigned addrspace) { - assert(isa<PointerType>(V->getType()) - && "Value should be of Pointer Type!"); - PointerType* OldTy = cast<PointerType>(V->getType()); - PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); - V->mutateType(NewTy); - for(Value::use_iterator ui = V->use_begin(), ue = V->use_end(); ui != ue; ui++) { - // Change all uses producing pointer type in same address space to new - // addressspace. 
- if(PointerType* PTy = dyn_cast<PointerType>(ui->getType())) { - if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { - fixValueAddrspace(*ui, addrspace); - } + // Visitor for Code Generation Graph Traversal + CodeGenTraversal *CGTVisitor = new CodeGenTraversal(M, DFG); + + // Initiate code generation for root DFNode + CGTVisitor->visit(Root); + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; + + return true; +} + +std::string CodeGenTraversal::getKernelsModuleName(Module &M) { + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); +} + +void CodeGenTraversal::fixValueAddrspace(Value* V, unsigned addrspace) { + assert(isa<PointerType>(V->getType()) + && "Value should be of Pointer Type!"); + PointerType* OldTy = cast<PointerType>(V->getType()); + PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for(Value::use_iterator ui = V->use_begin(), ue = V->use_end(); ui != ue; ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. 
+ if(PointerType* PTy = dyn_cast<PointerType>(ui->getType())) { + if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); } } } - - void CodeGenTraversal::changeArgAddrspace(Function* F, unsigned addrspace) { - std::vector<Type*> ArgTypes; - for(auto& arg: F->getArgumentList()) { - DEBUG(errs() << arg << "\n"); - if(PointerType* argTy = dyn_cast<PointerType>(arg.getType())) { - if(argTy->getAddressSpace() == 0) { - fixValueAddrspace(&arg, addrspace); - } +} + +void CodeGenTraversal::changeArgAddrspace(Function* F, unsigned addrspace) { + std::vector<Type*> ArgTypes; + for(auto& arg: F->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + if(PointerType* argTy = dyn_cast<PointerType>(arg.getType())) { + if(argTy->getAddressSpace() == 0) { + fixValueAddrspace(&arg, addrspace); } - ArgTypes.push_back(arg.getType()); } - FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, false); - PointerType* PTy = FTy->getPointerTo(cast<PointerType>(F->getType())->getAddressSpace()); - - F->mutateType(PTy); - DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n"); + ArgTypes.push_back(arg.getType()); } + FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, false); + PointerType* PTy = FTy->getPointerTo(cast<PointerType>(F->getType())->getAddressSpace()); - /* Add metadata to module KernelM, for OpenCL kernels */ - void CodeGenTraversal::addCLMetadata(Function *F) { + F->mutateType(PTy); + DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n"); +} - IRBuilder<true> Builder(F->begin()); +/* Add metadata to module KernelM, for OpenCL kernels */ +void CodeGenTraversal::addCLMetadata(Function *F) { - SmallVector<Value*,8> KernelMD; - KernelMD.push_back(F); + IRBuilder<true> Builder(F->begin()); + + SmallVector<Value*,8> KernelMD; + KernelMD.push_back(F); //TODO: For now, we don not add any additional metadata -/* - // MDNode for the kernel argument address space qualifiers. 
- SmallVector<llvm::Value*, 8> addressQuals; - addressQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_addr_space")); - - // We don't support images - // MDNode for the kernel argument access qualifiers (images only). -// SmallVector<llvm::Value*, 8> accessQuals; -// accessQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_access_qual")); - - // MDNode for the kernel argument type names. - SmallVector<llvm::Value*, 8> argTypeNames; - argTypeNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type")); - - //TODO: MDNode for the kernel argument type qualifiers. -// SmallVector<llvm::Value*, 8> argTypeQuals; -// argTypeQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type_qual")); - - // MDNode for the kernel argument names. - SmallVector<llvm::Value*, 8> argNames; - argNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_name")); - - for (Function::arg_iterator ai = F->arg_begin(), - ae = F->arg_end(); ai != ae; ++ai) { - Argument *arg = &*ai; - Type *argTy = arg->getType(); - - if (argTy->isPointerTy()) { - Type *pointeeTy = argTy->getPointerElementType(); - std::string typeName = printType(pointeeTy) + "*"; - // Get argument type name. - argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName)); - - // Get address qualifier. - addressQuals.push_back(Builder.getInt32(argTy->getPointerAddressSpace())); - } else { - std::string typeName = printType(argTy); - // Get argument type name. - argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName)); - - // Get address qualifier. - addressQuals.push_back(Builder.getInt32(GENERIC_ADDRSPACE)); + /* + // MDNode for the kernel argument address space qualifiers. + SmallVector<llvm::Value*, 8> addressQuals; + addressQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_addr_space")); + + // We don't support images + // MDNode for the kernel argument access qualifiers (images only). 
+ // SmallVector<llvm::Value*, 8> accessQuals; + // accessQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_access_qual")); + + // MDNode for the kernel argument type names. + SmallVector<llvm::Value*, 8> argTypeNames; + argTypeNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type")); + + //TODO: MDNode for the kernel argument type qualifiers. + // SmallVector<llvm::Value*, 8> argTypeQuals; + // argTypeQuals.push_back(MDString::get(KernelM.getContext(), "kernel_arg_type_qual")); + + // MDNode for the kernel argument names. + SmallVector<llvm::Value*, 8> argNames; + argNames.push_back(MDString::get(KernelM.getContext(), "kernel_arg_name")); + + for (Function::arg_iterator ai = F->arg_begin(), + ae = F->arg_end(); ai != ae; ++ai) { + Argument *arg = &*ai; + Type *argTy = arg->getType(); + + if (argTy->isPointerTy()) { + Type *pointeeTy = argTy->getPointerElementType(); + std::string typeName = printType(pointeeTy) + "*"; + // Get argument type name. + argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName)); + + // Get address qualifier. + addressQuals.push_back(Builder.getInt32(argTy->getPointerAddressSpace())); + } else { + std::string typeName = printType(argTy); + // Get argument type name. + argTypeNames.push_back(MDString::get(KernelM.getContext(), typeName)); + + // Get address qualifier. + addressQuals.push_back(Builder.getInt32(GENERIC_ADDRSPACE)); - } + } - // Get argument name. - argNames.push_back(MDString::get(KernelM.getContext(), arg->getName())); - } + // Get argument name. 
+ argNames.push_back(MDString::get(KernelM.getContext(), arg->getName())); + } - KernelMD.push_back(MDNode::get(KernelM.getContext(), addressQuals)); -// KernelMD.push_back(MDNode::get(KernelM.getContext(), accessQuals)); - KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeNames)); -// KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeQuals)); - KernelMD.push_back(MDNode::get(KernelM.getContext(), argNames)); -*/ - MDNode *MDKernelNode = MDNode::get(KernelM.getContext(), KernelMD); - NamedMDNode *MDN_kernels = KernelM.getOrInsertNamedMetadata("opencl.kernels"); - MDN_kernels->addOperand(MDKernelNode); - - KernelMD.push_back(MDNode::get(KernelM.getContext(), - MDString::get(KernelM.getContext(), "kernel"))); - // TODO: Replace 1 with the number of the kernel. - // Add when support for multiple launces is added - KernelMD.push_back(MDNode::get(KernelM.getContext(), - ConstantInt::get(Type::getInt32Ty(KernelM.getContext()),1))); - MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM.getContext(), KernelMD); - NamedMDNode *MDN_annotations = KernelM.getOrInsertNamedMetadata("nvvm.annotations"); - MDN_annotations->addOperand(MDNvvmAnnotationsNode); + KernelMD.push_back(MDNode::get(KernelM.getContext(), addressQuals)); + // KernelMD.push_back(MDNode::get(KernelM.getContext(), accessQuals)); + KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeNames)); + // KernelMD.push_back(MDNode::get(KernelM.getContext(), argTypeQuals)); + KernelMD.push_back(MDNode::get(KernelM.getContext(), argNames)); + */ + MDNode *MDKernelNode = MDNode::get(KernelM.getContext(), KernelMD); + NamedMDNode *MDN_kernels = KernelM.getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); + + KernelMD.push_back(MDNode::get(KernelM.getContext(), + MDString::get(KernelM.getContext(), "kernel"))); + // TODO: Replace 1 with the number of the kernel. 
+ // Add when support for multiple launces is added + KernelMD.push_back(MDNode::get(KernelM.getContext(), + ConstantInt::get(Type::getInt32Ty(KernelM.getContext()),1))); + MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM.getContext(), KernelMD); + NamedMDNode *MDN_annotations = KernelM.getOrInsertNamedMetadata("nvvm.annotations"); + MDN_annotations->addOperand(MDNvvmAnnotationsNode); //!1 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32, i32)* @matrixMul, metadata !"kernel", i32 1} +} + +void CodeGenTraversal::writeKernelsModule() { + + char* ErrorMessage = NULL; + LLVMModuleRef KernelMRef = wrap(&KernelM); + errs() << "Writing to File --- "; + errs() << getKernelsModuleName(M).c_str() << "\n"; + LLVMPrintModuleToFile(KernelMRef, + getKernelsModuleName(M).c_str(), + &ErrorMessage); + if (ErrorMessage) { + LLVMDisposeMessage(ErrorMessage); } + LLVMDisposeModule(KernelMRef); +} - void CodeGenTraversal::writeKernelsModule() { - - char* ErrorMessage = NULL; - LLVMModuleRef KernelMRef = wrap(&KernelM); - errs() << "Writing to File --- "; - errs() << getKernelsModuleName(M).c_str() << "\n"; - LLVMPrintModuleToFile(KernelMRef, - getKernelsModuleName(M).c_str(), - &ErrorMessage); - if (ErrorMessage) { - LLVMDisposeMessage(ErrorMessage); - } - LLVMDisposeModule(KernelMRef); - } - - void CodeGenTraversal::transformFunctionToVoid(Function* F) { +void CodeGenTraversal::transformFunctionToVoid(Function* F) { - // FIXME: Maybe do that using the Node? - StructType* FRetTy = cast<StructType>(F->getReturnType()); - assert(FRetTy && "Return Type must always be a struct"); + // FIXME: Maybe do that using the Node? 
+ StructType* FRetTy = cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); - // Keeps return statements, because we will need to replace them - std::vector<ReturnInst *> RItoRemove; - findReturnInst(F, RItoRemove); + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); - // Check for { } return struct, which means that the function returns void - if (FRetTy->getNumElements() == 0) { + // Check for { } return struct, which means that the function returns void + if (FRetTy->getNumElements() == 0) { - DEBUG(errs() << "\tFunction output struct is void\n"); - DEBUG(errs() << "\tNo parameters added\n"); + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); - // Replacing return statements with others returning void - for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(), - e = RItoRemove.end(); i != e; ++i) { - ReturnInst::Create((F->getContext()), 0, (*i)); - (*i)->eraseFromParent(); - } - DEBUG(errs() << "\tChanged return statements to return void\n"); - - return; + // Replacing return statements with others returning void + for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(), + e = RItoRemove.end(); i != e; ++i) { + ReturnInst::Create((F->getContext()), 0, (*i)); + (*i)->eraseFromParent(); } + DEBUG(errs() << "\tChanged return statements to return void\n"); - // The struct has return values, thus needs to be converted to parameter + return; + } - int initialNumParams = F->arg_size(); + // The struct has return values, thus needs to be converted to parameter - Type* ArgType = FRetTy->getPointerTo(GENERIC_ADDRSPACE); - new Argument(ArgType, "ret_struct_ptr", F); - DEBUG(errs() << "\tCreated parameter\n"); + int initialNumParams = F->arg_size(); - // Create the argument type list with the added argument's type - std::vector<Type*> ArgTypes; - 
for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } + Type* ArgType = FRetTy->getPointerTo(GENERIC_ADDRSPACE); + new Argument(ArgType, "ret_struct_ptr", F); + DEBUG(errs() << "\tCreated parameter\n"); - // Find where the new parameter is in the header - Function::arg_iterator ai, ae; - int check = 0; - for (ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - if (ai->getName().equals("ret_struct_ptr")) break; - check++; - } + // Create the argument type list with the added argument's type + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + + // Find where the new parameter is in the header + Function::arg_iterator ai, ae; + int check = 0; + for (ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + if (ai->getName().equals("ret_struct_ptr")) break; + check++; + } // DEBUG(errs() << "\tcheck = " << check << "\tinitialNumParams = " << initialNumParams << "\n"); - assert(check == initialNumParams); - - DEBUG(errs() << "\tReplacing Return statements\n"); - // Replace return statements with extractValue and store instructions - for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(), - rie = RItoRemove.end(); rii != rie; ++rii) { - ReturnInst* RI = (*rii); - Value* RetVal = RI->getReturnValue(); - // assert(RetVal && "Return value should not be null at this point"); - // StructType* RetType = cast<StructType>(RetVal->getType()); - // assert(RetType && "Return type is not a struct"); - - new StoreInst(RetVal, &(*ai), RI); - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); + assert(check == initialNumParams); + + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(), + rie = RItoRemove.end(); 
rii != rie; ++rii) { + ReturnInst* RI = (*rii); + Value* RetVal = RI->getReturnValue(); + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + new StoreInst(RetVal, &(*ai), RI); + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); - } + } - DEBUG(errs() << "\tReplaced return statements\n"); + DEBUG(errs() << "\tReplaced return statements\n"); - // Adding new arguments to the function argument list, would not change the - // function type. We need to change the type of this function to reflect the - // added arguments - Type* VoidRetType = Type::getVoidTy(F->getContext()); - FunctionType* FTy = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); - PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace()); + // Adding new arguments to the function argument list, would not change the + // function type. 
We need to change the type of this function to reflect the + // added arguments + Type* VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType* FTy = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace()); - // Change the function type - F->mutateType(PTy); + // Change the function type + F->mutateType(PTy); - } +} /****************************************************************************** * Helper functions * ******************************************************************************/ - // Find if argument has the given attribute - static bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) { - return F->getAttributes().hasAttribute(arg_index+1, AK); - } - // Get generated PTX binary name - static std::string getPTXFilename(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - moduleID.append(".nvptx.s"); - return moduleID; - } - - // Get the name of the input file from module ID - static std::string getFilenameFromModule(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - return moduleID.substr(moduleID.find_last_of("/")+1); - } - - // Changes the data layout of the Module to be compiled with NVPTX backend - // TODO: Figure out when to call it, probably after duplicating the modules - static void changeDataLayout(Module &M) { - std::string nvptx32_layoutStr = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; - std::string nvptx64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; - - if (TARGET_PTX == 32) - M.setDataLayout(StringRef(nvptx32_layoutStr)); - else if (TARGET_PTX == 64) - M.setDataLayout(StringRef(nvptx64_layoutStr)); - else assert(false && "Invalid PTX target"); - - return; +// Calculate execute node 
parameters, which include the number of dimensions for +// dynamic instances of the kernel, local and global work group sizes. +static void getExecuteNodeParams(Value* &workDim, Value* &LocalWGPtr, Value* + &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { + + // Get the 64-bit integer type for ease of use + Type* Int64Ty = Type::getInt64Ty(getGlobalContext()); + + // Assign the number of dimensions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(getGlobalContext()), kernel->gridDim); + + // For now, local work group size is null + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(getGlobalContext())); + + // Global Work Group type is [#dim x i64] + Type* GlobalWGTy = ArrayType::get(Int64Ty, kernel->gridDim); + // Allocate space for Global work group data on stack and get pointer to + // first element. + AllocaInst* GlobalWG = new AllocaInst(GlobalWGTy, "GlobalWGSize", IB); + GlobalWGPtr = BitCastInst::CreatePointerCast(GlobalWG, Int64Ty->getPointerTo(), GlobalWG->getName()+".0", IB); + Value* nextDim = GlobalWGPtr; + DEBUG(errs() << *GlobalWGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for(unsigned i=0; i < kernel->gridDim; i++) { + assert(kernel->globalWGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + if(kernel->globalWGSize[i]->getType() != Int64Ty) { + // If the work group size is given in any other integer type, + // generate code to extend it to i64.
We need to use the mapped value in + // the new generated function, hence the use of VMap + kernel->globalWGSize[i] = BitCastInst::CreateIntegerCast(VMap[kernel->globalWGSize[i]], Int64Ty, true, "", IB); + StoreInst* SI = new StoreInst(kernel->globalWGSize[i], nextDim, IB); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst* SI = new StoreInst(VMap[kernel->globalWGSize[i]], nextDim, IB); + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if(i+1 < kernel->gridDim) { + // Move to next dimension + GetElementPtrInst* GEP = GetElementPtrInst::Create(nextDim, + ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), + GlobalWG->getName()+"."+Twine(i+1), + IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } } - static void changeTargetTriple(Module &M) { - std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; - std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; - - if (TARGET_PTX == 32) - M.setTargetTriple(StringRef(nvptx32_TargetTriple)); - else if (TARGET_PTX == 64) - M.setTargetTriple(StringRef(nvptx64_TargetTriple)); - else assert(false && "Invalid PTX target"); - - return; - } + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); +} + +// Find if argument has the given attribute +static bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) { + return F->getAttributes().hasAttribute(arg_index+1, AK); +} +// Get generated PTX binary name +static std::string getPTXFilename(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".nvptx.s"); + return moduleID; +} + +// Get the name of the input file from module ID +static std::string getFilenameFromModule(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/")+1); +} + +// Changes the data layout of 
the Module to be compiled with NVPTX backend +// TODO: Figure out when to call it, probably after duplicating the modules +static void changeDataLayout(Module &M) { + std::string nvptx32_layoutStr = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; + std::string nvptx64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; + + if (TARGET_PTX == 32) + M.setDataLayout(StringRef(nvptx32_layoutStr)); + else if (TARGET_PTX == 64) + M.setDataLayout(StringRef(nvptx64_layoutStr)); + else assert(false && "Invalid PTX target"); + + return; +} + +static void changeTargetTriple(Module &M) { + std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; + std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; + + if (TARGET_PTX == 32) + M.setTargetTriple(StringRef(nvptx32_TargetTriple)); + else if (TARGET_PTX == 64) + M.setTargetTriple(StringRef(nvptx64_TargetTriple)); + else assert(false && "Invalid PTX target"); + + return; +} // Helper function, generate a string representation of a type - static std::string printType(Type* ty) { - std::string type_str; - raw_string_ostream rso(type_str); - ty->print(rso); - return rso.str(); - } +static std::string printType(Type* ty) { + std::string type_str; + raw_string_ostream rso(type_str); + ty->print(rso); + return rso.str(); +} // Helper function, convert int to string - static std::string convertInt(int number) { - std::stringstream ss;//create a stringstream - ss << number;//add number to the stream - return ss.str();//return a string with the contents of the stream - } +static std::string convertInt(int number) { + std::stringstream ss;//create a stringstream + ss << number;//add number to the stream + return ss.str();//return a string with the contents of the stream +} // Helper function, populate a vector with all return statements in a function - static void 
findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - Instruction *I = &(*i); - ReturnInst* RI = dyn_cast<ReturnInst>(I); - if (RI) { - ReturnInstVec.push_back(RI); - } +static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + ReturnInst* RI = dyn_cast<ReturnInst>(I); + if (RI) { + ReturnInstVec.push_back(RI); } } +} } // End of namespace -- GitLab