diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp index 05869fa0dfca5b742d828f330e2f17fd21b28aa6..4b392b2c966d802664598c3f2d09331910d6e555 100644 --- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp +++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp @@ -65,12 +65,12 @@ public: virtual void visit(DFInternalNode* N) { // Follows a bottom-up approach for code generation. // First generate code for all the child nodes - DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n"); for(DFGraph::children_iterator i = N->getChildGraph()->begin(), e = N->getChildGraph()->end(); i != e; ++i) { DFNode* child = *i; child->applyDFNodeVisitor(*this); } + DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n"); // Generate code for this internal node now. This way all the cloned // functions for children exist. deleteNode(N); @@ -80,7 +80,7 @@ public: virtual void visit(DFLeafNode* N) { DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n"); deleteNode(N); - //errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"; + errs() << "DONE" << "\n"; } }; diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 190b9af288bbd77f11e720fd979dc6eb0832bb82..869ca5d5927ffb59db62f1056f77ea357daa77d0 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -54,12 +54,12 @@ public: // calls class Kernel { public: - Kernel(Function* _KF, std::vector<unsigned> _inArgMap = + Kernel(Function* _KF, DFLeafNode* _KLeafNode,std::vector<unsigned> _inArgMap = std::vector<unsigned>(), unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) - : KernelFunction(_KF), inArgMap(_inArgMap), + : KernelFunction(_KF), 
KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) { @@ -70,6 +70,7 @@ public: } Function* KernelFunction; + DFLeafNode* KernelLeafNode; std::vector<unsigned> inArgMap; unsigned gridDim; unsigned blockDim; @@ -169,7 +170,7 @@ private: Argument* getArgumentAt(Function* F, unsigned offset); Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore); - void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName); + void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); void codeGen(DFInternalNode* N); void codeGen(DFLeafNode* N); @@ -318,16 +319,16 @@ void CodeGenTraversal::addIdxDimArgs(Function* F) { /* Traverse the function F argument list to get argument at offset*/ Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) { + DEBUG(errs() << "Finding argument " << offset << ":\n"); assert((F->getFunctionType()->getNumParams() > offset && offset >= 0) && "Invalid offset to access arguments!"); - Argument* arg; Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); for(; offset != 0 && i!=e; i++) { offset--; } arg = i; - DEBUG(errs() << *arg <<"\n"); + DEBUG(errs() << "\t" << *arg <<"\n"); return arg; } @@ -388,7 +389,7 @@ Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const // used to generate a function to associate with this leaf node. The function // is responsible for all the memory allocation/transfer and invoking the // kernel call on the device -void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) { +void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { // Check if clone already exists. If it does, it means we have visited this // function before. 
assert(N->getGenFunc() == NULL && "Code already generated for this node"); @@ -398,7 +399,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); // If kernel struct has not been initialized with kernel function, then fail - assert(kernel != NULL && "No kernel found!!"); + assert(K != NULL && "No kernel found!!"); DEBUG(errs() << "Generating kernel call code\n"); @@ -441,6 +442,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa if(!N->isRoot()) addIdxDimArgs(F_X86); + /* TODO: Use this code to verify if this is a good pattern for PTX kernel + // Sort children in topological order before code generation for kernel call N->getChildGraph()->sortChildren(); @@ -464,6 +467,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); Function* CF = C->getFuncPointer(); + */ + Function* KF = K->KernelLeafNode->getFuncPointer(); // Initialize context DEBUG(errs() << "Initializing context" << "\n"); CallInst::Create(llvm_visc_ptx_initContext, None, "", RI); @@ -472,26 +477,26 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // Initialize command queue Value* fileStr = getStringPointer(FileName, RI, "Filename"); DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); - DEBUG(errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n"); - Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName"); + DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n"); + Value* kernelStr = getStringPointer(K->KernelFunction->getName(), RI,"KernelName"); Value* LaunchInstArgs[] = {fileStr, kernelStr}; DEBUG(errs() << "Inserting launch call" << "\n"); CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch, 
ArrayRef<Value*>(LaunchInstArgs, 2), - "graph"+CF->getName(), + "graph"+KF->getName(), RI); DEBUG(errs() << *GraphID << "\n"); // Iterate over the required input edges of the node and use the visc-rt API // to set inputs DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); std::vector<OutputPtr> OutputPointers; - for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { + for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) { // The kernel object gives us the mapping of arguments from kernel launch // node function (F_X86) to kernel (kernel->KF) - Value* inputVal = getArgumentAt(F_X86, kernel->getInArgMap()[i]); + Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]); DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); // input value has been obtained. @@ -502,11 +507,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa if(inputVal->getType()->isPointerTy()) { // Pointer Input // CheckAttribute - Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False; - Value* isInput = ((hasAttribute(CF, i, Attribute::Out)) - && !(hasAttribute(CF, i, Attribute::In)))? False : True; + Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False; + Value* isInput = ((hasAttribute(KF, i, Attribute::Out)) + && !(hasAttribute(KF, i, Attribute::In)))? 
False : True; - Argument* A = getArgumentAt(CF, i); + Argument* A = getArgumentAt(KF, i); if(isOutput == True) { DEBUG(errs() << *A << " is an OUTPUT argument\n"); } @@ -519,7 +524,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa Type::getInt8PtrTy(M.getContext()), inputVal->getName()+".i8ptr", RI); - Value* inputSize = getArgumentAt(F_X86, kernel->getInArgMap()[i+1]); + Value* inputSize = getArgumentAt(F_X86, K->getInArgMap()[i+1]); assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) && "Pointer type input must always be followed by size (integer type)"); Value* setInputArgs[] = {GraphID, @@ -560,11 +565,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa DEBUG(errs() << "Setup output edges of node and insert visc api\n"); // Set output if struct is not an empty struct - StructType* OutputTy = C->getOutputType(); + StructType* OutputTy = K->KernelLeafNode->getOutputType(); Value *outputSize, *d_Output; if(!OutputTy->isEmptyTy()) { // Not an empty struct - unsigned outputIndex = CF->getFunctionType()->getNumParams(); + unsigned outputIndex = KF->getFunctionType()->getNumParams(); outputSize = ConstantExpr::getSizeOf(OutputTy); Value* setOutputArgs[] = {GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), @@ -576,7 +581,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr, ArrayRef<Value*>(setOutputArgs, 6), - "d_output."+CF->getName(), + "d_output."+KF->getName(), RI); } @@ -586,7 +591,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // pass it as an argument to ExecNode Value *workDim, *LocalWGPtr, *GlobalWGPtr; - getExecuteNodeParams(workDim, LocalWGPtr, GlobalWGPtr, kernel, VMap, RI); + getExecuteNodeParams(workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI); Value* ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, @@ -594,7 +599,7 @@ void 
CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa }; CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode, ArrayRef<Value*>(ExecNodeArgs, 4), - "event."+CF->getName(), + "event."+KF->getName(), RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); @@ -613,13 +618,13 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa }; CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), - "h_output."+CF->getName()+".addr", + "h_output."+KF->getName()+".addr", RI); // Read each device pointer listed in output struct // Load the output struct - CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI); - Value* KernelOutput = new LoadInst(BI, "output."+CF->getName(), RI); - OutputMap[C] = KernelOutput; + CastInst* BI = BitCastInst::CreatePointerCast(h_Output, KF->getReturnType()->getPointerTo(), "output.ptr", RI); + Value* KernelOutput = new LoadInst(BI, "output."+KF->getName(), RI); + OutputMap[K->KernelLeafNode] = KernelOutput; } // Read all the pointer arguments which had side effects i.e., had out @@ -639,7 +644,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings // Get Exit node - C = N->getChildGraph()->getExit(); + DFNode* C = N->getChildGraph()->getExit(); // Get OutputType of this node StructType* OutTy = N->getOutputType(); Value *retVal = UndefValue::get(F_X86->getReturnType()); @@ -665,6 +670,10 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa else { // edge is from a internal node // Check - code should already be generated for this source dfnode + // FIXME: Since the 2-level kernel code gen has a specific structure, we + // can assume the SrcDF is the same as Kernel Leaf node. 
+ // Use outArgMap to get correct mapping + SrcDF = K->KernelLeafNode; assert(OutputMap.count(SrcDF) && "Source node call not found. Dependency violation!"); @@ -707,7 +716,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { // Now the remaining nodes to be visited should be ignored KernelLaunchNode = NULL; errs() << "Insert Runtime calls\n"; - insertRuntimeCalls(N, getPTXFilename(M)); + insertRuntimeCalls(N, kernel, getPTXFilename(M)); writeKernelsModule(); } else { @@ -770,7 +779,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { // (2) Parent does not have multiple instances if (!pLevel || !pReplFactor) { KernelLaunchNode = PNode; - kernel = new Kernel(NULL, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits()); + kernel = new Kernel(NULL, N, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits()); } else { // Converting a 2-level DFG to opencl kernel @@ -779,6 +788,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); // Contains the instructions generating the kernel configuration parameters kernel = new Kernel(NULL, // kernel function + N, // kernel leaf node N->getInArgMap(), // kenel argument mapping PNode->getNumOfDim(), // gridDim PNode->getDimLimits(),// grid size @@ -1316,9 +1326,17 @@ static void getExecuteNodeParams(Value* &workDim, Value* &LocalWGPtr, Value* LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(getGlobalContext())); } else { + for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if(isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } LocalWGPtr = genWorkGroupPtr(kernel->localWGSize, VMap, IB, "LocalWGSize"); } + for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if(isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } // For OpenCL, global work group size is the total bumber of instances in each // dimension. 
So, multiply local and global dim limits. std::vector<Value*> globalWGSizeInsts; @@ -1350,16 +1368,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa // size in that dimension for(unsigned i=0; i < WGSize.size(); i++) { assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); - - // If WGSize[i] is not a constant or a instruction, use mapped value in the new function - Value* WGSizeMapped; - if(isa<Argument>(WGSize[i])) - WGSizeMapped = VMap[WGSize[i]]; - else { - WGSizeMapped = WGSize[i]; - errs() << "Mapping value is not required: "; - errs() << *WGSize[i] << "\n"; - } + if(WGSize[i]->getType() != Int64Ty) { // If number of dimensions are mentioned in any other integer format, // generate code to extend it to i64. We need to use the mapped value in @@ -1367,8 +1376,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa // FIXME: Why are we changing the kernel WGSize vector here? errs() << "Not i64. Zero extend required.\n"; errs() << *WGSize[i] << "\n"; - errs() << *WGSizeMapped << "\n"; - CastInst* CI = BitCastInst::CreateIntegerCast(WGSizeMapped, Int64Ty, true, "", IB); + CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); errs() << "Bitcast done.\n"; StoreInst* SI = new StoreInst(CI, nextDim, IB); errs() << "Zero extend done.\n"; @@ -1376,7 +1384,7 @@ static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMa } else { // Store the value representing work group size in ith dimension on // stack - StoreInst* SI = new StoreInst(WGSizeMapped, nextDim, IB); + StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); DEBUG(errs() << "\t Work group size: " << *SI << "\n"); }