diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 7df93bd099e837335656eb6f5f30aa34e4cb0f41..190b9af288bbd77f11e720fd979dc6eb0832bb82 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -75,20 +75,29 @@ public: unsigned blockDim; std::vector<Value*> globalWGSize; std::vector<Value*> localWGSize; + std::vector<int> localDimMap; std::vector<unsigned> getInArgMap() { return inArgMap; } + void setInArgMap(std::vector<unsigned> map) { + inArgMap = map; + } + + void setLocalWGSize(std::vector<Value*> V) { + localWGSize = V; + } + bool hasLocalWG() { - return blockDim == 0; + return blockDim != 0; } }; // Helper function declarations static void getExecuteNodeParams(Value* &, Value* &, Value* &, Kernel*, ValueToValueMapTy&, Instruction*); -static Value* genWorkGroupPtr(std::vector<Value*>&, ValueToValueMapTy&, +static Value* genWorkGroupPtr(std::vector<Value*>, ValueToValueMapTy&, Instruction*, const Twine& WGName = "WGSize"); static bool hasAttribute(Function*, unsigned, Attribute::AttrKind); static std::string getPTXFilename(const Module&); @@ -615,10 +624,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // Read all the pointer arguments which had side effects i.e., had out // attribute + errs() << "Output Pointers : " << OutputPointers.size() << "\n"; for(auto output: OutputPointers) { - DEBUG(errs() << "Read: " << *output.d_ptr << "\n"); - DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n"); - DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); + errs() << "Read: " << *output.d_ptr << "\n"; + errs() << "\tTo: " << *output.h_ptr << "\n"; + errs() << "\t#bytes: " << *output.bytes << "\n"; Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput, @@ -703,7 +713,6 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { } else { DEBUG(errs() << "Found intermediate node. Getting size parameters.\n"); // Keep track of the arguments order. - std::vector<unsigned> map1 = N->getInArgMap(); std::vector<unsigned> map2 = kernel->getInArgMap(); @@ -711,7 +720,33 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { for (unsigned i = 0; i < map2.size(); i++) { map2[i] = map1[map2[i]]; } - + kernel->setInArgMap(map2); + + // Track the source of local dimlimits for the kernel + // Dimension limit can either be a constant or an argument of parent + // function. Since Internal node would no longer exist, we need to insert the + // localWGSize with values from the parent of N. + std::vector<Value*> localWGSizeMapped; + for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if (isa<Constant>(kernel->localWGSize[i])) { + // if constant, use as it is + localWGSizeMapped.push_back(kernel->localWGSize[i]); + } + else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { + // if argument, find the argument location in N. Use InArgMap of N to + // find the source location in Parent of N. Retrieve the argument from + // parent to insert in the vector. + unsigned argNum = Arg->getArgNo(); + unsigned parentArgNum = N->getInArgMap()[argNum]; + Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); + localWGSizeMapped.push_back(A); + } + else { + assert(false && "LocalWGsize using value which is neither argument nor constant!"); + } + } + // Update localWGSize vector of kernel + kernel->setLocalWGSize(localWGSizeMapped); } } @@ -736,20 +771,6 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { if (!pLevel || !pReplFactor) { KernelLaunchNode = PNode; kernel = new Kernel(NULL, N->getInArgMap(), N->getNumOfDim(), N->getDimLimits()); - // TODO: Find a good way of choosing parameters - is this required? - //kernel->gridDim = N->getNumOfDim(); - //kernel->blockDim = N->getNumOfDim(); - //kernel->globalWGSize = N->getDimLimits(); - //kernel->localWGSize = N->getDimLimits(); - //FIXME: Comment this out as we can provide localWGSize as null - //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext()); - // TODO: How to choose the div factor; - //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16); - //std::vector<Value*> tmp(kernel->gridDim, divFactor); - //for (unsigned i = 0; i < kernel->gridDim; i++) { - // BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]); - // kernel->localWGSize.push_back(SDivInst); - //} } else { // Converting a 2-level DFG to opencl kernel @@ -757,15 +778,10 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { KernelLaunchNode = PNode->getParent(); assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); // Contains the instructions generating the kernel configuration parameters - std::vector<Value*> globalWGSizeInsts; - for (unsigned i = 0; i < PNode->getNumOfDim(); i++) { - BinaryOperator* MulInst = BinaryOperator::CreateMul(PNode->getDimLimits()[i],N->getDimLimits()[i]); - globalWGSizeInsts.push_back(MulInst); - } kernel = new Kernel(NULL, // kernel function N->getInArgMap(), // kenel argument mapping PNode->getNumOfDim(), // gridDim - globalWGSizeInsts, // grid size + PNode->getDimLimits(),// grid size N->getNumOfDim(), // blockDim N->getDimLimits()); // block size @@ -885,21 +901,26 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { // The following is to find which function to call Function * OpenCLFunction; - int parentLevel = ParentDFNode->getLevel(); - int parentReplFactor = ParentDFNode->getNumOfDim(); + int parentLevel = N->getParent()->getLevel(); + int parentReplFactor = N->getParent()->getNumOfDim(); + errs() << "Parent Level = " << parentLevel << "\n"; + errs() << "Parent Repl factor = " << parentReplFactor << "\n"; - if (!parentLevel || !parentReplFactor) { + if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) { // We only have one level in the hierarchy or the parent node is not // replicated. This indicates that the parent node is the kernel - // launch, so we need to specify a global id - + // launch, so we need to specify a global id. + // We can translate this only if the argument is the current node + // itself + errs() << "Substitute with get_global_id()\n"; + errs() << *II << "\n"; FunctionType* FT = FunctionType::get(Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/), std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)), false); OpenCLFunction = cast<Function> (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT)); - } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) { + } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { // We are asking for this node's id with respect to its parent // this is a local id call FunctionType* FT = @@ -918,7 +939,11 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { OpenCLFunction = cast<Function> (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT)); } else { - assert(false && "Unable to translate this intrinsic"); + errs() << N->getFuncPointer()->getName() << "\n"; + errs() << N->getParent()->getFuncPointer()->getName() << "\n"; + errs() << *II << "\n"; + + assert(false && "Unable to translate getNodeInstanceID intrinsic"); } // Create call instruction, insert it before the intrinsic and @@ -973,7 +998,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { false); OpenCLFunction = cast<Function> (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT)); - } else if (Leaf_HandleToDFNodeMap[ArgII] == Leaf_HandleToDFNodeMap[II]) { + } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { // We are asking for this node's instances // this is a local size (block dim) call FunctionType* FT = @@ -992,7 +1017,7 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { OpenCLFunction = cast<Function> (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT)); } else { - assert(false && "Unable to translate this intrinsic"); + assert(false && "Unable to translate getNumNodeInstances intrinsic"); } // Create call instruction, insert it before the intrinsic and @@ -1294,11 +1319,20 @@ static void getExecuteNodeParams(Value* &workDim, Value* &LocalWGPtr, Value* LocalWGPtr = genWorkGroupPtr(kernel->localWGSize, VMap, IB, "LocalWGSize"); } - GlobalWGPtr = genWorkGroupPtr(kernel->globalWGSize, VMap, IB, "GlobalWGSize"); + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. + std::vector<Value*> globalWGSizeInsts; + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + GlobalWGPtr = genWorkGroupPtr(globalWGSizeInsts, VMap, IB, "GlobalWGSize"); DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); } -static Value* genWorkGroupPtr(std::vector<Value*>& WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { +// CodeGen for allocating space for Work Group on stack and returning a pointer +// to its address +static Value* genWorkGroupPtr(std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { Value* WGPtr; // Get int64_t and or ease of use Type* Int64Ty = Type::getInt64Ty(getGlobalContext()); @@ -1316,18 +1350,34 @@ static Value* genWorkGroupPtr(std::vector<Value*>& WGSize, ValueToValueMapTy& VM // size in that dimension for(unsigned i=0; i < WGSize.size(); i++) { assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + + // If WGSize[i] is not a constant or a instruction, use mapped value in the new function + Value* WGSizeMapped; + if(isa<Argument>(WGSize[i])) + WGSizeMapped = VMap[WGSize[i]]; + else { + WGSizeMapped = WGSize[i]; + errs() << "Mapping value is not required: "; + errs() << *WGSize[i] << "\n"; + } if(WGSize[i]->getType() != Int64Ty) { // If number of dimensions are mentioned in any other integer format, // generate code to extend it to i64. We need to use the mapped value in // the new generated function, hence the use of VMap // FIXME: Why are we changing the kernel WGSize vector here? - WGSize[i] = BitCastInst::CreateIntegerCast(VMap[WGSize[i]], Int64Ty, true, "", IB); - StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); + errs() << "Not i64. Zero extend required.\n"; + errs() << *WGSize[i] << "\n"; + errs() << *WGSizeMapped << "\n"; + CastInst* CI = BitCastInst::CreateIntegerCast(WGSizeMapped, Int64Ty, true, "", IB); + errs() << "Bitcast done.\n"; + StoreInst* SI = new StoreInst(CI, nextDim, IB); + errs() << "Zero extend done.\n"; DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); } else { // Store the value representing work group size in ith dimension on // stack - StoreInst* SI = new StoreInst(VMap[WGSize[i]], nextDim, IB); + StoreInst* SI = new StoreInst(WGSizeMapped, nextDim, IB); + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); } if(i+1 < WGSize.size()) { diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 32ffaa13410a0321de1c1ad094effe6b6daadb0c..818be97f6ff22d73fe362feff59cb5062bcef987 100644 --- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -90,9 +90,15 @@ public: void codeGenLaunch(DFInternalNode* Root); virtual void visit(DFInternalNode* N) { + // If code has already been generated for this internal node, skip the + // children + if(N->getGenFunc() != NULL) + return; + + DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"); + // Follows a bottom-up approach for code generation. // First generate code for all the child nodes - DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"); for(DFGraph::children_iterator i = N->getChildGraph()->begin(), e = N->getChildGraph()->end(); i != e; ++i) { DFNode* child = *i;