diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h index 11473cec41f0df45b14b0bc6ab4b480c08a5aedc..ab17261f19d23228e7aaf2903a19e010249e0146 100644 --- a/llvm/include/llvm/IR/DFGraph.h +++ b/llvm/include/llvm/IR/DFGraph.h @@ -373,7 +373,8 @@ public: bool isExitNode(); DFEdge* getInDFEdgeAt(unsigned inPort); DFEdge* getOutDFEdgeAt(unsigned outPort); - std::vector<unsigned> getInArgMap(); + std::map<unsigned, unsigned> getInArgMap(); + std::map<unsigned, Value*> getSharedInArgMap(); std::vector<unsigned> getOutArgMap(); int getAncestorHops(DFNode* N); bool hasSideEffects(); @@ -652,16 +653,31 @@ DFEdge* DFNode::getOutDFEdgeAt(unsigned outPort) { return NULL; } -std::vector<unsigned> DFNode::getInArgMap() { - std::vector<unsigned> map(InDFEdges.size()); +// Ignore Allocation Nodes +std::map<unsigned, unsigned> DFNode::getInArgMap() { + std::map<unsigned, unsigned> map; for (unsigned i = 0; i < InDFEdges.size(); i++) { DFEdge* E = getInDFEdgeAt(i); + if (E->getSourceDF()->isAllocationNode()) + continue; unsigned pos = E->getSourcePosition(); map[i] = pos; } return map; } +// Only Allocation Nodes +std::map<unsigned, Value*> DFNode::getSharedInArgMap() { + std::map<unsigned, Value*> map; + for (unsigned i = 0; i < InDFEdges.size(); i++) { + DFEdge* E = getInDFEdgeAt(i); + if (!E->getSourceDF()->isAllocationNode()) + continue; + map[i] = NULL; + } + return map; +} + std::vector<unsigned> DFNode::getOutArgMap() { std::vector<unsigned> map(OutDFEdges.size()); for (unsigned i = 0; i < OutDFEdges.size(); i++) { diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 525367fa05853010fa29f85b476bd44211a0899b..b3ea4c6216776bc380c0e46e3d1c35e3ad63aee3 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -11,6 +11,7 @@ #define TARGET_PTX 32 #define GENERIC_ADDRSPACE 0 #define GLOBAL_ADDRSPACE 1 +#define 
SHARED_ADDRSPACE 3 #define DEBUG_TYPE "DFG2LLVM_NVPTX" #include "llvm/IR/DataLayout.h" @@ -58,15 +59,16 @@ public: // calls class Kernel { public: - Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::vector<unsigned> _inArgMap = - std::vector<unsigned>(), std::vector<unsigned> _outArgMap = + Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap = + std::map<unsigned, unsigned>(), std::map<unsigned, Value*> _sharedInArgMap = + std::map<unsigned, Value*>(), std::vector<unsigned> _outArgMap = std::vector<unsigned>(), unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), - outArgMap(_outArgMap), gridDim(_gridDim), globalWGSize(_globalWGSize), - blockDim(_blockDim), localWGSize(_localWGSize) { + sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim), + globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) { assert(gridDim == globalWGSize.size() && "gridDim should be same as the size of vector globalWGSize"); @@ -76,7 +78,13 @@ public: Function* KernelFunction; DFLeafNode* KernelLeafNode; - std::vector<unsigned> inArgMap; + std::map<unsigned, unsigned> inArgMap; + // Map for shared memory arguments + std::map<unsigned, Value*> sharedInArgMap; + // Fields for (potential) allocation node + DFLeafNode* AllocationNode; + std::map<unsigned, unsigned> allocInArgMap; + std::vector<unsigned> outArgMap; unsigned gridDim; std::vector<Value*> globalWGSize; @@ -84,13 +92,20 @@ public: std::vector<Value*> localWGSize; std::vector<int> localDimMap; - std::vector<unsigned> getInArgMap() { + std::map<unsigned, unsigned> getInArgMap() { return inArgMap; } - void setInArgMap(std::vector<unsigned> map) { + void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; } + std::map<unsigned, Value*> getSharedInArgMap() { + return 
sharedInArgMap; + } + void setSharedInArgMap(std::map<unsigned, Value*> map) { + sharedInArgMap = map; + } + std::vector<unsigned> getOutArgMap() { return outArgMap; } @@ -138,7 +153,7 @@ private: Module &KernelM; DFNode* KernelLaunchNode; Kernel* kernel; - + // VISC Runtime API Constant* llvm_visc_ocl_launch; Constant* llvm_visc_ocl_wait; @@ -155,6 +170,7 @@ private: std::string getKernelsModuleName(Module &M); void fixValueAddrspace(Value* V, unsigned addrspace); void changeArgAddrspace(Function* F, unsigned i); + void changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i); void addCLMetadata(Function* F); void transformFunctionToVoid(Function* F); void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); @@ -403,11 +419,21 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi // Vector to hold the device memory object that need to be cleared before we release // context std::vector<Value*> DevicePointers; + + std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap(); +/* for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) { // The kernel object gives us the mapping of arguments from kernel launch // node function (F_X86) to kernel (kernel->KF) Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]); + +*/ + + for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(), + ie = kernelInArgMap.end(); ib != ie; ++ib) { + unsigned i = ib->first; + Value* inputVal = getArgumentAt(F_X86, ib->second); DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); // input value has been obtained. 
@@ -437,7 +463,11 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi Type::getInt8PtrTy(M.getContext()), inputVal->getName()+".i8ptr", RI); - Value* inputSize = getArgumentAt(F_X86, K->getInArgMap()[i+1]); + + // Assert that the pointer argument size (next argument) is in the map + assert(kernelInArgMap.find(i+1) != kernelInArgMap.end()); + + Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]); assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) && "Pointer type input must always be followed by size (integer type)"); Value* setInputArgs[] = {GraphID, @@ -475,10 +505,56 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi CallInst::Create(llvm_visc_ocl_argument_scalar, ArrayRef<Value*>(setInputArgs, 4), "", RI); } - } DEBUG(errs() << "Setup output edges of node and insert visc api\n"); + std::map<unsigned, Value*> kernelSharedInArgMap = K->getSharedInArgMap(); + + for(std::map<unsigned, Value*>::iterator ib = kernelSharedInArgMap.begin(), + ie = kernelSharedInArgMap.end(); ib != ie; ++ib) { + unsigned i = ib->first; + Value* inputVal = ib->second; + + DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); + + // input value has been obtained. 
+ // inputVal is a scalar value + if (i % 2 == 0) { // Shared memory ptr argument - scalar at size position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + + assert(isa<Constant>(inputVal) && "Only constant shared memory size is supported"); + + Value* setInputArgs[] = {GraphID, + ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())), + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + inputVal + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + + } else { // Shared memory size argument - scalar at address position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".ptr", RI); + StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, + Type::getInt8PtrTy(M.getContext()), + kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + ConstantExpr::getSizeOf(inputVal->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + } + } + DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n"); + // Set output if struct is not an empty struct StructType* OutputTy = K->KernelLeafNode->getOutputType(); std::vector<Value*> d_Outputs; @@ -675,13 +751,17 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { } else { DEBUG(errs() << "Found intermediate node. Getting size parameters.\n"); // Keep track of the arguments order. 
- std::vector<unsigned> inmap1 = N->getInArgMap(); - std::vector<unsigned> inmap2 = kernel->getInArgMap(); - // TODO: Verify when we have incoming edges from more than one nodes The - // limit is the size of inmap2, because this is the number of kernel - // arguments - for (unsigned i = 0; i < inmap2.size(); i++) { inmap2[i] = - inmap1[inmap2[i]]; } kernel->setInArgMap(inmap2); + std::map<unsigned, unsigned> inmap1 = N->getInArgMap(); + std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap(); + // TODO: Structure assumed: one thread node, one allocation node (at most), + // TB node + std::map<unsigned, unsigned> inmapFinal; + for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end(); + ib != ie; ++ib) { + inmapFinal[ib->first] = inmap1[ib->second]; + } + + kernel->setInArgMap(inmapFinal); // Keep track of the output arguments order. std::vector<unsigned> outmap1 = N->getOutArgMap(); @@ -715,6 +795,10 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { // find the source location in Parent of N. Retrieve the argument from // parent to insert in the vector. 
unsigned argNum = Arg->getArgNo(); + // This argument will be coming from the parent node, not the allocation + // Node + assert(N->getInArgMap().find(argNum) != N->getInArgMap().end()); + unsigned parentArgNum = N->getInArgMap()[argNum]; Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); localWGSizeMapped.push_back(A); @@ -736,13 +820,18 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { DEBUG(errs() << "Skipping dummy node\n"); return; } + + // Skip code generation if it is an allocation node + if(N->isAllocationNode()) { + DEBUG(errs() << "Skipping allocation node\n"); + return; + } // Generate code only if it has the right hint if(!checkPreferredTarget(N, visc::GPU_TARGET)) { errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; return; } - // Checking which node is the kernel launch DFNode* PNode = N->getParent(); @@ -758,6 +847,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { kernel = new Kernel(NULL, N, N->getInArgMap(), + N->getSharedInArgMap(), N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits()); @@ -771,6 +861,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { kernel = new Kernel(NULL, // kernel function N, // kernel leaf node N->getInArgMap(), // kenel argument mapping + N->getSharedInArgMap(), N->getOutArgMap(), // kernel output mapping from the leaf to the interemediate node PNode->getNumOfDim(), // gridDim PNode->getDimLimits(),// grid size @@ -812,6 +903,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { transformFunctionToVoid(F_nvptx); + + //FIXME: For now, assume only one allocation node + kernel->AllocationNode = NULL; + + for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end(); + ieb != iee; ++ieb) { + DFNode *SrcDFNode = (*ieb)->getSourceDF(); + errs() << "Found edge from node: "<< SrcDFNode->getFuncPointer()->getName() << "\n"; + errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n"; + if (SrcDFNode != PNode) { + assert(SrcDFNode->isAllocationNode()); + 
kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode); + kernel->allocInArgMap = SrcDFNode->getInArgMap(); + break; + } + } + + // Vector for shared memory arguments + std::vector<unsigned> SharedMemArgs; + + // If no allocation node was found, SharedMemArgs is empty + if (kernel->AllocationNode) { + std::map<unsigned, Value*> sharedInMap = kernel->getSharedInArgMap(); + AllocationNodeProperty* APN = + (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation); + for (auto& AllocPair: APN->getAllocationList()) { + unsigned destPos = AllocPair.first->getDestPosition(); + SharedMemArgs.push_back(destPos); + sharedInMap[destPos] = AllocPair.second; + sharedInMap[destPos+1] = AllocPair.second; + } + kernel->setSharedInArgMap(sharedInMap); + } + std::sort(SharedMemArgs.begin(), SharedMemArgs.end()); + + /* At this point, we assume that checks for the fact that SharedMemArgs only + contains pointer arguments to GLOBAL_ADDRSPACE have been performed by the + analysis pass */ + + changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE); + + // Go through all the instructions for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { Instruction *I = &(*i); @@ -1138,6 +1271,25 @@ void CGT_NVPTX::changeArgAddrspace(Function* F, unsigned addrspace) { DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n"); } +void CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { + unsigned idx = 0; + std::vector<Type*> ArgTypes; + for(auto& arg: F->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + unsigned argno = arg.getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(&arg, addrspace); + idx++; + } + ArgTypes.push_back(arg.getType()); + } + FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, false); + PointerType* PTy = FTy->getPointerTo(cast<PointerType>(F->getType())->getAddressSpace()); + + F->mutateType(PTy); + DEBUG(errs() << 
*F->getFunctionType() << "\n" <<*F << "\n"); +} + /* Add metadata to module KernelM, for OpenCL kernels */ void CGT_NVPTX::addCLMetadata(Function *F) {