From 330fc7698e1ad28ddc6770e21255e480c737ab3d Mon Sep 17 00:00:00 2001 From: Maria Kotsifakou <kotsifa2@illinois.edu> Date: Mon, 7 Aug 2017 16:19:05 -0500 Subject: [PATCH] Compiler and runtime support for scheduling on different target (CPU/GPU/SPIR) based on policy. Support for scheduling per iteration included. Further testing required. --- llvm/include/llvm/IR/DFGraph.h | 157 +++++ llvm/include/llvm/SupportVISC/DFG2LLVM.h | 40 ++ llvm/include/llvm/SupportVISC/VISCHint.h | 6 +- llvm/include/llvm/SupportVISC/VISCUtils.h | 182 ++++++ llvm/lib/Transforms/BuildDFG/BuildDFG.cpp | 20 +- .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp | 28 +- .../Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp | 577 +++++++++++++++--- llvm/lib/Transforms/GenVISC/GenVISC.cpp | 60 +- llvm/projects/visc-rt/CMakeLists.txt | 7 + llvm/projects/visc-rt/policy.h | 35 ++ llvm/projects/visc-rt/visc-rt.cpp | 14 + llvm/projects/visc-rt/visc-rt.h | 8 +- 12 files changed, 958 insertions(+), 176 deletions(-) create mode 100644 llvm/projects/visc-rt/policy.h diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h index 210270e809..d068301bc7 100644 --- a/llvm/include/llvm/IR/DFGraph.h +++ b/llvm/include/llvm/IR/DFGraph.h @@ -29,6 +29,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/GraphWriter.h" #include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCUtils.h" namespace llvm { @@ -42,6 +43,17 @@ class DFTreeTraversal; class DFEdgeVisitor; class DFGraph; +struct TargetGenFunctions { + Function *CPUGenFunc; + Function *GPUGenFunc; + Function *SPIRGenFunc; +}; + +struct TargetGenFuncInfo { + bool cpu_hasX86Func; + bool gpu_hasX86Func; + bool spir_hasX86Func; +}; class DFGraph { @@ -194,6 +206,12 @@ private: IntrinsicInst* II; ///< Associated IntrinsicInst/Value Function* FuncPointer; ///< Associated Function Function* GenFunc = NULL; ///< Associated Function generated by backend + struct TargetGenFunctions GenFuncs; + ///< Associated Functions generated by backends + ///< 
(if multiple are available) + struct TargetGenFuncInfo GenFuncInfo; + ///< True for each target generated function + ///< if the associated genFunc is an x86 function DFInternalNode* Parent; ///< Pointer to parent dataflow Node unsigned NumOfDim; ///< Number of dimensions std::vector<Value*> DimLimits; ///< Number of instances in each dimension @@ -375,6 +393,136 @@ public: return GenFunc; } + void setHasX86FuncForTarget(visc::Target T, bool isX86Func) { + switch (T) { + case visc::CPU_TARGET: + GenFuncInfo.cpu_hasX86Func = isX86Func; + break; + case visc::GPU_TARGET: + GenFuncInfo.gpu_hasX86Func = isX86Func; + break; + case visc::SPIR_TARGET: + GenFuncInfo.spir_hasX86Func = isX86Func; + break; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && "Single target expected (CPU/GPU/SPIR)\n"); + break; + default: + assert(false && "Unknown target\n"); + break; + } + return; + } + + bool hasX86GenFuncForTarget(visc::Target T) { + switch (T) { + case visc::CPU_TARGET: + return GenFuncInfo.cpu_hasX86Func; + case visc::GPU_TARGET: + return GenFuncInfo.gpu_hasX86Func; + case visc::SPIR_TARGET: + return GenFuncInfo.spir_hasX86Func; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && "Single target expected (CPU/GPU/SPIR)\n"); + return false; + default: + assert(false && "Unknown target\n"); + return false; + } + + } + + void addGenFunc(Function* F, visc::Target T, bool isX86Func) { + + switch (T) { + case visc::CPU_TARGET: + if (GenFuncs.CPUGenFunc != NULL) { + errs() << "Warning: Second generated CPU function for node " + << FuncPointer->getName() << "\n"; + } + GenFuncs.CPUGenFunc = F; + GenFuncInfo.cpu_hasX86Func = isX86Func; + break; + case visc::GPU_TARGET: + if (GenFuncs.GPUGenFunc != NULL) { + errs() << "Warning: Second generated GPU function for node " + << FuncPointer->getName() << "\n"; + } + GenFuncs.GPUGenFunc = F; + GenFuncInfo.gpu_hasX86Func = isX86Func; + break; + case visc::SPIR_TARGET: + if 
(GenFuncs.SPIRGenFunc != NULL) { + errs() << "Warning: Second generated SPIR function for node " + << FuncPointer->getName() << "\n"; + } + GenFuncs.SPIRGenFunc = F; + GenFuncInfo.spir_hasX86Func = isX86Func; + break; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && + "A node function should be set with a tag specifying its \ + type, not the node hint itself\n"); + break; + default: + assert(false && "Unknown target for generated function\n"); + break; + } + + Tag = viscUtils::getUpdatedTag(Tag,T); + } + + Function* getGenFuncForTarget(visc::Target T) { + switch (T) { + case visc::CPU_TARGET: + return GenFuncs.CPUGenFunc; + case visc::GPU_TARGET: + return GenFuncs.GPUGenFunc; + case visc::SPIR_TARGET: + return GenFuncs.SPIRGenFunc; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && + "Requesting genarated node function with dual tag instead of \ + CPU/GPU/SPIR\n"); + return NULL; + default: + assert(false && "Unknown target for generated function\n"); + return NULL; + } + } + + void removeGenFuncForTarget(visc::Target T) { + errs() << "Target tag = " << T << "\n"; + switch (T) { + case visc::CPU_TARGET: + GenFuncs.CPUGenFunc = NULL; + GenFuncInfo.cpu_hasX86Func = false; + break; + case visc::GPU_TARGET: + GenFuncs.GPUGenFunc = NULL; + GenFuncInfo.gpu_hasX86Func = false; + break; + case visc::SPIR_TARGET: + GenFuncs.SPIRGenFunc = NULL; + GenFuncInfo.spir_hasX86Func = false; + break; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && + "Removing genarated node function with dual tag instead of \ + CPU/GPU/SPIR\n"); + break; + default: + assert(false && "Unknown target for generated function\n"); + break; + } + return; + } + void setTargetHint(visc::Target T) { Hint = T; } @@ -645,6 +793,15 @@ DFNode::DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint, OutputType = cast<StructType>(Ty); Level = (_Parent) ? 
_Parent->getLevel() + 1 : 0 ; Rank = 0; + + Tag = visc::None; + GenFuncs.CPUGenFunc = NULL; + GenFuncs.GPUGenFunc = NULL; + GenFuncs.SPIRGenFunc = NULL; + + GenFuncInfo.cpu_hasX86Func = false; + GenFuncInfo.gpu_hasX86Func = false; + GenFuncInfo.spir_hasX86Func = false; } void DFNode::setRank(int r) { diff --git a/llvm/include/llvm/SupportVISC/DFG2LLVM.h b/llvm/include/llvm/SupportVISC/DFG2LLVM.h index 5cc8a79564..a036d255c8 100644 --- a/llvm/include/llvm/SupportVISC/DFG2LLVM.h +++ b/llvm/include/llvm/SupportVISC/DFG2LLVM.h @@ -111,6 +111,7 @@ public: CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} static bool checkPreferredTarget(DFNode* N, visc::Target T); + static bool preferredTargetIncludes(DFNode* N, visc::Target T); virtual void visit(DFInternalNode* N) { @@ -169,6 +170,45 @@ bool CodeGenTraversal::checkPreferredTarget(DFNode* N, visc::Target T) { return false; } +bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) { + Function* F = N->getFuncPointer(); + Module* M = F->getParent(); + std::vector<NamedMDNode *> HintNode; + switch (T) { + case visc::GPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + break; + case visc::SPIR_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + break; + case visc::CPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + break; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && "Target should be one of CPU/GPU/SPIR\n"); + break; + default: + llvm_unreachable("Target Not supported yet!"); + } + + for (unsigned h = 0; h < HintNode.size(); h++) { + for (unsigned i = 0; i < 
HintNode[h]->getNumOperands(); i++) { + MDNode *MetaNode = HintNode[h]->getOperand(i); + Value *FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if (F == FHint) + return true; + } + } + + return false; +} + + // Generate Code for declaring a constant string [L x i8] and return a pointer // to the start of it. Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { diff --git a/llvm/include/llvm/SupportVISC/VISCHint.h b/llvm/include/llvm/SupportVISC/VISCHint.h index 405b4e359f..aa9ee86364 100644 --- a/llvm/include/llvm/SupportVISC/VISCHint.h +++ b/llvm/include/llvm/SupportVISC/VISCHint.h @@ -20,10 +20,14 @@ namespace visc { CPU_TARGET, GPU_TARGET, SPIR_TARGET, + CPU_OR_GPU_TARGET, + CPU_OR_SPIR_TARGET, +// ALL_TARGETS, NUM_TARGETS }; #ifdef __cplusplus } #endif -#endif //VISC_RT_HEADER + +#endif //VISC_HINT_HEADER diff --git a/llvm/include/llvm/SupportVISC/VISCUtils.h b/llvm/include/llvm/SupportVISC/VISCUtils.h index 47e7582e5e..0077c81441 100644 --- a/llvm/include/llvm/SupportVISC/VISCUtils.h +++ b/llvm/include/llvm/SupportVISC/VISCUtils.h @@ -8,13 +8,24 @@ // //===----------------------------------------------------------------------===// +#ifndef VISC_UTILS_HEADER +#define VISC_UTILS_HEADER + +#include <assert.h> + #include "llvm/IR/Module.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/IR/Metadata.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/SupportVISC/VISCHint.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -147,6 +158,16 @@ void fixHintMetadata(Module &M, Function* F, Function* G) { if(HintNode->getOperand(i) == MDT_F) HintNode->setOperand(0, MDT_G); } + HintNode = M.getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + 
for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + HintNode = M.getOrInsertNamedMetadata("visc_hint_cpu_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } } // Assuming that the changed function is a node function, it is only used as a @@ -276,6 +297,167 @@ Function* cloneFunction(Function* F, FunctionType* newFT, bool return newF; } + //------------------- Helper Functions For Handling Hints -------------------// + +// Return true if the (possibly dual) tag in the 1st arg covers the target in the 2nd +bool tagIncludesTarget(visc::Target Tag, visc::Target T) { + switch (Tag) { + case visc::None: + return false; + case visc::CPU_TARGET: + if (T == visc::CPU_TARGET) + return true; + else + return false; + case visc::GPU_TARGET: + if (T == visc::GPU_TARGET) + return true; + else + return false; + case visc::SPIR_TARGET: + if (T == visc::SPIR_TARGET) + return true; + else + return false; + case visc::CPU_OR_GPU_TARGET: + if ((T == visc::CPU_TARGET) || + (T == visc::GPU_TARGET) || + (T == visc::CPU_OR_GPU_TARGET)) + return true; + else + return false; + case visc::CPU_OR_SPIR_TARGET: + if ((T == visc::CPU_TARGET) || + (T == visc::SPIR_TARGET) || + (T == visc::CPU_OR_SPIR_TARGET)) + return true; + else + return false; + default: + llvm_unreachable("Unknown Target\n"); + } +} + +bool isSingleTargetTag(visc::Target T) { + return ((T == visc::CPU_TARGET) || + (T == visc::GPU_TARGET) || + (T == visc::SPIR_TARGET)); +} + +// Add the specified target to the given tag +visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) { + assert(((T == visc::CPU_TARGET) || + (T == visc::GPU_TARGET) || + (T == visc::SPIR_TARGET)) && + "The target is only allowed to be CPU, GPU, or SPIR\n"); + + switch (Tag) { + case visc::None: + return T; + case visc::CPU_TARGET: + if (T == visc::CPU_TARGET) + return visc::CPU_TARGET; + if (T == 
visc::GPU_TARGET) + return visc::CPU_OR_GPU_TARGET; + if (T == visc::SPIR_TARGET) + return visc::CPU_OR_SPIR_TARGET; + case visc::GPU_TARGET: + assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n"); + if (T == visc::CPU_TARGET) + return visc::CPU_OR_GPU_TARGET; + if (T == visc::GPU_TARGET) + return visc::GPU_TARGET; + case visc::SPIR_TARGET: + assert((T != visc::GPU_TARGET) && "Unsupported target combination\n"); + if (T == visc::CPU_TARGET) + return visc::CPU_OR_SPIR_TARGET; + if (T == visc::SPIR_TARGET) + return visc::SPIR_TARGET; + case visc::CPU_OR_GPU_TARGET: + assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n"); + return visc::CPU_OR_GPU_TARGET; + case visc::CPU_OR_SPIR_TARGET: + assert((T != visc::GPU_TARGET) && "Unsupported target combination\n"); + return visc::CPU_OR_SPIR_TARGET; + default: + llvm_unreachable("Unknown Target\n"); + } +} + +// This function adds the hint as metadata in visc code +void addHint(Function* F, visc::Target T) { + // Get Module + Module* M = F->getParent(); + DEBUG(errs() << "Set preferred target for " << F->getName() << ": "); + + // Based on the hint, get the hint metadata + NamedMDNode* HintNode; + switch (T) { + case visc::GPU_TARGET: + DEBUG(errs() << "GPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::SPIR_TARGET: + DEBUG(errs() << "SPIR Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + break; + case visc::CPU_TARGET: + DEBUG(errs() << "CPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + case visc::CPU_OR_GPU_TARGET: + DEBUG(errs() << "CPU or GPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + break; + case visc::CPU_OR_SPIR_TARGET: + DEBUG(errs() << "CPU or SPIR Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir"); + break; + default: + llvm_unreachable("Unsupported Target Hint!"); + break; + } + + // Create a node for the function and 
add it to the hint node + MDTuple* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F))); + HintNode->addOperand(N); +} + +visc::Target getPreferredTarget(Function* F) { + DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); + Module* M = F->getParent(); + NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::GPU_TARGET; + } + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::SPIR_TARGET; + } + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_OR_GPU_TARGET; + } + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_OR_SPIR_TARGET; + } + return visc::CPU_TARGET; +} + } // End of namespace +#endif //VISC_UTILS_HEADER diff --git a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp index 0abeb095a9..04b01e332b 100644 --- a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp +++ b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp @@ -16,24 +16,12 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" #include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCUtils.h" using namespace llvm; 
namespace builddfg { -static visc::Target getPreferredTarget(Function* F) { - DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); - Module* M = F->getParent(); - NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); - for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { - MDNode* N = HintNode->getOperand(i); - Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); - if(F == FHint) - return visc::GPU_TARGET; - } - return visc::CPU_TARGET; -} - bool BuildDFG::runOnModule(Module &M) { errs() << "\nBUILDDFG PASS\n"; DEBUG(errs() << "-------- Searching for launch sites ----------\n"); @@ -55,7 +43,7 @@ bool BuildDFG::runOnModule(Module &M) { // Intrinsic Instruction has been initialized from this point on. Function* F = cast<Function>(II->getOperand(0)->stripPointerCasts()); - Root = DFInternalNode::Create(II, F, getPreferredTarget(F)); + Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F)); Roots.push_back(Root); BuildGraph(Root, F); @@ -198,14 +186,14 @@ void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) { if(isInternalNode) { // Create Internal DFNode, add it to the map and recursively build its // dataflow graph - DFInternalNode* childDFNode = DFInternalNode::Create(II, F, getPreferredTarget(F), N, numOfDim, dimLimits); + DFInternalNode* childDFNode = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; BuildGraph(childDFNode, F); } else { // Create Leaf DFnode and add it to the map. 
- DFLeafNode* childDFNode = DFLeafNode::Create(II, F, getPreferredTarget(F), N, numOfDim, dimLimits); + DFLeafNode* childDFNode = DFLeafNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; } diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index f4b7fe2500..bb83544969 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -321,7 +321,10 @@ void CGT_NVPTX::initRuntimeAPI() { void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { // Check if clone already exists. If it does, it means we have visited this // function before. - assert(N->getGenFunc() == NULL && "Code already generated for this node"); +// assert(N->getGenFunc() == NULL && "Code already generated for this node"); + + assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL && + "Code already generated for this node"); // Useful values Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); @@ -362,7 +365,8 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi UndefValue::get(F_X86->getReturnType()), BB); //Add the generated function info to DFNode - N->setGenFunc(F_X86, visc::CPU_TARGET); +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::GPU_TARGET, true); // FIXME: Adding Index and Dim arguments are probably not required except // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do @@ -814,7 +818,7 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi // Right now, only targeting the one level case. 
In general, device functions // can return values so we don't need to change them void CGT_NVPTX::codeGen(DFInternalNode* N) { - errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"; + errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n"; if(KernelLaunchNode == NULL) errs () << "No kernel launch node\n"; else { @@ -901,6 +905,7 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) { } void CGT_NVPTX::codeGen(DFLeafNode* N) { + errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n"; // Skip code generation if it is a dummy node if(N->isDummyNode()) { @@ -915,7 +920,11 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { } // Generate code only if it has the right hint - if(!checkPreferredTarget(N, visc::GPU_TARGET)) { +// if(!checkPreferredTarget(N, visc::GPU_TARGET)) { +// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; +// return; +// } + if(!preferredTargetIncludes(N, visc::GPU_TARGET)) { errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; return; } @@ -969,7 +978,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // Look up if we have visited this function before. If we have, then just // get the cloned function pointer from DFNode. Otherwise, create the cloned // function and add it to the DFNode GenFunc. 
- Function *F_nvptx = N->getGenFunc(); +// Function *F_nvptx = N->getGenFunc(); + Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET); assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated"); // Clone the function @@ -978,13 +988,12 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { Twine FName = F->getName(); F_nvptx = CloneFunction(F, VMap); F_nvptx->setName(FName+"_nvptx"); - errs() << "Old Function Name: " << F->getName() << "\n"; - errs() << "New Function Name: " << F_nvptx->getName() << "\n"; +// errs() << "Old Function Name: " << F->getName() << "\n"; +// errs() << "New Function Name: " << F_nvptx->getName() << "\n"; F_nvptx->removeFromParent(); - // Insert the cloned function into the kernels module KernelM->getFunctionList().push_back(F_nvptx); @@ -999,7 +1008,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { F_nvptx = transformFunctionToVoid(F_nvptx); //Add generated function info to DFNode - N->setGenFunc(F_nvptx, visc::GPU_TARGET); +// N->setGenFunc(F_nvptx, visc::GPU_TARGET); + N->addGenFunc(F_nvptx, visc::GPU_TARGET, false); DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n"); F_nvptx->removeAttributes(AttributeSet::FunctionIndex, F_nvptx->getAttributes().getFnAttributes()); diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 6da326f701..8794f423e0 100644 --- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -29,8 +29,26 @@ using namespace dfg2llvm; static cl::opt<bool> VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers")); + namespace { +// Helper Functions +static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { + if (!isa<CallInst>(I)) + return false; + CallInst *CI = cast<CallInst>(I); + return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion"); +} + +CallInst 
*get_llvm_visc_policy_getVersion_call(Function *F) { + for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) { + Instruction *I = &*ib; + if (isVISCCall_llvm_visc_policy_getVersion(I)) + return cast<CallInst>(I); + } + return NULL; +} + // DFG2LLVM_X86 - The first implementation. struct DFG2LLVM_X86 : public DFG2LLVM { static char ID; // Pass identification, replacement for typeid @@ -79,6 +97,8 @@ private: Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = ""); void addDoWhileLoop(Instruction*, Instruction*, Value*); void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*); + void addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *, + Instruction *&); Argument* getArgumentFromEnd(Function* F, unsigned offset); Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore); @@ -204,13 +224,24 @@ void CGT_X86::initRuntimeAPI() { Instruction* I = cast<Instruction>(*VI->user_begin()); initializeTimerSet(I); switchToTimer(visc_TimerID_NONE, I); + // Insert code for initializing the scheduling policy + Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init", + runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType())); + CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); + DEBUG(errs() << *IPCallInst << "\n"); // Insert print instruction at visc exit Function* VC = M.getFunction("llvm.visc.cleanup"); assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); - DEBUG(errs() << "Inserting x86 timer print\n"); + // Insert code for clearing the scheduling policy I = cast<Instruction>(*VC->user_begin()); + IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear", + runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType())); + IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); + DEBUG(errs() << *IPCallInst << "\n"); + + DEBUG(errs() << "Inserting x86 timer print\n"); printTimerSet(I); } 
@@ -284,6 +315,30 @@ void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart, } +void CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, + BasicBlock *Body, Instruction *&Cnt) { + Module *M = Entry->getParent()->getParent(); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + + // Insert a PHI instruction at the beginning of the condition block + Instruction *IB = Cond->getFirstNonPHI(); + PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB); + + ConstantInt *IConst = + ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); + Instruction *CounterIncr = + BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, + "cnt_incr", Body->getTerminator()); + + // Set incoming values for Phi node + IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true); + CounterPhi->addIncoming(IConst, Entry); + CounterPhi->addIncoming(CounterIncr, Body); + + // Hand the created PHI node back to the caller via the by-reference Cnt + Cnt = CounterPhi; +} + /* Add Loop around the instruction I * Algorithm: * (1) Split the basic block of instruction I into three parts, where the @@ -755,7 +810,12 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) { DEBUG(errs() << "Created Empty Launch Function\n"); // Find the X86 function generated for Root and - Function* RootF_X86 = Root->getGenFunc(); +// Function* RootF_X86 = Root->getGenFunc(); + Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); + assert(RootF_X86 && "Error: No generated CPU function for Root node\n"); + assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) && + "Error: Generated Function for Root node with no x86 wrapper\n"); + // Generate a call to RootF_X86 with null parameters for now std::vector<Value*>Args; for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) { @@ -873,10 +933,13 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,Instruction* IB) { Function* CF = 
C->getFuncPointer(); - Function* CF_X86 = C->getGenFunc(); - DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); +// Function* CF_X86 = C->getGenFunc(); + Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET); assert(CF_X86 != NULL - && "Found leaf node for which code generation has not happened yet!"); + && "Found leaf node for which code generation has not happened yet!\n"); + assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) && + "The generated function to be called from x86 backend is not an x86 function\n"); + DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); std::vector<Value*> Args; // Create argument list to pass to call instruction @@ -1102,9 +1165,17 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { } /* Add a call to the generated function of the child node */ DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); - DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); - CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, - C->getGenFunc()->getName()+".output", RI); +// DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); +// CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, +// C->getGenFunc()->getName()+".output", RI); + Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); + DEBUG(errs() << "Type: " + << *CGenF->getType() + << "\n"); + CallInst* CI = CallInst::Create(CGenF, + InputArgs, + CGenF->getName()+".output", + RI); /* Add runtime API calls to push output for each of the streaming outputs */ // FIXME: Assumption @@ -1134,8 +1205,59 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) { // Add loop around the basic block, which exits the loop if isLastInput is false //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond); - addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(), - RI, Cond); +// addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(), 
+// RI, Cond); + + // Add loop around the basic block, which exits the loop if isLastInput is false + // Pointers to keep the created loop structure + BasicBlock *EntryBB, *CondBB, *BodyBB; + Instruction *CondStartI = cast<Instruction>(isLastInputPop); + Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode(); + EntryBB = CondStartI->getParent(); + + addWhileLoop(CondStartI, BodyStartI, RI, Cond); + CondBB = CondStartI->getParent(); + BodyBB = CI->getParent(); + Instruction *CntI = NULL; + CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF); + + // If the node function calls the visc runtime call to get policy, we update + // it with the counter information. This means we need to pass an additional + // argument to the generated function, that is the iteration number, and then + // use it as an argument to the policy_getVersion call + if (GetPolicyCI) { + addWhileLoopCounter(EntryBB, CondBB, BodyBB, CntI); + assert(CntI && "Counter instruction not found\n"); + + // Create new function type (with additional argument for iteration number) + Type *NewRetTy = CGenF->getFunctionType()->getReturnType(); + std::vector<Type*> NewArgTypes; + for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end(); + ai != ae ; ++ai) { + NewArgTypes.push_back(ai->getType()); + } + NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); + FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false); + Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false); + // At least one (the last) argument exists (we added it) + Function::arg_iterator ae = NewCGenF->arg_end(); + --ae; + Argument *CntArg = &*ae; + CntArg->setName("iteration"); + // Replace the old cpu gen func with this one + C->addGenFunc(NewCGenF, visc::CPU_TARGET, true); + + // Add counter to the actual parameter list, to create the new call + InputArgs.push_back(CntI); + CallInst* newCI = CallInst::Create(NewCGenF, + InputArgs, + NewCGenF->getName()+".output"); + 
ReplaceInstWithInst(CI, newCI); + + // Set second operand of the policy_getVersion call to the last function + // argument + GetPolicyCI->setArgOperand(1, CntArg); + } // Return the Function pointer DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n"); @@ -1151,47 +1273,23 @@ void CGT_X86::codeGen(DFInternalNode* N) { // Check if clone already exists. If it does, it means we have visited this // function before and nothing else needs to be done for this leaf node. - if(N->getGenFunc() != NULL) +// if(N->getGenFunc() != NULL) +// return; + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; return; - - Function* F = N->getFuncPointer(); - // Create of clone of F with no instructions. Only the type is the same as F - // without the extra arguments. - Function* F_X86; - - // Clone the function, if we are seeing this function for the first time. We - // only need a clone in terms of type. - ValueToValueMapTy VMap; - - // Create new function with the same type - F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); - - // Loop over the arguments, copying the names of arguments over. - Function::arg_iterator dest_iterator = F_X86->arg_begin(); - for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); - i != e; ++i) { - dest_iterator->setName(i->getName()); // Copy the name over... 
- // Add mapping to VMap and increment dest iterator - ++ dest_iterator; - VMap[&*i] = &*dest_iterator; } - // Add a basic block to this empty function - BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86); - ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), - UndefValue::get(F_X86->getReturnType()), BB); - - //Add generated function info to DFNode - N->setGenFunc(F_X86, visc::CPU_TARGET); - - // Add Index and Dim arguments except for the root node and the child graph of - // parent node is not streaming - if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) - addIdxDimArgs(F_X86); + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + "Error: Visiting a node for which code already generated\n"); // Sort children in topological order before code generation N->getChildGraph()->sortChildren(); + // Only process if all children have a CPU x86 function + // Otherwise skip to end + bool codeGen = true; for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), ce = N->getChildGraph()->end(); ci != ce; ++ci) { DFNode* C = *ci; @@ -1199,61 +1297,344 @@ void CGT_X86::codeGen(DFInternalNode* N) { if (C->isDummyNode()) continue; - // Check if Child Node has PTX tag or X86 tag - invokeChild_X86(C, F_X86, VMap, RI); + if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) { + errs() << "No CPU x86 version for child node " + << C->getFuncPointer()->getName() + << " . 
Skip code gen for parent node " + << N->getFuncPointer()->getName() << "\n"; + codeGen = false; + } } - DEBUG(errs() << "*** Generating epilogue code for the function****\n"); - // Generate code for output bindings - // Get Exit node - DFNode* C = N->getChildGraph()->getExit(); - // Get OutputType of this node - StructType* OutTy = N->getOutputType(); - Value *retVal = UndefValue::get(F_X86->getReturnType()); - // Find all the input edges to exit node - for (unsigned i=0; i < OutTy->getNumElements(); i++) { - DEBUG(errs() << "Output Edge " << i << "\n"); - // Find the incoming edge at the requested input port - DFEdge* E = C->getInDFEdgeAt(i); + if (codeGen) { + Function* F = N->getFuncPointer(); + // Create of clone of F with no instructions. Only the type is the same as F + // without the extra arguments. + Function* F_X86; + + // Clone the function, if we are seeing this function for the first time. We + // only need a clone in terms of type. + ValueToValueMapTy VMap; + + // Create new function with the same type + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + errs() << "--------------" << F->getName() << "\n"; + // Loop over the arguments, copying the names of arguments over. + Function::arg_iterator dest_iterator = F_X86->arg_begin(); + assert(false && "Got here\n"); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); // Copy the name over... 
+ // Add mapping to VMap and increment dest iterator + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + + assert(false && "Got here\n"); + + // Add a basic block to this empty function + BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86); + ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), + UndefValue::get(F_X86->getReturnType()), BB); + + //Add generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); + + // Add Index and Dim arguments except for the root node and the child graph of + // parent node is not streaming + if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + addIdxDimArgs(F_X86); + + // Iterate over children in topological order + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + // Create calls to CPU function of child node + invokeChild_X86(C, F_X86, VMap, RI); + + } + + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); + // Generate code for output bindings + // Get Exit node + DFNode* C = N->getChildGraph()->getExit(); + // Get OutputType of this node + StructType* OutTy = N->getOutputType(); + Value *retVal = UndefValue::get(F_X86->getReturnType()); + // Find all the input edges to exit node + for (unsigned i=0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); + // Find the incoming edge at the requested input port + DFEdge* E = C->getInDFEdgeAt(i); + + assert(E && "No Binding for output element!"); + // Find the Source DFNode associated with the incoming edge + DFNode* SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + + // If Source DFNode is a dummyNode, edge is from parent. 
Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(F_X86, i); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a internal node + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); + + // Find Output Value associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); + inputVal = EI; + } + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + } + DEBUG(errs() << "Extracted all\n"); + retVal->setName("output"); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); - assert(E && "No Binding for output element!"); - // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); + } - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + //-------------------------------------------------------------------------// + // Here, we need to check if this node (N) has more than one versions + // If so, we query the policy and have a call to each version + // If not, we see which version exists, check that it is in fact an x86 + // function and save it as the CPU_TARGET function + + // TODO: visc_id per node, so we can use this for id for policies + // For now, use node function name and change it later + errs() << "Node Name (for policy) : " + << N->getFuncPointer()->getName() << "\n"; + Function *CF = 
N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + + if (viscUtils::isSingleTargetTag(N->getTag())) { + // There is a single version for this node according to code gen hints. + // Therefore, we do not need to check the policy, we simply use the + // available implementation, whichever target it is for. 
+ + // Sanity check - to be removed TODO + switch (N->getTag()) { + case visc::CPU_TARGET: + assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::GPU_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::SPIR_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && ""); + break; + default: + assert(false && "Unreachable: we checked that tag was single target!\n"); + break; + } - // If Source DFNode is a dummyNode, edge is from parent. 
Get the - // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { - inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + N->addGenFunc(N->getGenFuncForTarget(N->getTag()), + visc::CPU_TARGET, + true); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->removeGenFuncForTarget(visc::SPIR_TARGET); + N->setTag(visc::CPU_TARGET); + + // Sanity checks - to be removed TODO + CF = N->getGenFuncForTarget(visc::CPU_TARGET); + GF = N->getGenFuncForTarget(visc::GPU_TARGET); + SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "After editing\n"; + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + // assert(false && "got to the point where we have to select\n"); + } else { + // We have more than one targets + + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + // These assertions express what we can support with the current runtime. + // Code generation works the same way even for other target combinations. 
+ // For now, we want either CPU and GPU, or CPU and SPIR + assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n"); + assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) && + "Generated functions without appropriate x86 wrapper\n"); + + FunctionType *FT = CF->getFunctionType(); + if (GF) + assert(FT == GF->getFunctionType() && + "Type mismatch between generated functions for GPU and CPU targets.\n"); + if (SF) + assert(FT == SF->getFunctionType() && + "Type mismatch between generated functions for SPIR and CPU targets.\n"); + + // Code generation of wrapper function + Function *F_wrapper; + ValueToValueMapTy VMap; + F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M); + + // Copy argument names over + Function::arg_iterator dest_iterator = F_wrapper->arg_begin(); + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); + VMap[&*i] = &*dest_iterator; + ++dest_iterator; } - else { - // edge is from a internal node - // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) - && "Source node call not found. 
Dependency violation!"); - - // Find Output Value associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; - - // Extract element at source position from this call instruction - std::vector<unsigned> IndexList; - IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "",RI); - inputVal = EI; + // Gather all arguments of wrapper in a vector, to prepare the call to + // the individual gen functions + std::vector<Value *> GenFuncCallArgs; + for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end(); + i != e; ++i) { + GenFuncCallArgs.push_back(&*i); + } + + BasicBlock *BBcurrent, *BBtrue, *BBfalse; + + BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper); + + StringRef FName = N->getFuncPointer()->getName(); + size_t nameSize = FName.size()+1; + std::vector<Constant *> NameV; + for (char c: FName) { + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c)); } - std::vector<unsigned> IdxList; - IdxList.push_back(i); - retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0')); + ArrayType *NameType = + ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize); + AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent); + Constant *NameConst = ConstantArray::get(NameType, NameV); + StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent); + CastInst *BI = BitCastInst::CreatePointerCast(AI, + Type::getInt8PtrTy(M.getContext()), "", BBcurrent); + std::vector<Value *> Args; + Args.push_back(BI); + Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion", + runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType())); + CallInst *RTFInst = 
CallInst::Create(RTF, Args, "", BBcurrent); + + ConstantInt *CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true); + CmpInst *CmpI = CmpInst::Create(Instruction::ICmp, + CmpInst::ICMP_EQ, + RTFInst, CmpConst, + "", BBcurrent); + + BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper); + BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue); + ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + + // Switch basic block pointers + BBcurrent = BBfalse; + if (GF) { + // We have a GPU version. Generate policy check and call + CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true); + CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + RTFInst, CmpConst, "", BBcurrent); + BBtrue = BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper); + BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue); + RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + } + + // Switch basic block pointers + BBcurrent = BBfalse; + if (SF) { + // We have a GPU version. 
Generate policy check and call + CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true); + CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + RTFInst, CmpConst, "", BBcurrent); + BBtrue = BasicBlock::Create(M.getContext(), "version_spir", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper); + BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue); + RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + } + + RI = ReturnInst::Create(M.getContext(), + UndefValue::get(FT->getReturnType()), BBfalse); + + // Now, make the node cpu gen func to be this one + // Remove all other versions and update the tag + N->addGenFunc(F_wrapper, visc::CPU_TARGET, true); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->removeGenFuncForTarget(visc::SPIR_TARGET); + N->setTag(visc::CPU_TARGET); + + assert(false && "got to the point where we have to combine\n"); } - DEBUG(errs() << "Extracted all\n"); - retVal->setName("output"); - ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); - ReplaceInstWithInst(RI, newRI); } @@ -1267,8 +1648,17 @@ void CGT_X86::codeGen(DFLeafNode* N) { // Check if clone already exists. If it does, it means we have visited this // function before and nothing else needs to be done for this leaf node. 
- if(N->getGenFunc() != NULL) +// if(N->getGenFunc() != NULL) +// return; + + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; return; + } + + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + "Error: Visiting a node for which code already generated\n"); std::vector<IntrinsicInst *> IItoRemove; std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace; @@ -1286,7 +1676,8 @@ void CGT_X86::codeGen(DFLeafNode* N) { M.getFunctionList().push_back(F_X86); // Add generated function info to DFNode - N->setGenFunc(F_X86, visc::CPU_TARGET); +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); // Add the new argument to the argument list. Add arguments only if the cild // graph of parent node is not streaming diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp index 8c0429af57..8de95a28d4 100644 --- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp +++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp @@ -18,6 +18,7 @@ #include "llvm/IRReader/IRReader.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCUtils.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/ValueMapper.h" @@ -240,62 +241,6 @@ static Value* genCodeForReturn(CallInst* CI) { return IV; } - -// This functions add the hint as metadata in visc code -static void addHint(Function* F, visc::Target T) { - // Get Module - Module* M = F->getParent(); - DEBUG(errs() << "Set preferred target for " << F->getName() << ": "); - - //assert(isa<ConstantInt>(CI->getArgOperand(0)) - //&& "Argument to hint must be constant integer!"); - //ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0)); - - // Based on the hint, get the hint metadata - NamedMDNode* HintNode; - switch (T) { - case visc::GPU_TARGET: - DEBUG(errs() << "GPU Target\n"); - 
HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); - break; - case visc::SPIR_TARGET: - DEBUG(errs() << "SPIR Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); - break; - case visc::CPU_TARGET: - DEBUG(errs() << "CPU Target\n"); - HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); - break; - default: - llvm_unreachable("Unsupported Target Hint!"); - break; - } - - // Create a node for the function and add it to the hint node - MDTuple* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F))); - HintNode->addOperand(N); -} - -static visc::Target getPreferredTarget(Function* F) { - DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); - Module* M = F->getParent(); - NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); - for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { - MDNode* N = HintNode->getOperand(i); - Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); - if(F == FHint) - return visc::GPU_TARGET; - } - HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); - for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { - MDNode* N = HintNode->getOperand(i); - Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); - if(F == FHint) - return visc::SPIR_TARGET; - } - return visc::CPU_TARGET; -} - // The visc launch intrinsic requires all the input parameters to the kernel // function be placed in contiguous memory and pointer to that input be passed // as the second argument to the launch intrinsic. This generates code to bring @@ -462,6 +407,9 @@ static Function* genInternalNode(Function* KernelF, unsigned level, if(level > 1) { ChildNodeF = genInternalNode(KernelF, level-1, numArgs, numDims, dimOffset, CI); addHint(ChildNodeF, getPreferredTarget(KernelF)); +// Internal nodes always get a CPU hint. 
If code generation for them is not
+// needed and can be skipped, this is handled by the accelerator backends
+// addHint(ChildNodeF, visc::CPU_TARGET);
   } else {
     ChildNodeF = KernelF;
   }
diff --git a/llvm/projects/visc-rt/CMakeLists.txt b/llvm/projects/visc-rt/CMakeLists.txt
index 824f751baf..e7c5f56ef8 100644
--- a/llvm/projects/visc-rt/CMakeLists.txt
+++ b/llvm/projects/visc-rt/CMakeLists.txt
@@ -1,4 +1,11 @@
 add_custom_target(visc-rt ALL)
+add_custom_command(
+  TARGET visc-rt PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+  ${CMAKE_CURRENT_SOURCE_DIR}/policy.h
+  ${CMAKE_CURRENT_BINARY_DIR}/policy.h
+  DEPENDS policy.h
+  COMMENT "Copying policy.h")
 add_custom_command(
   TARGET visc-rt PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy
diff --git a/llvm/projects/visc-rt/policy.h b/llvm/projects/visc-rt/policy.h
new file mode 100644
index 0000000000..3aaafce539
--- /dev/null
+++ b/llvm/projects/visc-rt/policy.h
@@ -0,0 +1,45 @@
+#ifndef __POLICY__
+#define __POLICY__
+
+#include <cstdint>
+#include <string>
+
+/************************* Policies *************************************/
+
+// Interface for a scheduling policy: chooses which generated version of a
+// dataflow node should run.
+class Policy {
+ public:
+  // Virtual so a concrete policy can be destroyed through a Policy pointer.
+  virtual ~Policy() {}
+
+  // Returns the version to run for node `name` at iteration `it`. The
+  // wrapper generated by the X86 backend in this patch tests the result
+  // against 0 (CPU), 1 (GPU) and 2 (SPIR).
+  virtual int getVersion(const char *name, int64_t it) = 0;
+};
+
+// Chooses a version from the node name alone; the iteration is ignored.
+class NodePolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    // Name of the kernel launch node (mangled, so it starts with "_Z").
+    static const std::string LaunchNode =
+        "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned";
+    // Only the kernel launch node is sent to version 1.
+    if (name && LaunchNode == name)
+      return 1;
+    return 0;
+  }
+};
+
+// Returns version 0 when the iteration number is even, version 1 otherwise.
+class IterationPolicy : public Policy {
+  virtual int getVersion(const char *name, int64_t it) override {
+    if (it % 2 == 0)
+      return 0;
+    else
+      return 1;
+  }
+};
+
+#endif // __POLICY__
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 4cb652135b..e61802d25e 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -51,6 +51,7 @@ cl_context globalOCLContext;
 cl_device_id* clDevices;
 cl_command_queue globalCommandQue;
 
+Policy *policy = NULL;
 MemTracker MTracker;
 vector<DFGDepth> DStack;
 // Mutex to prevent concurrent access by multiple thereads in pipeline
@@ -67,6 +68,24 @@ static inline void checkErr(cl_int err, cl_int success, const char * name) {
   }
 }
 
+/************************* Policies *************************************/
+// Installs the default scheduling policy (NodePolicy).
+void llvm_visc_policy_init() {
+  policy = new NodePolicy();
+}
+
+// Destroys the installed policy, if any. The policy is created with `new`,
+// so it is released with `delete`; deleting a null pointer is a no-op.
+void llvm_visc_policy_clear() {
+  delete policy;
+  policy = NULL;
+}
+
+// Returns the version chosen by the installed policy for node `name` at
+// iteration `i`. llvm_visc_policy_init() must have been called first.
+int llvm_visc_policy_getVersion(const char *name, int64_t i) {
+  return policy->getVersion(name, i);
+}
+
 /************************* Depth Stack Routines ***************************/
 void llvm_visc_x86_dstack_push(unsigned n, unsigned limitX, unsigned iX,
                                unsigned limitY,
diff --git a/llvm/projects/visc-rt/visc-rt.h b/llvm/projects/visc-rt/visc-rt.h
index 00e270d002..20cc6e35a6 100644
--- a/llvm/projects/visc-rt/visc-rt.h
+++ b/llvm/projects/visc-rt/visc-rt.h
@@ -15,7 +15,7 @@
 
 #include "llvm/SupportVISC/VISCHint.h"
 #include "llvm/SupportVISC/VISCTimer.h"
-
+#include "policy.h"
 #ifndef DEBUG_BUILD
 #define DEBUG(s) {}
 #else
@@ -28,6 +28,12 @@ using namespace std;
 
 extern "C" {
 
+/************************* Policies *************************************/
+
+void llvm_visc_policy_init();
+void llvm_visc_policy_clear();
+int llvm_visc_policy_getVersion(const char *, int64_t);
+
 /********************* DFG Depth Stack **********************************/
 class DFGDepth {
   private:
-- 
GitLab