diff --git a/hpvm/include/GenHPVM/GenHPVM.h b/hpvm/include/GenHPVM/GenHPVM.h index f61d4a7c90dff2e4bff5f781c6c9cc92e3246232..4380433b979216ff3fd86591d117e857abfc5db4 100644 --- a/hpvm/include/GenHPVM/GenHPVM.h +++ b/hpvm/include/GenHPVM/GenHPVM.h @@ -26,7 +26,8 @@ using namespace llvm; namespace genhpvm { // GenHPVM - The first implementation. -struct GenHPVM : public ModulePass { +class GenHPVM : public ModulePass { + public: static char ID; // Pass identification, replacement for typeid GenHPVM() : ModulePass(ID) {} diff --git a/hpvm/include/SupportHPVM/DFGTreeTraversal.h b/hpvm/include/SupportHPVM/DFGTreeTraversal.h index e357bb3dd98527fd773570593beac7ce0386c6cf..cb3963b150e8a55b8de482dcb12292b24804a5ff 100644 --- a/hpvm/include/SupportHPVM/DFGTreeTraversal.h +++ b/hpvm/include/SupportHPVM/DFGTreeTraversal.h @@ -1,6 +1,3 @@ -#ifndef __DFGTREETRAVERSAL_H__ -#define __DFGTREETRAVERSAL_H__ - //=== DFGTreeTraversal.h - Header file for Tree Traversal of the HPVM DFG ====// // // The LLVM Compiler Infrastructure @@ -10,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#ifndef __DFGTREETRAVERSAL_H__ +#define __DFGTREETRAVERSAL_H__ + #include "BuildDFG/BuildDFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" @@ -38,19 +38,22 @@ public: void visit(DFInternalNode *N) { // May visit a nodemore than once, there is no marking it as visited - DEBUG(errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName() - << "\n"); + DEBUG(errs() << "In Node (I) - " << N->getFuncPointer()->getName() << "\n"); + + std::vector<DFNode *> Children; // Follows a bottom-up approach. for (DFGraph::children_iterator i = N->getChildGraph()->begin(), e = N->getChildGraph()->end(); i != e; ++i) { DFNode *child = *i; - child->applyDFNodeVisitor(*this); + Children.push_back(child); } - + for (auto child : Children) + child->applyDFNodeVisitor(*this); // Process this internal node now. - process(N); + if (N != nullptr) + process(N); DEBUG(errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName() << "\n"); } diff --git a/hpvm/include/SupportHPVM/DFGraph.h b/hpvm/include/SupportHPVM/DFGraph.h index 3da7c0b01a79d52f668795eb072fdcb6381813a9..f6c697bd2daf902bd24276f40bb2a0b8082bf120 100644 --- a/hpvm/include/SupportHPVM/DFGraph.h +++ b/hpvm/include/SupportHPVM/DFGraph.h @@ -20,8 +20,8 @@ #ifndef LLVM_IR_DFGRAPH_H #define LLVM_IR_DFGRAPH_H -#include "SupportHPVM/HPVMHint.h" -#include "SupportHPVM/HPVMUtils.h" +#include "HPVMHint.h" +#include "HPVMUtils.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -47,6 +47,9 @@ struct TargetGenFunctions { Function *GPUGenFunc; Function *CUDNNGenFunc; Function *PROMISEGenFunc; + Function *FFTGenFunc; + Function *VITERBIGenFunc; + Function *EPOCHSGenFunc; }; struct TargetGenFuncInfo { @@ -54,6 +57,9 @@ struct TargetGenFuncInfo { bool gpu_hasCPUFunc; bool cudnn_hasCPUFunc; bool promise_hasCPUFunc; + bool fft_hasCPUFunc; + bool viterbi_hasCPUFunc; + bool epochs_hasCPUFunc; }; class DFGraph { @@ -72,11 +78,15 @@ private: DFEdgeListType DFEdgeList; ///< List of Dataflow edges among children public: - DFGraph(DFInternalNode *P); + inline DFGraph(DFInternalNode *P); virtual ~DFGraph() {} - void addChildDFNode(DFNode *child) { ChildrenList.push_back(child); } + void addChildDFNode(DFNode *child) { + if (std::find(ChildrenList.begin(), ChildrenList.end(), child) == + ChildrenList.end()) + ChildrenList.push_back(child); + } void removeChildDFNode(DFNode *child) { children_iterator position = std::find(begin(), end(), child); @@ -85,7 +95,10 @@ public: } // Dataflow edge connecting child dataflow nodes - void addDFEdge(DFEdge *E) { DFEdgeList.push_back(E); } + void addDFEdge(DFEdge *E) { + if (std::find(DFEdgeList.begin(), DFEdgeList.end(), E) == DFEdgeList.end()) + DFEdgeList.push_back(E); + } DFNode *getEntry() const { return Entry; } @@ -95,8 +108,8 @@ public: bool isExit(const DFNode *N) const { return N == Exit; } - void sortChildren(); - static bool compareRank(DFNode *A, DFNode *B); + inline void sortChildren(); + static inline bool compareRank(DFNode *A, DFNode *B); // Iterators typedef DFNodeListType::iterator children_iterator; @@ -140,9 +153,10 @@ public: //===--------------------------------------------------------------------===// DFInternalNode *getParent() const { return Parent; } + void setParent(DFInternalNode *_Parent) { Parent = _Parent; } // Child graph is streaming if any of the edges in the edge list is streaming - bool isStreaming(); + inline bool isStreaming(); //**************************************************************************// //* Functions to modify a dataflow graph *// @@ -154,6 +168,8 @@ public: if (position != dfedge_end()) // the edge was found DFEdgeList.erase(position); } + // Returns whether an edge was removed (might not happen if edge not found). + inline bool removeEdge(const DFEdge *E); }; // DFNode represents a single HPVM Dataflow Node in LLVM. @@ -165,6 +181,7 @@ public: // 4. Pointer to parent Dataflow Node // 5. List of children Dataflow Nodes (empty if it is a leaf node) // 6. List of Dataflow Edges among children +// 7. A Node Criticality value class DFNode { @@ -210,12 +227,17 @@ private: const DFNodeKind Kind; ///< Kind of Node Internal/Leaf hpvm::Target Tag; ///< Code Generated for which backend hpvm::Target Hint; ///< To store preferred backend + unsigned Criticality; + bool Root; + std::vector<hpvm::EPOCHS_DEVICES> EpochsTargets; // Store EPOCHS targets public: virtual ~DFNode() { // TODO: Check if fields DimLimits and OutputType need to freed here. } + StringRef getName() { return FuncPointer->getName(); } + // Iterators typedef DFNodeListType::iterator successor_iterator; typedef DFNodeListType::const_iterator const_successor_iterator; @@ -287,36 +309,75 @@ public: DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, DFInternalNode *_Parent, unsigned _NumOfDim, - std::vector<Value *> _DimLimits, DFNodeKind _K); + std::vector<Value *> _DimLimits, DFNodeKind _K, unsigned _Criticality); + void setRoot() { Root = true; } bool isRoot() const { - // It is a root node is it was created from a launch intrinsic - if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) { - assert(Level == 0 && "Root node's level is zero."); + if (Root) return true; - } - return false; + else + return false; + // It is a root node is it was created from a launch intrinsic + // if (II->getCalledFunction()->getName().equals("llvm.hpvm.launch")) { + // assert(Level == 0 && "Root node's level is zero."); + // return true; + // } + // return false; } StructType *getOutputType() const { return OutputType; } void addSuccessor(DFNode *N) { Successors.push_back(N); } + void removeSuccessor(DFNode *N) { + Successors.erase(std::remove(Successors.begin(), Successors.end(), N), + Successors.end()); + } + // Add incoming dataflow edge - void addInDFEdge(DFEdge *E) { InDFEdges.push_back(E); } + void addInDFEdge(DFEdge *E) { + if (std::find(InDFEdges.begin(), InDFEdges.end(), E) == InDFEdges.end()) + InDFEdges.push_back(E); + } // Add outgoing dataflow edge - void addOutDFEdge(DFEdge *E) { OutDFEdges.push_back(E); } + void addOutDFEdge(DFEdge *E) { + if (std::find(OutDFEdges.begin(), OutDFEdges.end(), E) == OutDFEdges.end()) + OutDFEdges.push_back(E); + } + + // For removing dataflow edges + // Note (applies to add*Edge too): does not update edge data in other places + // (nodes and the graph). Does not update successors either. + void removeInDFEdge(const DFEdge *E) { + InDFEdges.erase(std::remove(InDFEdges.begin(), InDFEdges.end(), E), + InDFEdges.end()); + } + + void removeOutDFEdge(const DFEdge *E) { + OutDFEdges.erase(std::remove(OutDFEdges.begin(), OutDFEdges.end(), E), + OutDFEdges.end()); + } Function *getFuncPointer() const { return FuncPointer; } void setFuncPointer(Function *_FuncPointer) { FuncPointer = _FuncPointer; } IntrinsicInst *getInstruction() const { return II; } + void setInstruction(IntrinsicInst *_II) { II = _II; } DFInternalNode *getParent() const { return Parent; } + void setParent(DFInternalNode *_Parent) { Parent = _Parent; } + + // Adds the node to the graph of P and updates this node's parent + inline void addToNodeGraph(DFInternalNode *P); unsigned getNumOfDim() const { return NumOfDim; } + void setNumOfDim(unsigned numDims) { NumOfDim = numDims; } + + const std::vector<Value *> &getDimLimits() const { return DimLimits; } + std::vector<Value *> &getDimLimits() { return DimLimits; } + void setDimLimits(std::vector<Value *> _DimLimits) { DimLimits = _DimLimits; } std::vector<Value *> getDimLimits() const { return DimLimits; } @@ -365,6 +426,15 @@ public: break; case hpvm::CPU_OR_GPU_TARGET: break; + case hpvm::FFT_TARGET: + GenFuncInfo.fft_hasCPUFunc = isCPUFunc; + break; + case hpvm::VITERBI_TARGET: + GenFuncInfo.viterbi_hasCPUFunc = isCPUFunc; + break; + case hpvm::EPOCHS_TARGET: + GenFuncInfo.epochs_hasCPUFunc = isCPUFunc; + break; default: assert(false && "Unknown target\n"); break; @@ -384,6 +454,12 @@ public: return GenFuncInfo.cudnn_hasCPUFunc; case hpvm::TENSOR_TARGET: return GenFuncInfo.promise_hasCPUFunc; + case hpvm::FFT_TARGET: + return GenFuncInfo.fft_hasCPUFunc; + case hpvm::VITERBI_TARGET: + return GenFuncInfo.viterbi_hasCPUFunc; + case hpvm::EPOCHS_TARGET: + return GenFuncInfo.epochs_hasCPUFunc; case hpvm::CPU_OR_GPU_TARGET: assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n"); default: @@ -427,6 +503,32 @@ public: GenFuncs.PROMISEGenFunc = F; GenFuncInfo.promise_hasCPUFunc = isCPUFunc; break; + case hpvm::EPOCHS_TARGET: + if (isCPUFunc == true) { + if (GenFuncInfo.epochs_hasCPUFunc) { + DEBUG(errs() << "Warning: Second generated EPOCHS function for node " + << FuncPointer->getName() << "\n"); + } + } + GenFuncs.EPOCHSGenFunc = NULL; // We do not actually generate a function + GenFuncInfo.epochs_hasCPUFunc = isCPUFunc; + break; + case hpvm::FFT_TARGET: + if (GenFuncs.FFTGenFunc != NULL) { + DEBUG(errs() << "Warning: Second generated FFT function for node " + << FuncPointer->getName() << "\n"); + } + GenFuncs.FFTGenFunc = F; + GenFuncInfo.fft_hasCPUFunc = isCPUFunc; + break; + case hpvm::VITERBI_TARGET: + if (GenFuncs.VITERBIGenFunc != NULL) { + DEBUG(errs() << "Warning: Second generated VITERBI function for node " + << FuncPointer->getName() << "\n"); + } + GenFuncs.VITERBIGenFunc = F; + GenFuncInfo.viterbi_hasCPUFunc = isCPUFunc; + break; case hpvm::CPU_OR_GPU_TARGET: assert(false && "A node function should be set with a tag specifying its \ type, not the node hint itself\n"); @@ -449,6 +551,12 @@ public: return GenFuncs.CUDNNGenFunc; case hpvm::TENSOR_TARGET: return GenFuncs.PROMISEGenFunc; + case hpvm::FFT_TARGET: + return GenFuncs.FFTGenFunc; + case hpvm::VITERBI_TARGET: + return GenFuncs.VITERBIGenFunc; + case hpvm::EPOCHS_TARGET: + return GenFuncs.EPOCHSGenFunc; case hpvm::CPU_OR_GPU_TARGET: assert(false && "Requesting genarated node function with dual tag instead of \ @@ -479,6 +587,18 @@ public: GenFuncs.PROMISEGenFunc = NULL; GenFuncInfo.promise_hasCPUFunc = false; break; + case hpvm::FFT_TARGET: + GenFuncs.FFTGenFunc = NULL; + GenFuncInfo.fft_hasCPUFunc = false; + break; + case hpvm::VITERBI_TARGET: + GenFuncs.VITERBIGenFunc = NULL; + GenFuncInfo.viterbi_hasCPUFunc = false; + break; + case hpvm::EPOCHS_TARGET: + GenFuncs.EPOCHSGenFunc = NULL; + GenFuncInfo.viterbi_hasCPUFunc = false; + break; case hpvm::CPU_OR_GPU_TARGET: assert(false && "Removing genarated node function with dual tag instead of \ @@ -493,25 +613,28 @@ public: hpvm::Target getTargetHint() const { return Hint; } + std::vector<hpvm::EPOCHS_DEVICES> getEpochsDevices() { return EpochsTargets; } + bool isDummyNode() const { return isEntryNode() || isExitNode(); } bool isAllocationNode() { // If Allocation Property is defined then it is not an allocation node return PropertyList.count(Allocation) != 0; } - void setRank(unsigned r); - bool isEntryNode() const; - bool isExitNode() const; - DFEdge *getInDFEdgeAt(unsigned inPort); - DFEdge *getExtendedInDFEdgeAt(unsigned inPort); - DFEdge *getOutDFEdgeAt(unsigned outPort); - DFEdge *getExtendedOutDFEdgeAt(unsigned outPort); - std::map<unsigned, unsigned> getInArgMap(); - std::map<unsigned, std::pair<Value *, unsigned>> getSharedInArgMap(); - std::vector<unsigned> getOutArgMap(); - int getAncestorHops(DFNode *N); - bool hasSideEffects(); - + inline void setRank(unsigned r); + inline bool isEntryNode() const; + inline bool isExitNode() const; + inline DFEdge *getInDFEdgeAt(unsigned inPort); + inline DFEdge *getExtendedInDFEdgeAt(unsigned inPort); + inline DFEdge *getOutDFEdgeAt(unsigned outPort); + inline DFEdge *getExtendedOutDFEdgeAt(unsigned outPort); + inline std::map<unsigned, unsigned> getInArgMap(); + inline std::map<unsigned, std::pair<Value *, unsigned>> getSharedInArgMap(); + inline std::vector<unsigned> getOutArgMap(); + inline int getAncestorHops(DFNode *N); + inline bool hasSideEffects(); + + unsigned getCriticality() { return Criticality; } virtual void applyDFNodeVisitor(DFNodeVisitor &V) = 0; // virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0; @@ -534,9 +657,9 @@ private: // Constructor DFInternalNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, DFInternalNode *Parent, int NumOfDim, - std::vector<Value *> DimLimits) - : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, - InternalNode) { + std::vector<Value *> DimLimits, unsigned Criticality) + : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, InternalNode, + Criticality) { childGraph = new DFGraph(this); } @@ -546,10 +669,11 @@ public: Create(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint = hpvm::CPU_TARGET, DFInternalNode *Parent = NULL, int NumOfDim = 0, - std::vector<Value *> DimLimits = std::vector<Value *>()) { + std::vector<Value *> DimLimits = std::vector<Value *>(), + unsigned Criticality = 0) { return new DFInternalNode(II, FuncPointer, Hint, Parent, NumOfDim, - DimLimits); + DimLimits, Criticality); } static bool classof(const DFNode *N) { return N->getKind() == InternalNode; } @@ -558,14 +682,16 @@ public: void removeChildFromDFGraph(DFNode *N) { childGraph->removeChildDFNode(N); } - void addEdgeToDFGraph(DFEdge *E); + inline void addEdgeToDFGraph(DFEdge *E); + + void removeEdgeFromDFGraph(DFEdge *E) { childGraph->removeEdge(E); } DFGraph *getChildGraph() const { return childGraph; } bool isChildGraphStreaming() { return childGraph->isStreaming(); } - void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/ - // void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ + inline void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/ + // inline void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ }; /***************************************************** @@ -577,21 +703,50 @@ private: // Constructor DFLeafNode(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, DFInternalNode *Parent, int NumOfDim = 0, - std::vector<Value *> DimLimits = std::vector<Value *>()) - : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {} + std::vector<Value *> DimLimits = std::vector<Value *>(), + unsigned Criticality = 0) + : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode, + Criticality) { + std::vector<Instruction *> toErase; + for (inst_iterator i = inst_begin(FuncPointer), e = inst_end(FuncPointer); + i != e; ++i) { + + Instruction *I = &*i; + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::hpvm_task) { + ConstantInt *C = dyn_cast<ConstantInt>(II->getOperand(0)); + assert(C && "Task ID should be a constant integer"); + TaskID = C->getSExtValue(); + + toErase.push_back(II); + } + } + } + + for (Instruction *I : toErase) { + I->eraseFromParent(); + } + } + + int TaskID; public: static DFLeafNode * Create(IntrinsicInst *II, Function *FuncPointer, hpvm::Target Hint, DFInternalNode *Parent, int NumOfDim = 0, - std::vector<Value *> DimLimits = std::vector<Value *>()) { - return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits); + std::vector<Value *> DimLimits = std::vector<Value *>(), + unsigned Criticality = 0) { + return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, + Criticality); } static bool classof(const DFNode *N) { return N->getKind() == LeafNode; } - void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/ - // void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ + inline void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/ + // inline void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ + + int getTaskID() { return TaskID; } }; // DFEdge represents a single HPVM Dataflow Edge in LLVM. @@ -652,6 +807,8 @@ public: bool getEdgeType() const { return EdgeType; } + void setEdgeType(bool _EdgeType) { EdgeType = _EdgeType; } + unsigned getSourcePosition() const { return SourcePosition; } void setSourcePosition(unsigned i) { SourcePosition = i; } @@ -663,6 +820,8 @@ public: Type *getType() const { return ArgType; } bool isStreamingEdge() const { return isStreaming; } + + void setStreamingEdge(bool _isStreaming) { isStreaming = _isStreaming; } }; //===--------------------- DFGraph Outlined Functions --------------===// @@ -690,12 +849,31 @@ bool DFGraph::isStreaming() { return false; } +bool DFGraph::removeEdge(const DFEdge *E) { + dfedge_iterator position = std::find(dfedge_begin(), dfedge_end(), E); + if (position != dfedge_end()) // the edge was found + DFEdgeList.erase(position); + + DFNode *S = E->getSourceDF(); + DFNode *D = E->getDestDF(); + S->removeSuccessor(D); + S->removeOutDFEdge(E); + D->removeInDFEdge(E); + + if (position == dfedge_end()) + DEBUG_WITH_TYPE( + "dfgraph", + errs() << "DFGraph::removeEdge called with non-existent edge.\n"); + return position != dfedge_end(); +} + //===--------------------- DFNode Outlined Functions --------------===// DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, DFInternalNode *_Parent, unsigned _NumOfDim, - std::vector<Value *> _DimLimits, DFNodeKind _K) + std::vector<Value *> _DimLimits, DFNodeKind _K, + unsigned _Criticality) : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim), - DimLimits(_DimLimits), Kind(_K) { + DimLimits(_DimLimits), Kind(_K), Criticality(_Criticality) { Type *Ty = FuncPointer->getFunctionType()->getReturnType(); @@ -724,11 +902,24 @@ DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, hpvm::Target _Hint, GenFuncs.GPUGenFunc = NULL; GenFuncs.CUDNNGenFunc = NULL; GenFuncs.PROMISEGenFunc = NULL; + GenFuncs.FFTGenFunc = NULL; + GenFuncs.VITERBIGenFunc = NULL; + GenFuncs.EPOCHSGenFunc = NULL; GenFuncInfo.cpu_hasCPUFunc = false; GenFuncInfo.gpu_hasCPUFunc = false; GenFuncInfo.cudnn_hasCPUFunc = false; GenFuncInfo.promise_hasCPUFunc = false; + GenFuncInfo.fft_hasCPUFunc = false; + GenFuncInfo.viterbi_hasCPUFunc = false; + GenFuncInfo.epochs_hasCPUFunc = false; + Root = false; + Hint = _Hint; +} + +void DFNode::addToNodeGraph(DFInternalNode *P) { + Parent = P; + P->addChildToDFGraph(this); } void DFNode::setRank(unsigned r) { @@ -796,13 +987,14 @@ DFEdge *DFNode::getOutDFEdgeAt(unsigned outPort) { // Cannot perform check for the number of outputs here, // it depends on the node's return type - unsigned index = 0; + // ADEL: Note - kept this same as non-release branch because wasn't sure how + // that change would affect Scheduler backend. The release-branch-change was + // made by Akash for Movidius stuff. for (outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e; ++i) { DFEdge *E = *i; - if (outPort == index) // E->getSourcePosition()) + if (outPort == E->getSourcePosition()) return E; - index++; } return NULL; } @@ -1182,9 +1374,13 @@ template <> struct DOTGraphTraits<DFGraph *> : public DefaultDOTGraphTraits { static void addCustomGraphFeatures(DFGraph *G, GraphWriter<DFGraph *> &GW) {} }; -void viewDFGraph(DFGraph *G) { - llvm::WriteGraph(G, "DataflowGraph"); - // llvm::ViewGraph(G, "DataflowGraph"); +inline void viewDFGraph(DFGraph *G) { + DFInternalNode *Parent = G->getParent(); + assert(Parent && "Child Graph with no parent\n"); + + llvm::WriteGraph(G, + (Parent->isRoot() ? "Root_Dataflow_Graph" + : (Parent->getFuncPointer()->getName()))); } } // namespace llvm diff --git a/hpvm/include/SupportHPVM/HPVMHint.h b/hpvm/include/SupportHPVM/HPVMHint.h index 25020e82016b8b3320abb8ddf94b78f24bc91acd..a4946d235e221cf24b68580b1e6e27f0fa5ff42f 100644 --- a/hpvm/include/SupportHPVM/HPVMHint.h +++ b/hpvm/include/SupportHPVM/HPVMHint.h @@ -22,10 +22,37 @@ enum Target { CUDNN_TARGET, TENSOR_TARGET, CPU_OR_GPU_TARGET, + EPOCHS_TARGET, // ALL_TARGETS, + FFT_TARGET, + VITERBI_TARGET, NUM_TARGETS }; +enum EPOCHS_DEVICES { + CPU_ACCEL=0, + OneD_FFT_ACCEL, + VITDEC_ACCEL, + CV_CNN_ACCEL, + NUM_DEVICES +}; + +enum EPOCHS_TASKS { + FFT_TASK=0, + RADAR_TASK, + CV_TASK, + VIT_TASK, + PNC_TASK, + NONE_TASK, + NUM_TASKS +}; + +enum NODE_CRITICALITY { + HPVM_BASE = 1, + HPVM_ELEVATED, + HPVM_CRITICAL +}; + #ifdef __cplusplus } #endif diff --git a/hpvm/include/SupportHPVM/HPVMUtils.h b/hpvm/include/SupportHPVM/HPVMUtils.h index 2a5116ddb122b16b28ee45022d7c57409cdce566..e313083f93d83092c3990ca4ed168ad79cd71b06 100644 --- a/hpvm/include/SupportHPVM/HPVMUtils.h +++ b/hpvm/include/SupportHPVM/HPVMUtils.h @@ -18,7 +18,7 @@ #include <assert.h> -#include "SupportHPVM/HPVMHint.h" +#include "HPVMHint.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -62,7 +62,7 @@ static bool isHPVMLaunchCall(Instruction *I) { } // Creates a new createNode intrinsic, similar to II but with different // associated function F instead -IntrinsicInst * +static IntrinsicInst * createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, IntrinsicInst *II) { Module *M = F->getParent(); @@ -72,27 +72,28 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, Constant *Fp = ConstantExpr::getPointerCast(F, Type::getInt8PtrTy(II->getContext())); - ArrayRef<Value *> CreateNodeArgs; + std::vector<Value *> NodeArgs; switch (II->getIntrinsicID()) { case Intrinsic::hpvm_createNode: { - CreateNodeArgs = ArrayRef<Value *>(Fp); + NodeArgs.push_back(Fp); break; } case Intrinsic::hpvm_createNode1D: { - Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)}; - CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2); + NodeArgs.push_back(Fp); + NodeArgs.push_back(II->getArgOperand(1)); break; } case Intrinsic::hpvm_createNode2D: { - Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1), - II->getArgOperand(2)}; - CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3); + NodeArgs.push_back(Fp); + NodeArgs.push_back(II->getArgOperand(1)); + NodeArgs.push_back(II->getArgOperand(2)); break; } case Intrinsic::hpvm_createNode3D: { - Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2), - II->getArgOperand(3)}; - CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4); + NodeArgs.push_back(Fp); + NodeArgs.push_back(II->getArgOperand(1)); + NodeArgs.push_back(II->getArgOperand(2)); + NodeArgs.push_back(II->getArgOperand(2)); break; } default: @@ -100,14 +101,16 @@ createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F, break; } + ArrayRef<Value *> CreateNodeArgs(NodeArgs); + CallInst *CI = - CallInst::Create(CreateNodeF, CreateNodeArgs, F->getName() + ".node"); + CallInst::Create(CreateNodeF, CreateNodeArgs, F->getName() + ".node", II); IntrinsicInst *CreateNodeII = cast<IntrinsicInst>(CI); return CreateNodeII; } // Fix HPVM hints for this function -void fixHintMetadata(Module &M, Function *F, Function *G) { +static void fixHintMetadata(Module &M, Function *F, Function *G) { Metadata *MD_F = ValueAsMetadata::getIfExists(F); MDTuple *MDT_F = MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata *>(MD_F)); @@ -129,12 +132,15 @@ void fixHintMetadata(Module &M, Function *F, Function *G) { FixHint("hpvm_hint_cpu_gpu"); FixHint("hpvm_hint_cudnn"); FixHint("hpvm_hint_promise"); + FixHint("hpvm_hint_fft"); + FixHint("hpvm_hint_viterbi"); + FixHint("hpvm_hint_epochs"); } // Assuming that the changed function is a node function, it is only used as a // first operand of createNode*. It is enough to iterate through all createNode* // calls in the program. -void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { +static void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { for (auto &Func : M) { DEBUG(errs() << "Function: " << Func.getName() << "\n"); @@ -148,7 +154,7 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { if (isHPVMCreateNodeIntrinsic(I)) { IntrinsicInst *II = cast<IntrinsicInst>(I); // The found createNode is not associated with the changed function - if (II->getArgOperand(0) != F) + if (II->getArgOperand(0)->stripPointerCasts() != F) continue; // skip it // Otherwise, create a new createNode similar to the other one, @@ -211,10 +217,10 @@ void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) { // over extra pointer arguments. // The function returns the list of return instructions to the caller to fix in // case the return type is also changed. -Function *cloneFunction(Function *F, FunctionType *newFT, - bool isAddingPtrSizeArg, - SmallVectorImpl<ReturnInst *> *Returns = NULL, - std::vector<Argument *> *Args = NULL) { +static Function *cloneFunction(Function *F, FunctionType *newFT, + bool isAddingPtrSizeArg, + SmallVectorImpl<ReturnInst *> *Returns = NULL, + std::vector<Argument *> *Args = NULL) { DEBUG(errs() << "Cloning Function: " << F->getName() << "\n"); DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n"); @@ -225,8 +231,8 @@ Function *cloneFunction(Function *F, FunctionType *newFT, "the old function!"); // Create Function of specified type - Function *newF = Function::Create(newFT, F->getLinkage(), - F->getName() + "_cloned", F->getParent()); + Function *newF = Function::Create(newFT, F->getLinkage(), F->getName() + "_c", + F->getParent()); DEBUG(errs() << "Old Function name: " << F->getName() << "\n"); DEBUG(errs() << "New Function name: " << newF->getName() << "\n"); ValueToValueMapTy VMap; @@ -308,13 +314,15 @@ Function *cloneFunction(Function *F, FunctionType *newFT, if (Returns == NULL) Returns = new SmallVector<ReturnInst *, 8>(); CloneFunctionInto(newF, F, VMap, false, *Returns); + newF->setAttributes(F->getAttributes()); return newF; } // Overloaded version of cloneFunction -Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg, - SmallVectorImpl<ReturnInst *> *Returns = NULL) { +static Function *cloneFunction(Function *F, Function *newF, + bool isAddingPtrSizeArg, + SmallVectorImpl<ReturnInst *> *Returns = NULL) { DEBUG(errs() << "Cloning Function: " << F->getName() << "\n"); DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n"); @@ -372,13 +380,15 @@ Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg, Returns = new SmallVector<ReturnInst *, 8>(); CloneFunctionInto(newF, F, VMap, false, *Returns); + newF->setAttributes(F->getAttributes()); + return newF; } //------------------- Helper Functions For Handling Hints -------------------// // Return true if 1st arg (tag) contains 2nd (target) -bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) { +static bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) { switch (Tag) { case hpvm::None: return false; @@ -403,6 +413,18 @@ bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) { if (T == hpvm::TENSOR_TARGET) return true; return false; + case hpvm::FFT_TARGET: + if (T == hpvm::FFT_TARGET) + return true; + return false; + case hpvm::VITERBI_TARGET: + if (T == hpvm::VITERBI_TARGET) + return true; + return false; + case hpvm::EPOCHS_TARGET: + if (T == hpvm::EPOCHS_TARGET) + return true; + return false; default: assert(false && "Unknown Target\n"); return false; // What kind of compiler doesn't know this is unreachable?! @@ -411,15 +433,19 @@ bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) { bool isSingleTargetTag(hpvm::Target T) { return ((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) - || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET)); + || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET)|| + (T == hpvm::FFT_TARGET) || (T == hpvm::VITERBI_TARGET) || + (T == hpvm::EPOCHS_TARGET)); } // Add the specified target to the given tag hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) { assert(((T == hpvm::CPU_TARGET) || (T == hpvm::GPU_TARGET) - || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET)) && + || (T == hpvm::CUDNN_TARGET) || (T == hpvm::TENSOR_TARGET) || + (T == hpvm::FFT_TARGET) || (T == hpvm::VITERBI_TARGET) || + (T == hpvm::EPOCHS_TARGET)) && "The target is only allowed to be a single target: CPU, GPU, SPIR, " - "CUDNN, PROMISE\n"); + "CUDNN, PROMISE, FFT, VITERBI, EPOCHS\n"); switch (Tag) { case hpvm::None: @@ -441,6 +467,18 @@ hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) { case hpvm::CPU_OR_GPU_TARGET: assert((T != hpvm::CUDNN_TARGET) && (T != hpvm::TENSOR_TARGET) && "Unsupported target combination\n"); return hpvm::CPU_OR_GPU_TARGET; + case hpvm::FFT_TARGET: + if (T == hpvm::FFT_TARGET) + return hpvm::FFT_TARGET; + return T; + case hpvm::VITERBI_TARGET: + if (T == hpvm::VITERBI_TARGET) + return hpvm::VITERBI_TARGET; + return T; + case hpvm::EPOCHS_TARGET: + if (T == hpvm::EPOCHS_TARGET) + return hpvm::EPOCHS_TARGET; + return T; default: assert(false && "Unknown Target\n"); } @@ -448,8 +486,7 @@ hpvm::Target getUpdatedTag(hpvm::Target Tag, hpvm::Target T) { } // This functions add the hint as metadata in hpvm code -void addHint(Function *F, hpvm::Target T) { - DEBUG(errs() << "ADD HINT *************************\n"); +static void addHint(Function *F, hpvm::Target T) { // Get Module Module *M = F->getParent(); DEBUG(errs() << "Set preferred target for " << F->getName() << ": "); @@ -478,6 +515,18 @@ void addHint(Function *F, hpvm::Target T) { DEBUG(errs() << "PROMISE\n"); HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise"); break; + case hpvm::FFT_TARGET: + DEBUG(errs() << "FFT Target\n"); + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_fft"); + break; + case hpvm::VITERBI_TARGET: + DEBUG(errs() << "VITERBI Target\n"); + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_viterbi"); + break; + case hpvm::EPOCHS_TARGET: + DEBUG(errs() << "EPOCHS Target\n"); + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_epochs"); + break; default: llvm_unreachable("Unsupported Target Hint!"); break; @@ -490,7 +539,7 @@ void addHint(Function *F, hpvm::Target T) { } // This function removes the hint as metadata in hpvm code -void removeHint(Function *F, hpvm::Target T) { +static void removeHint(Function *F, hpvm::Target T) { // Get Module Module *M = F->getParent(); DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T @@ -514,6 +563,15 @@ void removeHint(Function *F, hpvm::Target T) { case hpvm::TENSOR_TARGET: HintNode = M->getOrInsertNamedMetadata("hpvm_hint_promise"); break; + case hpvm::FFT_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_fft"); + break; + case hpvm::VITERBI_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_viterbi"); + break; + case hpvm::EPOCHS_TARGET: + HintNode = M->getOrInsertNamedMetadata("hpvm_hint_epochs"); + break; default: llvm_unreachable("Unsupported Target Hint!"); break; @@ -539,7 +597,7 @@ void removeHint(Function *F, hpvm::Target T) { } } -hpvm::Target getPreferredTarget(Function *F) { +static hpvm::Target getPreferredTarget(Function *F) { DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); Module *M = F->getParent(); @@ -565,6 +623,12 @@ hpvm::Target getPreferredTarget(Function *F) { return hpvm::CUDNN_TARGET; if (FoundPrefTarget("hpvm_hint_promise")) return hpvm::TENSOR_TARGET; + if (FoundPrefTarget("hpvm_hint_fft")) + return hpvm::FFT_TARGET; + if (FoundPrefTarget("hpvm_hint_viterbi")) + return hpvm::VITERBI_TARGET; + if (FoundPrefTarget("hpvm_hint_epochs")) + return hpvm::EPOCHS_TARGET; return hpvm::None; } diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp index b3b46de48260f965782b1fb13bc049d446f51da2..be3b3e65fe77843335a38d475af0126b6be10d2e 100644 --- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp +++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp @@ -53,6 +53,7 @@ bool BuildDFG::runOnModule(Module &M) { // Intrinsic Instruction has been initialized from this point on. Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts()); Root = DFInternalNode::Create(II, F, hpvmUtils::getPreferredTarget(F)); + Root->setRoot(); // errs() << "INTRINSIC: " << II << "\n"; // errs() << "ROOT NODE" << Root << "\n"; Roots.push_back(Root); @@ -179,11 +180,12 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { isInternalNode = true; } - // Number of Dimensions would be equal to the (number of operands - 1) as - // the first operand is the pointer to associated Function and the + // Number of Dimensions would be equal to the (number of operands - 2) as + // the first operand is the pointer to associated Function, the last + // operand is the node criticality and the // remaining operands are the limits in each dimension. unsigned numOfDim = - II->getCalledFunction()->getFunctionType()->getNumParams() - 1; + II->getCalledFunction()->getFunctionType()->getNumParams() - 2; assert(numOfDim <= 3 && "Invalid number of dimensions for createNode intrinsic!"); std::vector<Value *> dimLimits; @@ -194,11 +196,15 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { dimLimits.push_back(cast<Value>(II->getOperand(i))); } + ConstantInt* Criticality = dyn_cast<ConstantInt>(II->getArgOperand(numOfDim + 1)); + + assert(Criticality && (Criticality->getSExtValue() >= 0) && "Criticality must be a non-negative constant integer (i64) value"); + if (isInternalNode) { // Create Internal DFNode, add it to the map and recursively build its // dataflow graph DFInternalNode *childDFNode = DFInternalNode::Create( - II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits, Criticality->getSExtValue()); // errs() << "INTERNAL NODE: " << childDFNode << "\n"; N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; @@ -206,7 +212,7 @@ void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) { } else { // Create Leaf DFnode and add it to the map. DFLeafNode *childDFNode = DFLeafNode::Create( - II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + II, F, hpvmUtils::getPreferredTarget(F), N, numOfDim, dimLimits, Criticality->getSExtValue()); // errs() << "LEAF NODE: " << childDFNode << "\n"; N->addChildToDFGraph(childDFNode); HandleToDFNodeMap[II] = childDFNode; diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8850e61eb0dab26ac128b2b3965112261cf2ca0 --- /dev/null +++ b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/CMakeLists.txt @@ -0,0 +1,14 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}") + +add_llvm_library( LLVMDFG2LLVM_EPOCHS + MODULE + DFG2LLVM_EPOCHS.cpp + + DEPENDS intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.cpp b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bba52c049d702b3e592235c4cd17f1483a577389 --- /dev/null +++ b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.cpp @@ -0,0 +1,2498 @@ +//===-------------------------- DFG2LLVM_EPOCHS.cpp +//--------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass is responsible for generating code for host code and kernel code +// for EPOCHS target using HPVM dataflow graph. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/CommandLine.h" +#define DEBUG_TYPE "DFG2LLVM_EPOCHS" +#include "SupportHPVM/DFG2LLVM.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Pass.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +#include <string> +#include <fstream> +#include <queue> +#include <sys/stat.h> + +#ifndef LLVM_BUILD_DIR +#error LLVM_BUILD_DIR is not defined +#endif + +#define STR_VALUE(X) #X +#define STRINGIFY(X) STR_VALUE(X) +#define LLVM_BUILD_DIR_STR STRINGIFY(LLVM_BUILD_DIR) + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +//// HPVM Command line option to use timer or not +// static cl::opt<bool> HPVMTimer_EPOCHS("hpvm-timers-epochs", +// cl::desc("Enable hpvm timers")); + +static cl::opt<std::string> SCHDULER_LIBRARY_PATH( + "sched-lib-path", cl::Required, + cl::desc("Path to the schedular library bitcode module."), + cl::value_desc("scheduler_module.bc")); + +static cl::opt<std::string> + SCHDULER_CONFIG_PATH("sched-config", cl::Required, + cl::desc("Path to the schedular configuration file."), + cl::value_desc("scheduler configuration file")); + +static cl::opt<std::string> TASK_CONFIG_PATH( + "task-config", cl::Required, + cl::desc("Path to the task description configuration file."), + cl::value_desc("task configuration file")); + +static cl::opt<std::string> + TASK_LIBRARY_PATH("task-lib-path", cl::Required, + cl::desc("Path to the task library bitcode module."), + cl::value_desc("tasklib.bc")); + +namespace dfg2llvm { + +static void findReturnInst(Function *, std::vector<ReturnInst *> &); + +struct TaskDeviceInfo { + hpvm::EPOCHS_DEVICES TaskDeviceTy; + std::string TaskDeviceFn; + + TaskDeviceInfo() { + TaskDeviceTy = hpvm::CPU_ACCEL; + TaskDeviceFn = ""; + } + + void clearTaskDeviceInfo() { + TaskDeviceTy = hpvm::CPU_ACCEL; + TaskDeviceFn = ""; + } +}; + +struct TaskInfo { + int TaskID; + std::string TaskName; + std::string TaskDescription; + std::string TaskSetup; + std::string TaskPrint; + std::string TaskRunStats; + std::string TaskFinishExecution; + std::string TaskAutoFinish; + std::vector<TaskDeviceInfo> Devices; + Value *TaskHandleAddr; + + TaskInfo() { + TaskID = 0; + TaskName = ""; + TaskDescription = ""; + TaskSetup = ""; + TaskPrint = ""; + TaskRunStats = ""; + TaskAutoFinish = ""; + TaskFinishExecution = ""; + Devices.clear(); + } + + void printTaskInfo() { + DEBUG(errs() << "TaskID :\t" << TaskID << "\n"); + DEBUG(errs() << "TaskName :\t" << TaskName << "\n"); + DEBUG(errs() << "TaskDescription :\t" << TaskDescription << "\n"); + DEBUG(errs() << "TaskSetup :\t" << TaskSetup << "\n"); + DEBUG(errs() << "TaskPrint :\t" << TaskPrint << "\n"); + DEBUG(errs() << "TaskRunStats :\t" << TaskRunStats << "\n"); + DEBUG(errs() << "TaskFinishExecution :\t" << TaskFinishExecution << "\n"); + DEBUG(errs() << "TaskAutoFinish :\t" << TaskAutoFinish << "\n"); + DEBUG(errs() << "Task Devices:\n"); + + for (TaskDeviceInfo &TDI : Devices) { + DEBUG(errs() << "\tDevice Name :\t" << TDI.TaskDeviceTy << "\n"); + DEBUG(errs() << "\tDevice Function :\t" << TDI.TaskDeviceFn << "\n"); + } + } + + unsigned getNumDevices() { return Devices.size(); } + + std::vector<hpvm::EPOCHS_DEVICES> getDeviceTypes() { + std::vector<hpvm::EPOCHS_DEVICES> Devs; + for (TaskDeviceInfo &TDI : Devices) { + Devs.push_back(TDI.TaskDeviceTy); + } + return Devs; + } + + void clearTaskInfo() { + TaskName = ""; + TaskDescription = ""; + TaskSetup = ""; + TaskPrint = ""; + TaskRunStats = ""; + TaskAutoFinish = ""; + TaskFinishExecution = ""; + Devices.clear(); + } + + bool operator<(const TaskInfo &rhs) const { return TaskID < rhs.TaskID; } + + bool operator==(const TaskInfo &rhs) const { return TaskID == rhs.TaskID; } +}; + +void strSplit(std::string &str, char delim, std::vector<std::string> &out) { + size_t start; + size_t end = 0; + + while ((start = str.find_first_not_of(delim, end)) != std::string::npos) { + end = str.find(delim, start); + out.push_back(str.substr(start, end - start)); + } +} + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_EPOCHS : public CodeGenTraversal { + +private: + // Member variables + std::vector<TaskInfo> TaskTypes; + std::map<DFLeafNode *, TaskInfo> TaskHandleMap; + std::map<DFLeafNode *, std::map<unsigned, unsigned>> returnArgMap; + std::map<DFLeafNode *, GlobalVariable *> TaskMBMap; + std::set<TaskInfo> RegisteredTasks; + std::set<DFLeafNode *> FinishedTasks; + + // Global Variables + GlobalVariable *SchedulerHandleAddress; + GlobalVariable *DagIDGlobal; + + std::unique_ptr<Module> TaskModule; + + FunctionCallee malloc; + // HPVM Runtime API + FunctionCallee llvm_hpvm_cpu_launch; + FunctionCallee llvm_hpvm_cpu_wait; + FunctionCallee llvm_hpvm_cpu_argument_ptr; + + FunctionCallee llvm_hpvm_streamLaunch; + FunctionCallee llvm_hpvm_streamPush; + FunctionCallee llvm_hpvm_streamPop; + FunctionCallee llvm_hpvm_streamWait; + FunctionCallee llvm_hpvm_createBindInBuffer; + FunctionCallee llvm_hpvm_createBindOutBuffer; + FunctionCallee llvm_hpvm_createEdgeBuffer; + FunctionCallee llvm_hpvm_createLastInputBuffer; + FunctionCallee llvm_hpvm_createThread; + FunctionCallee llvm_hpvm_bufferPush; + FunctionCallee llvm_hpvm_bufferPop; + // FunctionCallee llvm_hpvm_cpu_dstack_push; + // FunctionCallee llvm_hpvm_cpu_dstack_pop; + FunctionCallee llvm_hpvm_cpu_getDimLimit; + FunctionCallee llvm_hpvm_cpu_getDimInstance; + + // EPOCHS Scheduler API + FunctionCallee set_up_scheduler; + FunctionCallee initialize_scheduler_from_config_file; + FunctionCallee initialize_task_lib; + FunctionCallee register_task_type; + FunctionCallee set_up_task; + FunctionCallee request_execution; + FunctionCallee finish_task_execution; + FunctionCallee wait_on_tasklist; + FunctionCallee shutdown_scheduler; + + // Functions + std::vector<IntrinsicInst *> *getUseList(Value *LI); + void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *); + Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *); + StructType *getArgumentListStructTy(DFNode *); + Function *createFunctionFilter(DFNode *C); + void startNodeThread(DFNode *, std::vector<Value *>, + DenseMap<DFEdge *, Value *>, Value *, Value *, + Instruction *); + Function *createLaunchFunction(DFInternalNode *); + + void finishNodeEPOCHS(DFLeafNode *, Instruction *IB, Function *ParentF); + void waitOnTask(DFLeafNode *Leaf, Instruction *IB); + + + // Task File parsing and utility functions + TaskInfo &getTaskInfo(int TaskID); + + FunctionCallee getTaskFunctionHelper(std::string TaskFunctionName); + FunctionCallee getTaskSetupFunction(TaskInfo &TI); + FunctionCallee getTaskPrintFunction(TaskInfo &TI); + FunctionCallee getTaskRunStatsFunction(TaskInfo &TI); + FunctionCallee getTaskAutoFinishFunction(TaskInfo &TI); + FunctionCallee getTaskFinishExecutionFunction(TaskInfo &TI); + + FunctionCallee getTaskDeviceFunction(TaskInfo &TI, + hpvm::EPOCHS_DEVICES DevTy); + + // Virtual Functions + void init() { + // HPVMTimer = HPVMTimer_EPOCHS; + TargetName = "EPOCHS"; + } + void initRuntimeAPI(); + void initSchedulerAPI(); + void initTaskAPI(); + void codeGen(DFInternalNode *N); + void codeGen(DFLeafNode *N); + Function *codeGenStreamPush(DFInternalNode *N); + Function *codeGenStreamPop(DFInternalNode *N); + void invokeChild(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap, + Instruction *IB, hpvm::Target Tag); + Value *getInValueAt(DFNode *Child, unsigned i, Function *ParntF_CPU, + Instruction *InsertBefore); + // hpvm::Target Tag = hpvm::CPU_TARGET); + +public: + // Constructor + CGT_EPOCHS(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) { + init(); + initRuntimeAPI(); + initSchedulerAPI(); + initTaskAPI(); + } + + void codeGenLaunch(DFInternalNode *Root); + void codeGenLaunchStreaming(DFInternalNode *Root); + void parseTaskFile(std::string TaskPath); +}; + +bool runDFG2LLVM_EPOCHS(Module &M, BuildDFG &DFG) { + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode *> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + + // Visitor for Code Generation Graph Traversal + CGT_EPOCHS *CGTVisitor = new CGT_EPOCHS(M, DFG); + + // Populate scheduler library's task + // details from the provided task + // description file. + + // Iterate over all the DFGs and produce code for each one of them + for (auto &rootNode : Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + // Go ahead and replace the launch intrinsic with pthread call, otherwise + // return now. + // TODO: Later on, we might like to do this in a separate pass, which would + // allow us the flexibility to switch between complete static code + // generation for DFG or having a customized runtime+scheduler + + // Do streaming code generation if root node is streaming. Usual otherwise + if (rootNode->isChildGraphStreaming()) + CGTVisitor->codeGenLaunchStreaming(rootNode); + else + CGTVisitor->codeGenLaunch(rootNode); + } + + for (auto &F : M) { + for (Function::arg_iterator ai = F.arg_begin(), ae = F.arg_end(); ai != ae; + ai++) { + Argument *Arg = &*ai; + if (Arg->hasAttribute(Attribute::In)) + Arg->removeAttr(Attribute::In); + if (Arg->hasAttribute(Attribute::Out)) + Arg->removeAttr(Attribute::Out); + if (Arg->hasAttribute(Attribute::InOut)) + Arg->removeAttr(Attribute::InOut); + } + } + + delete CGTVisitor; + return true; +} + +bool DFG2LLVM_EPOCHS::runOnModule(Module &M) { + DEBUG(errs() << "\nDFG2LLVM_EPOCHS PASS\n"); + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + return runDFG2LLVM_EPOCHS(M, DFG); +} +void CGT_EPOCHS::initTaskAPI() { + // Load Task API Module + SMDiagnostic Err; + + std::string TaskAPI = TASK_LIBRARY_PATH; + TaskModule = parseIRFile(TaskAPI, Err, M.getContext()); + if (TaskModule == nullptr) { + DEBUG(errs() << Err.getMessage() << " " << TaskAPI << "\n"); + assert(false && "Couldn't parse TaskAPI module!"); + } else + DEBUG(errs() << "Successfully loaded Task API module\n"); + + DECLARE_EPOCHS(initialize_task_lib, TaskModule); + IRBuilder<> Builder(InitCall); + // Issue initialize_task_lib call + CallInst *InitializeTaskLibInst = + Builder.CreateCall(initialize_task_lib, ArrayRef<Value *>()); + DEBUG(errs() << *InitializeTaskLibInst << "\n"); + + std::string TaskConfigFile = TASK_CONFIG_PATH; + struct stat buffer; + + // Check if file provided is accessible + if (stat(TaskConfigFile.c_str(), &buffer) != 0) { + DEBUG(errs() << "Invalid Task Config File provided: " << TaskConfigFile + << "!\n"); + assert(false && "Couldn't access task config file!"); + } + + // Populate task library info from the provided task + // description file. + + parseTaskFile(TaskConfigFile); +} +void CGT_EPOCHS::initSchedulerAPI() { + // Load Scheduler API Module + SMDiagnostic Err; + + std::string SchedulerAPI = SCHDULER_LIBRARY_PATH; + auto SchedulerModule = parseIRFile(SchedulerAPI, Err, M.getContext()); + if (SchedulerModule == nullptr) { + DEBUG(errs() << Err.getMessage() << " " << SchedulerAPI << "\n"); + assert(false && "Couldn't parse SchedulerAPI module!"); + } else + DEBUG(errs() << "Successfully loaded Scheduler API module\n"); + + DECLARE_EPOCHS(set_up_scheduler, SchedulerModule); + DECLARE_EPOCHS(initialize_scheduler_from_config_file, SchedulerModule); + DECLARE_EPOCHS(register_task_type, SchedulerModule); + DECLARE_EPOCHS(set_up_task, SchedulerModule); + DECLARE_EPOCHS(request_execution, SchedulerModule); + DECLARE_EPOCHS(finish_task_execution, SchedulerModule); + DECLARE_EPOCHS(wait_on_tasklist, SchedulerModule); + DECLARE_EPOCHS(shutdown_scheduler, SchedulerModule); + + + IRBuilder<> Builder(InitCall); + + // Issue set_up_scheduler call + CallInst *SetupSchedulerInst = + Builder.CreateCall(set_up_scheduler, ArrayRef<Value *>()); + DEBUG(errs() << *SetupSchedulerInst << "\n"); + + // Issue initialize_scheduler call + std::string SchedulerConfigFile = SCHDULER_CONFIG_PATH; + struct stat buffer; + // Check if file provided is accessible + if (stat(SchedulerConfigFile.c_str(), &buffer) != 0) { + DEBUG(errs() << "Invalid Scheduler Config File provided: " + << SchedulerConfigFile << "!\n"); + assert(false && "Couldn't access scheduler config file!"); + } + Value *ConfigFileStrPtr = Builder.CreateGlobalStringPtr(SchedulerConfigFile); + DEBUG(errs() << *ConfigFileStrPtr << "\n"); + Value *InitializeSchedulerArgs[] = {ConfigFileStrPtr}; + CallInst *InitSchedInst = Builder.CreateCall( + initialize_scheduler_from_config_file, + ArrayRef<Value *>(InitializeSchedulerArgs, 1), "scheduler"); + DEBUG(errs() << *InitSchedInst << "\n"); + + SchedulerHandleAddress = new GlobalVariable( + M, InitSchedInst->getType(), false, GlobalValue::CommonLinkage, + Constant::getNullValue(InitSchedInst->getType()), "scheduler.addr"); + DEBUG(errs() << "Store at: " << *SchedulerHandleAddress << "\n"); + auto *SI = Builder.CreateStore(InitSchedInst, SchedulerHandleAddress); + DEBUG(errs() << *SI << "\n"); + + // Insert the DagID Global Variable, initialized to 0 + // TODO: what to do when there are multiple DFGs? + DagIDGlobal = new GlobalVariable( + M, Type::getInt32Ty(M.getContext()), false, GlobalValue::CommonLinkage, + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0), "DagID"); + + DEBUG(errs() << *InitCall->getParent() << "\n"); + + + + // Add the shutdown scheduler call at the __hpvm__cleanup call. + + Builder.SetInsertPoint(CleanupCall); + + + // Load the Schedule Pointer + auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress); + std::vector<Value*> ShutdownArgs = {SchedulerPtr}; + auto *ShutdownCall = Builder.CreateCall(shutdown_scheduler, ArrayRef<Value *>(ShutdownArgs)); + + DEBUG(errs() <<"Shutdown call: "<< *ShutdownCall << "\n"); + + +} + +// Initialize the HPVM runtime API. This makes it easier to insert these calls +void CGT_EPOCHS::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + std::string runtimeAPI = std::string(LLVM_BUILD_DIR_STR) + + "/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc"; + + runtimeModule = parseIRFile(runtimeAPI, Err, M.getContext()); + if (runtimeModule == nullptr) { + DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n"); + assert(false && "couldn't parse runtime"); + } else + DEBUG(errs() << "Successfully loaded hpvm-rt API module\n"); + + // Get or insert the global declarations for launch/wait functions + DECLARE_EPOCHS(llvm_hpvm_cpu_launch, runtimeModule); + DECLARE_EPOCHS(malloc, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_cpu_wait, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_cpu_argument_ptr, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_streamLaunch, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_streamPush, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_streamPop, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_streamWait, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_createBindInBuffer, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_createBindOutBuffer, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_createEdgeBuffer, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_createLastInputBuffer, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_createThread, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_bufferPush, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_bufferPop, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_cpu_dstack_push, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_cpu_dstack_pop, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_cpu_getDimLimit, runtimeModule); + DECLARE_EPOCHS(llvm_hpvm_cpu_getDimInstance, runtimeModule); + + // Get or insert timerAPI functions as well if you plan to use timers + // initTimerAPI(); + + // Insert init context in main + Function *VI = M.getFunction("llvm.hpvm.init"); + assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once"); + // DEBUG(errs() << "Inserting cpu timer initialization\n"); + InitCall = cast<Instruction>(*VI->user_begin()); + // initializeTimerSet(InitCall); + // switchToTimer(hpvm_TimerID_NONE, I); + // Insert print instruction at hpvm exit + Function *VC = M.getFunction("llvm.hpvm.cleanup"); + assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once"); + CleanupCall = cast<Instruction>(*VC->user_begin()); + + // DEBUG(errs() << "Inserting cpu timer print\n"); + // printTimerSet(InitCall); +} + +/* Returns vector of all wait instructions + */ +std::vector<IntrinsicInst *> *CGT_EPOCHS::getUseList(Value *GraphID) { + std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>(); + // It must have been loaded from memory somewhere + for (Value::user_iterator ui = GraphID->user_begin(), + ue = GraphID->user_end(); + ui != ue; ++ui) { + if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) { + UseList->push_back(waitI); + } else { + llvm_unreachable("Error: Operation on Graph ID not supported!\n"); + } + } + return UseList; +} + +/* Add Loop around the instruction I + * Algorithm: + * (1) Split the basic block of instruction I into three parts, where the + * middleblock/body would contain instruction I. + * (2) Add phi node before instruction I. Add incoming edge to phi node from + * predecessor + * (3) Add increment and compare instruction to index variable + * (4) Replace terminator/branch instruction of body with conditional branch + * which loops over bidy if true and goes to end if false + * (5) Update phi node of body + */ +void CGT_EPOCHS::addWhileLoop(Instruction *CondBlockStart, + Instruction *BodyStart, Instruction *BodyEnd, + Value *TerminationCond) { + BasicBlock *Entry = CondBlockStart->getParent(); + BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); + BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); + BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); + + // Replace the terminator instruction of conditional with new conditional + // branch which goes to while.body if true and branches to while.end otherwise + BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); + ReplaceInstWithInst(CondBlock->getTerminator(), BI); + + // While Body should jump to condition block + BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock); + ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch); +} + +Instruction *CGT_EPOCHS::addWhileLoopCounter(BasicBlock *Entry, + BasicBlock *Cond, + BasicBlock *Body) { + Module *M = Entry->getParent()->getParent(); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + + // Insert a PHI instruction at the beginning of the condition block + Instruction *IB = Cond->getFirstNonPHI(); + PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB); + + ConstantInt *IConst = + ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); + Instruction *CounterIncr = + BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, + "cnt_incr", Body->getTerminator()); + + // Set incoming values for Phi node + IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true); + CounterPhi->addIncoming(IConst, Entry); + CounterPhi->addIncoming(CounterIncr, Body); + + // Return the pointer to the created PHI node in the corresponding argument + return CounterPhi; +} + +// Returns a packed struct type. The structtype is created by packing the input +// types, output types and isLastInput buffer type. All the streaming +// inputs/outputs are converted to i8*, since this is the type of buffer +// handles. +StructType *CGT_EPOCHS::getArgumentListStructTy(DFNode *C) { + std::vector<Type *> TyList; + // Input types + Function *CF = C->getFuncPointer(); + for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end(); + ai != ae; ++ai) { + if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge()) + TyList.push_back(Type::getInt8PtrTy(CF->getContext())); + else + TyList.push_back(ai->getType()); + } + // Output Types + StructType *OutStructTy = cast<StructType>(CF->getReturnType()); + for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) { + // All outputs of a node are streaming edge + assert(C->getOutDFEdgeAt(i)->isStreamingEdge() && + "All output edges of child node have to be streaming"); + TyList.push_back(Type::getInt8PtrTy(CF->getContext())); + } + // isLastInput buffer element + TyList.push_back(Type::getInt8PtrTy(CF->getContext())); + + StructType *STy = + StructType::create(CF->getContext(), TyList, + Twine("struct.thread." + CF->getName()).str(), true); + return STy; +} + +void CGT_EPOCHS::startNodeThread(DFNode *C, std::vector<Value *> Args, + DenseMap<DFEdge *, Value *> EdgeBufferMap, + Value *isLastInputBuffer, Value *graphID, + Instruction *IB) { + DEBUG(errs() << "Starting Pipeline for child node: " + << C->getFuncPointer()->getName() << "\n"); + // Create a filter/pipeline function for the child node + Function *C_Pipeline = createFunctionFilter(C); + Function *CF = C->getFuncPointer(); + + // Get module context and i32 0 constant, as they would be frequently used in + // this function. + LLVMContext &Ctx = IB->getParent()->getContext(); + Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + + // Marshall arguments + // Create a packed struct type with inputs of C followed by outputs and then + // another i8* to indicate isLastInput buffer. Streaming inputs are replaced + // by i8* + // + StructType *STy = getArgumentListStructTy(C); + // Allocate the struct on heap *NOT* stack and bitcast i8* to STy* + CallInst *CI = + CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)), + C->getFuncPointer()->getName() + ".inputs", IB); + CastInst *Struct = BitCastInst::CreatePointerCast( + CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB); + // AllocaInst* AI = new AllocaInst(STy, + // C->getFuncPointer()->getName()+".inputs", IB); + // Insert elements in the struct + DEBUG(errs() << "Marshall inputs for child node: " + << C->getFuncPointer()->getName() << "\n"); + // Marshall Inputs + for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) { + // Create constant int (i) + Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); + // Get Element pointer instruction + Value *GEPIndices[] = {IntZero, Int_i}; + GetElementPtrInst *GEP = GetElementPtrInst::Create( + nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), + Struct->getName() + ".arg_" + Twine(i), IB); + DFEdge *E = C->getInDFEdgeAt(i); + if (E->getSourceDF()->isEntryNode()) { + // This is a Bind Input Edge + if (E->isStreamingEdge()) { + // Streaming Bind Input edge. Get buffer corresponding to it + assert(EdgeBufferMap.count(E) && + "No mapping buffer for a Streaming Bind DFEdge!"); + new StoreInst(EdgeBufferMap[E], GEP, IB); + } else { + // Non-streaming Bind edge + new StoreInst(Args[i], GEP, IB); + } + } else { + // This is an edge between siblings. + // This must be an streaming edge. As it is our assumption that all edges + // between two nodes in a DFG are streaming. + assert(EdgeBufferMap.count(E) && + "No mapping buffer for a Streaming DFEdge!"); + new StoreInst(EdgeBufferMap[E], GEP, IB); + } + } + unsigned numInputs = CF->getFunctionType()->getNumParams(); + unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements(); + // Marshall Outputs + DEBUG(errs() << "Marshall outputs for child node: " + << C->getFuncPointer()->getName() << "\n"); + for (unsigned i = 0; i < numOutputs; i++) { + // Create constant int (i+numInputs) + Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs); + // Get Element pointer instruction + Value *GEPIndices[] = {IntZero, Int_i}; + GetElementPtrInst *GEP = GetElementPtrInst::Create( + nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), + Struct->getName() + ".out_" + Twine(i), IB); + DFEdge *E = C->getOutDFEdgeAt(i); + assert(E->isStreamingEdge() && + "Output Edge must be streaming of all nodes"); + assert(EdgeBufferMap.count(E) && + "No mapping buffer for a Out Streaming DFEdge!"); + new StoreInst(EdgeBufferMap[E], GEP, IB); + } + // Marshall last argument. isLastInput buffer + DEBUG(errs() << "Marshall isLastInput for child node: " + << C->getFuncPointer()->getName() << "\n"); + // Create constant int (i+numInputs) + Constant *Int_index = + ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs); + // Get Element pointer instruction + Value *GEPIndices[] = {IntZero, Int_index}; + GetElementPtrInst *GEP = GetElementPtrInst::Create( + nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2), + Struct->getName() + ".isLastInput", IB); + new StoreInst(isLastInputBuffer, GEP, IB); + + // AllocaInst AI points to memory with all the arguments packed + // Call runtime to create the thread with these arguments + DEBUG(errs() << "Start Thread for child node: " + << C->getFuncPointer()->getName() << "\n"); + // DEBUG(errs() << *llvm_hpvm_createThread << "\n"); + DEBUG(errs() << *graphID->getType() << "\n"); + DEBUG(errs() << *C_Pipeline->getType() << "\n"); + DEBUG(errs() << *Struct->getType() << "\n"); + // Bitcast AI to i8* + CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), + Struct->getName(), IB); + Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI}; + CallInst::Create(llvm_hpvm_createThread, + ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); +} + +Function *CGT_EPOCHS::createLaunchFunction(DFInternalNode *N) { + DEBUG(errs() << "Generating Streaming Launch Function\n"); + // Get Function associated with Node N + Function *NF = N->getFuncPointer(); + + // Map from Streaming edge to buffer + DenseMap<DFEdge *, Value *> EdgeBufferMap; + + /* Now we have all the necessary global declarations necessary to generate the + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. + * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) + * (2) Extract each of inputs from data.addr + * (3) create Buffers for all the streaming edges + * - Put buffers in the context + * (4) Go over each child node + * - marshall its arguments together (use buffers in place of streaming + * arguments) + * - Start the threads + * (5) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ + // (1) Create Launch Function of type void (i8* args, i8* GraphID) + Type *i8Ty = Type::getInt8Ty(M.getContext()); + Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; + FunctionType *LaunchFuncTy = FunctionType::get( + Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false); + Function *LaunchFunc = Function::Create( + LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M); + DEBUG(errs() << "Generating Code for Streaming Launch Function\n"); + // Give a name to the argument which is used pass data to this thread + Argument *data = &*LaunchFunc->arg_begin(); + // NOTE-HS: Check correctness with Maria + Argument *graphID = &*(LaunchFunc->arg_begin() + 1); + data->setName("data.addr"); + graphID->setName("graphID"); + // Add a basic block to this empty function and a return null statement to it + DEBUG(errs() << *LaunchFunc->getReturnType() << "\n"); + BasicBlock *BB = + BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); + ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB); + + DEBUG(errs() << "Created Empty Launch Function\n"); + + // (2) Extract each of inputs from data.addr + std::vector<Type *> TyList; + std::vector<std::string> names; + std::vector<Value *> Args; + + for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end(); + ai != ae; ++ai) { + if (N->getChildGraph() + ->getEntry() + ->getOutDFEdgeAt(ai->getArgNo()) + ->isStreamingEdge()) { + TyList.push_back(i8Ty->getPointerTo()); + names.push_back(Twine(ai->getName() + "_buffer").str()); + continue; + } + TyList.push_back(ai->getType()); + names.push_back(ai->getName()); + } + Args = extractElements(data, TyList, names, RI); + DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc + << "\n"); + // (3) Create buffers for all the streaming edges + for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), + de = N->getChildGraph()->dfedge_end(); + di != de; ++di) { + DFEdge *Edge = *di; + DEBUG(errs() << *Edge->getType() << "\n"); + Value *size = ConstantExpr::getSizeOf(Edge->getType()); + Value *CallArgs[] = {graphID, size}; + if (Edge->isStreamingEdge()) { + CallInst *CI; + // Create a buffer call + if (Edge->getSourceDF()->isEntryNode()) { + // Bind Input Edge + Constant *Int_ArgNo = ConstantInt::get( + Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition()); + Value *BindInCallArgs[] = {graphID, size, Int_ArgNo}; + CI = CallInst::Create( + llvm_hpvm_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3), + "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI); + } else if (Edge->getDestDF()->isExitNode()) { + // Bind Output Edge + CI = CallInst::Create( + llvm_hpvm_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2), + "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI); + } else { + // Streaming Edge + CI = CallInst::Create( + llvm_hpvm_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2), + Edge->getSourceDF()->getFuncPointer()->getName() + "." + + Edge->getDestDF()->getFuncPointer()->getName(), + RI); + } + EdgeBufferMap[Edge] = CI; + } + } + // Create buffer for isLastInput for all the child nodes + DFGraph *G = N->getChildGraph(); + DenseMap<DFNode *, Value *> NodeLastInputMap; + for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; + ++ci) { + DFNode *child = *ci; + if (child->isDummyNode()) + continue; + Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); + Value *CallArgs[] = {graphID, size}; + CallInst *CI = CallInst::Create( + llvm_hpvm_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2), + "BindIn.isLastInput." + child->getFuncPointer()->getName(), RI); + NodeLastInputMap[child] = CI; + } + DEBUG(errs() << "Start Each child node filter\n"); + // (4) Marshall arguments for each child node and start the thread with its + // pipeline funtion + for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); + ci != ce; ++ci) { + DFNode *C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + // Marshall all the arguments for this node into an i8* + // Pass to the runtime to create the thread + // Start the thread for child node C + startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI); + } + + DEBUG(errs() << "Launch function:\n"); + DEBUG(errs() << *LaunchFunc << "\n"); + + return LaunchFunc; +} + +/* This fuction does the steps necessary to launch a streaming graph + * Steps + * Create Pipeline/Filter function for each node in child graph of Root + * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait + * Modify each of the instrinsic in host code + * Launch, Push, Pop, Wait + */ +void CGT_EPOCHS::codeGenLaunchStreaming(DFInternalNode *Root) { + IntrinsicInst *LI = Root->getInstruction(); + Function *RootLaunch = createLaunchFunction(Root); + // Substitute launch intrinsic main + DEBUG(errs() << "Substitute launch intrinsic\n"); + Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)}; + CallInst *LaunchInst = CallInst::Create( + llvm_hpvm_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2), + "graph" + Root->getFuncPointer()->getName(), LI); + + DEBUG(errs() << *LaunchInst << "\n"); + // Replace all wait instructions with cpu specific wait instructions + DEBUG(errs() << "Substitute wait, push, pop intrinsics\n"); + std::vector<IntrinsicInst *> *UseList = getUseList(LI); + for (unsigned i = 0; i < UseList->size(); ++i) { + IntrinsicInst *II = UseList->at(i); + CallInst *CI; + Value *PushArgs[] = {LaunchInst, II->getOperand(1)}; + switch (II->getIntrinsicID()) { + case Intrinsic::hpvm_wait: + CI = CallInst::Create(llvm_hpvm_streamWait, ArrayRef<Value *>(LaunchInst), + ""); + break; + case Intrinsic::hpvm_push: + CI = CallInst::Create(llvm_hpvm_streamPush, + ArrayRef<Value *>(PushArgs, 2), ""); + break; + case Intrinsic::hpvm_pop: + CI = CallInst::Create(llvm_hpvm_streamPop, ArrayRef<Value *>(LaunchInst), + ""); + break; + default: + llvm_unreachable( + "GraphID is used by an instruction other than wait, push, pop"); + }; + DEBUG(errs() << "Replace:\n\t" << *II << "\n"); + ReplaceInstWithInst(II, CI); + DEBUG(errs() << "\twith " << *CI << "\n"); + } +} + +void CGT_EPOCHS::codeGenLaunch(DFInternalNode *Root) { + // TODO: Place an assert to check if the constant passed by launch intrinsic + // as the number of arguments to DFG is same as the number of arguments of the + // root of DFG + DEBUG(errs() << "Generating Launch Function\n"); + // Get Launch Instruction + IntrinsicInst *LI = Root->getInstruction(); + // switchToTimer(hpvm_TimerID_PTHREAD_CREATE, LI); + DEBUG(errs() << "Generating Launch Function\n"); + + /* Now we have all the necessary global declarations necessary to generate the + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. + * (1) Create an empty Launch function of type i8*(i8*) + * (2) Extract each of inputs from data.addr and pass them as arguments to the + * call to Root function + * (3) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ + // Create Launch Function of type i8*(i8*) which calls the root function + Type *i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType *AppFuncTy = FunctionType::get( + i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false); + Function *AppFunc = + Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(), + "LaunchDataflowGraph", &M); + DEBUG(errs() << "Generating Launch Function\n"); + // Give a name to the argument which is used pass data to this thread + Value *data = &*AppFunc->arg_begin(); + data->setName("data.addr"); + // Add a basic block to this empty function and a return null statement to it + BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc); + ReturnInst *RI = + ReturnInst::Create(AppFunc->getContext(), + Constant::getNullValue(AppFunc->getReturnType()), BB); + // switchToTimer(hpvm_TimerID_ARG_UNPACK, RI); + + DEBUG(errs() << "Created Empty Launch Function\n"); + // Find the CPU function generated for Root and + // Function* RootF_CPU = Root->getGenFunc(); + Function *RootF_CPU = Root->getGenFuncForTarget(hpvm::CPU_TARGET); + assert(RootF_CPU && "Error: No generated CPU function for Root node\n"); + assert(Root->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) && + "Error: Generated Function for Root node with no cpu wrapper\n"); + + // Generate a call to RootF_CPU with null parameters for now + std::vector<Value *> Args; + for (unsigned i = 0; i < RootF_CPU->getFunctionType()->getNumParams(); i++) { + Args.push_back( + Constant::getNullValue(RootF_CPU->getFunctionType()->getParamType(i))); + } + CallInst *CI = + CallInst::Create(RootF_CPU, Args, RootF_CPU->getName() + ".output", RI); + + // Extract input data from i8* data.addr and patch them to correct argument of + // call to RootF_CPU. For each argument + std::vector<Type *> TyList; + std::vector<std::string> names; + for (Function::arg_iterator ai = RootF_CPU->arg_begin(), + ae = RootF_CPU->arg_end(); + ai != ae; ++ai) { + TyList.push_back(ai->getType()); + names.push_back(ai->getName()); + } + std::vector<Value *> elements = extractElements(data, TyList, names, CI); + // Patch the elements to the call arguments + for (unsigned i = 0; i < CI->getNumArgOperands(); i++) + CI->setArgOperand(i, elements[i]); + + // Add timers around Call to RootF_CPU function + // switchToTimer(hpvm_TimerID_COMPUTATION, CI); + // switchToTimer(hpvm_TimerID_OUTPUT_PACK, RI); + + StructType *RootRetTy = + cast<StructType>(RootF_CPU->getFunctionType()->getReturnType()); + + // if Root has non empty return + if (RootRetTy->getNumElements()) { + // We can't access the type of the arg struct - build it + std::vector<Type *> TyList; + for (Function::arg_iterator ai = RootF_CPU->arg_begin(), + ae = RootF_CPU->arg_end(); + ai != ae; ++ai) { + TyList.push_back(ai->getType()); + } + TyList.push_back(CI->getType()); + + StructType *ArgStructTy = StructType::create( + M.getContext(), ArrayRef<Type *>(TyList), + (RootF_CPU->getName() + ".arg.struct.ty").str(), true); + + // Cast the data pointer to the type of the arg struct + CastInst *OutputAddrCast = CastInst::CreatePointerCast( + data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI); + + // Result struct is the last element of the packed struct passed to launch + unsigned outStructIdx = ArgStructTy->getNumElements() - 1; + + ConstantInt *IntZero = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + ConstantInt *IntIdx = + ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx); + + Value *GEPIIdxList[] = {IntZero, IntIdx}; + // Get data pointer to the last element of struct - result field + GetElementPtrInst *OutGEPI = GetElementPtrInst::Create( + ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2), + CI->getName() + ".addr", RI); + // Store result there + new StoreInst(CI, OutGEPI, RI); + } else { + // There is no return - no need to actually code gen, but for fewer + // changes maintain what code was already doing + // We were casting the data pointer to the result type of Root, and + // returning result there. This would work at the LLVM level, but not + // at the C level, thus the rewrite. + CastInst *OutputAddrCast = CastInst::CreatePointerCast( + data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI); + new StoreInst(CI, OutputAddrCast, RI); + } + + // switchToTimer(hpvm_TimerID_NONE, RI); + + DEBUG(errs() << "Application specific function:\n"); + DEBUG(errs() << *AppFunc << "\n"); + + // Substitute launch intrinsic main + Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)}; + CallInst *LaunchInst = CallInst::Create( + llvm_hpvm_cpu_launch, ArrayRef<Value *>(LaunchInstArgs, 2), + "graph" + Root->getFuncPointer()->getName(), LI); + // ReplaceInstWithInst(LI, LaunchInst); + + DEBUG(errs() << *LaunchInst << "\n"); + // Replace all wait instructions with cpu specific wait instructions + std::vector<IntrinsicInst *> *UseList = getUseList(LI); + for (unsigned i = 0; i < UseList->size(); ++i) { + IntrinsicInst *II = UseList->at(i); + CallInst *CI; + switch (II->getIntrinsicID()) { + case Intrinsic::hpvm_wait: + CI = CallInst::Create(llvm_hpvm_cpu_wait, ArrayRef<Value *>(LaunchInst), + ""); + break; + case Intrinsic::hpvm_push: + CI = CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(LaunchInst), + ""); + break; + case Intrinsic::hpvm_pop: + CI = CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(LaunchInst), + ""); + break; + default: + llvm_unreachable( + "GraphID is used by an instruction other than wait, push, pop"); + }; + ReplaceInstWithInst(II, CI); + DEBUG(errs() << *CI << "\n"); + } +} + +/* This function takes a DFNode, and creates a filter function for it. By filter + * function we mean a function which keeps on getting input from input buffers, + * applying the function on the inputs and then pushes data on output buffers + */ +// Create a function with void* (void*) type. +// Create a new basic block +// Add a return instruction to the basic block +// extract arguments from the aggregate data input. Type list would be +// Replace the streaming inputs with i8* types signifying handle to +// corresponding buffers +// Add a boolean argument isLastInput +// Add runtime API calls to get input for each of the streaming inputs +// Add a call to the generated function of the child node +// Add runtime API calls to push output for each of the streaming outputs +// Add loop around the basic block, which exits the loop if isLastInput is false + +Function *CGT_EPOCHS::createFunctionFilter(DFNode *C) { + DEBUG(errs() << "*********Creating Function filter for " + << C->getFuncPointer()->getName() << "*****\n"); + + /* Create a function with same argument list as child.*/ + DEBUG(errs() << "\tCreate a function with the same argument list as child\n"); + // Get the generated function for child node + Function *CF = C->getFuncPointer(); + // Create Filter Function of type i8*(i8*) which calls the root function + Type *i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType *CF_PipelineTy = FunctionType::get( + i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false); + Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(), + CF->getName() + "_Pipeline", &M); + DEBUG(errs() << "Generating Pipeline Function\n"); + // Give a name to the argument which is used pass data to this thread + Value *data = &*CF_Pipeline->arg_begin(); + data->setName("data.addr"); + // Create a new basic block + DEBUG(errs() << "\tCreate new BB and add a return function\n"); + // Add a basic block to this empty function + BasicBlock *BB = + BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline); + // Add a return instruction to the basic block + ReturnInst *RI = + ReturnInst::Create(CF_Pipeline->getContext(), + UndefValue::get(CF_Pipeline->getReturnType()), BB); + + /* Extract the elements from the aggregate argument to the function. + * Replace the streaming inputs with i8* types signifying handle to + * corresponding buffers + * Add outputs to the list as well + * Add isLastInput to the list + */ + DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n"); + // These Args will be used when passing arguments to the generated function + // inside loop, and reading outputs as well. + std::vector<Value *> Args; + std::vector<Type *> TyList; + std::vector<std::string> names; + // Adding inputs + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e; + ++i) { + if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + TyList.push_back(i8Ty->getPointerTo()); + names.push_back((Twine(i->getName()) + "_buffer").str()); + } else { + TyList.push_back(i->getType()); + names.push_back(i->getName()); + } + } + // Adding outputs. FIXME: Since we assume all outputs to be streaming edges, + // because we get there buffer handles + StructType *RetTy = cast<StructType>(CF->getReturnType()); + for (unsigned i = 0; i < RetTy->getNumElements(); i++) { + TyList.push_back(i8Ty->getPointerTo()); + names.push_back("out"); + } + /* Add a boolean argument isLastInput */ + DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n"); + TyList.push_back(i8Ty->getPointerTo()); + names.push_back("isLastInput_buffer"); + + // Extract the inputs, outputs + Args = extractElements(data, TyList, names, RI); + for (unsigned i = 0; i < Args.size(); i++) { + DEBUG(errs() << *Args[i] << "\n"); + } + + // Split the Args vector into, input output and isLastInput + unsigned numInputs = CF->getFunctionType()->getNumParams(); + unsigned numOutputs = RetTy->getNumElements(); + std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs); + std::vector<Value *> OutputArgs(Args.begin() + numInputs, + Args.begin() + numInputs + numOutputs); + Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]); + + /* Add runtime API calls to get input for each of the streaming input edges */ + DEBUG(errs() << "\tAdd runtime API calls to get input for each of the " + "streaming input edges\n"); + // First read the termination condition variable islastInput + CallInst *isLastInputPop = CallInst::Create( + llvm_hpvm_bufferPop, ArrayRef<Value *>(isLastInput), "", RI); + + CastInst *BI = BitCastInst::CreateIntegerCast( + isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false, + "isLastInput", RI); + isLastInput = BI; + // Create a loop termination condition + CmpInst *Cond = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_NE, isLastInput, + Constant::getNullValue(Type::getInt64Ty(CF->getContext())), + "isLastInputNotZero", RI); + + // Get input from buffers of all the incoming streaming edges + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e; + ++i) { + if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + CallInst *bufferIn = + CallInst::Create(llvm_hpvm_bufferPop, + ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI); + CastInst *BI; + if (i->getType()->isPointerTy()) { + BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(), + i->getName() + ".addr", RI); + } else if (i->getType()->isFloatTy()) { + BI = CastInst::CreateFPCast(bufferIn, i->getType(), + i->getName() + ".addr", RI); + } else { + BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false, + i->getName() + ".addr", RI); + } + // Replace the argument in Args vector. We would be using the vector as + // parameters passed to the call + InputArgs[i->getArgNo()] = BI; + } + } + /* Add a call to the generated function of the child node */ + DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); + // DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); + // CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, + // C->getGenFunc()->getName()+".output", RI); + Function *CGenF = C->getGenFuncForTarget(hpvm::CPU_TARGET); + DEBUG(errs() << "Type: " << *CGenF->getType() << "\n"); + CallInst *CI = + CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI); + + /* Add runtime API calls to push output for each of the streaming outputs */ + // FIXME: Assumption + // All edges between siblings are streaming edges + DEBUG(errs() << "\tAdd runtime API calls to push output for each of the " + "streaming outputs\n"); + for (unsigned i = 0; i < numOutputs; i++) { + // Extract output + ExtractValueInst *EI = + ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI); + // Convert to i64 + CastInst *BI; + if (EI->getType()->isPointerTy()) + BI = + CastInst::Create(CastInst::PtrToInt, EI, + Type::getInt64Ty(CF_Pipeline->getContext()), "", RI); + else + BI = CastInst::CreateIntegerCast( + EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI); + // Push to Output buffer + Value *bufferOutArgs[] = {OutputArgs[i], BI}; + CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), + "", RI); + } + + // Add loop around the basic block, which exits the loop if isLastInput is + // false Pointers to keep the created loop structure + Instruction *CondStartI = cast<Instruction>(isLastInputPop); + Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode(); + addWhileLoop(CondStartI, BodyStartI, RI, Cond); + + // Return the Function pointer + DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n"); + DEBUG(errs() << *CF_Pipeline << "\n"); + return CF_Pipeline; +} +// Redefining invokeChild() for CGT_EPOCHS because we need to use member +// variables like the SPTR and the TASKDEFINITION +void CGT_EPOCHS::invokeChild(DFNode *C, Function *F_CPU, + ValueToValueMapTy &VMap, Instruction *IB, + hpvm::Target Tag) { + Function *CF = C->getFuncPointer(); + if (Tag == hpvm::CPU_TARGET) { + + // Function* CF_CPU = C->getGenFunc(); + Function *CF_CPU = C->getGenFuncForTarget(Tag); + assert(CF_CPU != NULL && + "Found leaf node for which code generation has not happened yet!\n"); + assert(C->hasCPUGenFuncForTarget(Tag) && + "The generated function to be called from cpu backend is not an cpu " + "function\n"); + DEBUG(errs() << "Invoking child node" << CF_CPU->getName() << "\n"); + + std::vector<Value *> Args; + // Create argument list to pass to call instruction + // First find the correct values using the edges + // The remaing six values are inserted as constants for now. + for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) { + Args.push_back(getInValueAt(C, i, F_CPU, IB)); + } + + Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_CPU->getContext()), 0); + for (unsigned j = 0; j < 6; j++) + Args.push_back(I64Zero); + + DEBUG(errs() << "Gen Function type: " << *CF_CPU->getType() << "\n"); + DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n"); + DEBUG(errs() << "Arguments: "; for (const Value *Arg + : Args) errs() + << *(Arg->getType()) << ", "; + errs() << "\n"); + + // Copying over index calculation. + copyChildIndexCalc(C, F_CPU, VMap, IB); + + // Call the F_CPU function associated with this node + CallInst *CI = + CallInst::Create(CF_CPU, Args, CF_CPU->getName() + "_output", IB); + DEBUG(errs() << *CI << "\n"); + OutputMap[C] = CI; + + // Find num of dimensions this node is replicated in. + // Based on number of dimensions, insert loop instructions + std::string varNames[3] = {"x", "y", "z"}; + unsigned numArgs = CI->getNumArgOperands(); + for (unsigned j = 0; j < C->getNumOfDim(); j++) { + Value *indexLimit = NULL; + // Limit can either be a constant or an arguement of the internal node. + // In case of constant we can use that constant value directly in the + // new F_CPU function. In case of an argument, we need to get the mapped + // value using VMap + if (ConstantInt *ConstDimLimit = + dyn_cast<ConstantInt>(C->getDimLimits()[j])) { + indexLimit = C->getDimLimits()[j]; + DEBUG(errs() << "In Constant case:\n" + << " indexLimit type = " << *indexLimit->getType() + << "\n"); + if (ConstDimLimit->getZExtValue() == 1) { + DEBUG(errs() << "DimLimit is 1, no need for loop!\n"); + continue; + } + } else if (isa<Constant>(C->getDimLimits()[j])) { + indexLimit = C->getDimLimits()[j]; + DEBUG(errs() << "In Constant case:\n" + << " indexLimit type = " << *indexLimit->getType() + << "\n"); + } else { + indexLimit = VMap[C->getDimLimits()[j]]; + DEBUG(errs() << "In VMap case:" + << " indexLimit type = " << *indexLimit->getType() + << "\n"); + } + assert(indexLimit && "Invalid dimension limit!"); + // Insert loop + Value *indexVar = addLoop(CI, indexLimit, varNames[j]); + DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n"); + // Insert index variable and limit arguments + CI->setArgOperand(numArgs - 6 + j, indexVar); + CI->setArgOperand(numArgs - 3 + j, indexLimit); + } + + // Insert call to runtime to push the dim limits and instanceID on the depth + // stack + Value *args[] = { + ConstantInt::get(Type::getInt32Ty(CI->getContext()), + C->getNumOfDim()), // numDim + CI->getArgOperand(numArgs - 3 + 0), // limitX + CI->getArgOperand(numArgs - 6 + 0), // iX + CI->getArgOperand(numArgs - 3 + 1), // limitY + CI->getArgOperand(numArgs - 6 + 1), // iY + CI->getArgOperand(numArgs - 3 + 2), // limitZ + CI->getArgOperand(numArgs - 6 + 2) // iZ + }; + + CallInst *Push = CallInst::Create(llvm_hpvm_cpu_dstack_push, + ArrayRef<Value *>(args, 7), "", CI); + DEBUG(errs() << "Push on stack: " << *Push << "\n"); + // Insert call to runtime to pop the dim limits and instanceID from the + // depth stack + BasicBlock::iterator i(CI); + ++i; + Instruction *NextI = &*i; + // Next Instruction should also belong to the same basic block as the basic + // block will have a terminator instruction + assert(NextI->getParent() == CI->getParent() && + "Next Instruction should also belong to the same basic block!"); + + CallInst *Pop = CallInst::Create(llvm_hpvm_cpu_dstack_pop, None, "", NextI); + DEBUG(errs() << "Pop from stack: " << *Pop << "\n"); + DEBUG(errs() << *CI->getParent()->getParent()); + } else if (Tag == hpvm::EPOCHS_TARGET) { + // If this is an EPOCHS_TARGET node, we want to issue the setup_task call + IRBuilder<> Builder(IB); + DFLeafNode *CLeaf = dyn_cast<DFLeafNode>(C); + std::vector<Value *> SetupTaskArgs; + + // Load the Schedule Pointer + auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress); + DEBUG(errs() << *SchedulerPtr << "\n"); + auto *SchedulerPtrBC = Builder.CreateBitCast( + SchedulerPtr, Type::getInt8PtrTy(M.getContext()), "scheduler_handle"); + DEBUG(errs() << *SchedulerPtrBC << "\n"); + SetupTaskArgs.push_back(SchedulerPtrBC); + + // Load the Task handle + auto TI = TaskHandleMap[CLeaf]; + auto *TaskHandleAddr = TI.TaskHandleAddr; + auto *TaskHandle = Builder.CreateLoad(TaskHandleAddr); + DEBUG(errs() << *TaskHandleAddr << "\n"); + DEBUG(errs() << *TaskHandle << "\n"); + SetupTaskArgs.push_back(TaskHandle); + + // Get the criticality + auto Criticality = CLeaf->getCriticality(); + if (Criticality < 2) + Criticality = 1; + else if (Criticality > 2) + Criticality = 3; + Value *CriticalityVal = + ConstantInt::get(Type::getInt32Ty(M.getContext()), Criticality); + DEBUG(errs() << *CriticalityVal << "\n"); + SetupTaskArgs.push_back(CriticalityVal); + + // Get the AutoFinish. If node does not have any Out DFEdges then it should + // have autofinish. + Value *AutoFinish; + if (CLeaf->outdfedge_empty()) + AutoFinish = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1); + else + AutoFinish = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + DEBUG(errs() << *AutoFinish << "\n"); + SetupTaskArgs.push_back(AutoFinish); + + // Load the DAGId + auto *DagID = Builder.CreateLoad(DagIDGlobal); + DEBUG(errs() << *DagID << "\n"); + SetupTaskArgs.push_back(DagID); + + // Create argument list to pass to set_up_args + // Find the correct values using the edges + for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) { + auto *InValueArg = getInValueAt(C, i, F_CPU, IB); + DEBUG(errs() << i << ": " << *InValueArg << "\n"); + SetupTaskArgs.push_back(InValueArg); + } + + CallInst *SetupTaskInst = + Builder.CreateCall(set_up_task, ArrayRef<Value *>(SetupTaskArgs), + "setup_task_" + TI.TaskName); + DEBUG(errs() << *SetupTaskInst << "\n"); + + auto *TaskMBBlockAddr = new GlobalVariable( + M, SetupTaskInst->getType(), false, GlobalValue::CommonLinkage, + Constant::getNullValue(SetupTaskInst->getType()), + "MB_" + TI.TaskName + ".addr"); + DEBUG(errs() << *TaskMBBlockAddr << "\n"); + auto *SI = Builder.CreateStore(SetupTaskInst, TaskMBBlockAddr); + DEBUG(errs() << *SI << "\n"); + + TaskMBMap[CLeaf] = TaskMBBlockAddr; + + Value *RequestExecArgs[] = {SetupTaskInst}; + CallInst *RequestExecutionInst = Builder.CreateCall( + request_execution, ArrayRef<Value *>(RequestExecArgs)); + DEBUG(errs() << *RequestExecutionInst << "\n"); + } +} + +Value *CGT_EPOCHS::getInValueAt(DFNode *Child, unsigned i, + Function *ParentF_CPU, + Instruction *InsertBefore) { + Value *inputVal; + // TODO: Assumption is that each input port of a node has just one + // incoming edge. May change later on. + hpvm::Target Tag = Child->getTag(); + // Find the incoming edge at the requested input port + DEBUG(errs() << "Finding incoming edge " << i << " for " + << Child->getFuncPointer()->getName() << "\n"); + DFEdge *E = Child->getInDFEdgeAt(i); + assert(E && "No incoming edge or binding for input element!"); + // Find the Source DFNode associated with the incoming edge + DFNode *SrcDF = E->getSourceDF(); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + if (SrcDF->isEntryNode()) { + inputVal = getArgumentAt(ParentF_CPU, E->getSourcePosition()); + DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); + } else { + if (SrcDF->getTag() == hpvm::CPU_TARGET) { + // edge is from a sibling + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) && + "Source node call not found. Dependency violation!"); + + // Find CallInst associated with the Source DFNode using OutputMap + Value *CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n"); + ExtractValueInst *EI = + ExtractValueInst::Create(CI, IndexList, "", InsertBefore); + inputVal = EI; + } else if (SrcDF->getTag() == hpvm::EPOCHS_TARGET) { + // TODO: Check how to handle internal node source + // Assumption: source can only be a leaf node + assert(isa<DFLeafNode>(SrcDF) && + "Source can only be a leaf node for now!"); + DFLeafNode *SrcLeaf = dyn_cast<DFLeafNode>(SrcDF); + + // Issue the call to finish() + finishNodeEPOCHS(SrcLeaf, InsertBefore, ParentF_CPU); + + // Get the inputVal + auto OutPos = E->getSourcePosition(); + // The returnArgMap tells us which argument in the leaf node corresponds + // to this output. We need to get the corresponding argument in the + // wrapper. + auto LeafArgNum = returnArgMap[SrcLeaf][OutPos]; + auto *InEdge = SrcLeaf->getInDFEdgeAt(LeafArgNum); + auto ParentArgNum = InEdge->getSourcePosition(); + inputVal = getArgumentAt(ParentF_CPU, ParentArgNum); + // inputVal = getArgumentAt(ParentF_CPU, + // returnArgMap[SrcLeaf][E->getSourcePosition()]); + } + } + return inputVal; +} + +void CGT_EPOCHS::finishNodeEPOCHS(DFLeafNode *Leaf, Instruction *IB, + Function *ParentF_CPU) { + if (FinishedTasks.find(Leaf) != FinishedTasks.end()) { + // Task has already been finished. Exit; + return; + } + + DEBUG(errs() << "Finishing Task: \n"); + + IRBuilder<> Builder(IB); + std::vector<Value *> FinishArgs; + auto *TaskMBBlockAddress = TaskMBMap[Leaf]; + auto *TaskMBBlock = Builder.CreateLoad(TaskMBBlockAddress); + DEBUG(errs() << *TaskMBBlockAddress << "\n"); + DEBUG(errs() << *TaskMBBlock << "\n"); + FinishArgs.push_back(TaskMBBlock); + + for (auto ei = Leaf->outdfedge_begin(); ei != Leaf->outdfedge_end(); ++ei) { + auto *E = *ei; + DEBUG(E->dump()); + auto OutPos = E->getSourcePosition(); + // The returnArgMap tells us which argument in the leaf node corresponds to + // this output. We need to get the corresponding argument in the wrapper. + auto LeafArgNum = returnArgMap[Leaf][OutPos]; + auto *InEdge = Leaf->getInDFEdgeAt(LeafArgNum); + auto ParentArgNum = InEdge->getSourcePosition(); + Argument *OutArg = getArgumentAt(ParentF_CPU, ParentArgNum); + DEBUG(errs() << *OutArg << "\n"); + if (std::find(FinishArgs.begin(), FinishArgs.end(), OutArg) == + FinishArgs.end()) + FinishArgs.push_back(OutArg); + } + + auto *FinishCall = + Builder.CreateCall(finish_task_execution, ArrayRef<Value *>(FinishArgs)); + DEBUG(errs() << *FinishCall << "\n"); + FinishedTasks.insert(Leaf); + + // Add explicit wait call before finish call + waitOnTask(Leaf, FinishCall); + + + +} + +void CGT_EPOCHS::waitOnTask(DFLeafNode *Leaf, Instruction *IB) { + + DEBUG(errs() << "Adding wait call \n"); + + std::vector<Value *> WaitArgs; + IRBuilder<> Builder(IB); + + // Load the Schedule Pointer + auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress); + DEBUG(errs() << *SchedulerPtr << "\n"); + auto *SchedulerPtrBC = Builder.CreateBitCast( + SchedulerPtr, Type::getInt8PtrTy(M.getContext()), "scheduler_handle"); + DEBUG(errs() << *SchedulerPtrBC << "\n"); + WaitArgs.push_back(SchedulerPtrBC); + + + // Add number of tasks to wait on + // (In this case one task) + WaitArgs.push_back(ConstantInt::get(Type::getInt32Ty(M.getContext()), 1)); + + + auto *TaskMBBlockAddress = TaskMBMap[Leaf]; + auto *TaskMBBlock = Builder.CreateLoad(TaskMBBlockAddress); + DEBUG(errs() << *TaskMBBlockAddress << "\n"); + DEBUG(errs() << *TaskMBBlock << "\n"); + WaitArgs.push_back(TaskMBBlock); + + + + auto *WaitCall = + Builder.CreateCall(wait_on_tasklist, ArrayRef<Value *>(WaitArgs)); + DEBUG(errs() << *WaitCall << "\n"); + +} + +void CGT_EPOCHS::codeGen(DFInternalNode *N) { + // Check if N is root node and its graph is streaming. We do not do codeGen + // for Root in such a case + if (N->isRoot() && N->isChildGraphStreaming()) + return; + + // Check if clone already exists. If it does, it means we have visited this + // function before and nothing else needs to be done for this leaf node. + // if(N->getGenFunc() != NULL) + // return; + if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) { + DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() + << " : skipping it\n"); + return; + } + + assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL && + "Error: Visiting a node for which code already generated\n"); + + // Sort children in topological order before code generation + N->getChildGraph()->sortChildren(); + + // Only process if all children have a CPU cpu function + // Otherwise skip to end + bool codeGen = true; + for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); + ci != ce; ++ci) { + DFNode *C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + if (!(C->hasCPUGenFuncForTarget(hpvm::CPU_TARGET)) && + !(C->hasCPUGenFuncForTarget(hpvm::EPOCHS_TARGET))) { + DEBUG(errs() << "No CPU & EPOCHS cpu version for child node " + << C->getFuncPointer()->getName() + << "\n Skip code gen for parent node " + << N->getFuncPointer()->getName() << "\n"); + codeGen = false; + } + } + + if (codeGen) { + Function *F = N->getFuncPointer(); + // Create of clone of F with no instructions. Only the type is the same as F + // without the extra arguments. + Function *F_CPU; + + // Clone the function, if we are seeing this function for the first time. We + // only need a clone in terms of type. + ValueToValueMapTy VMap; + + // Create new function with the same type + F_CPU = Function::Create(F->getFunctionType(), F->getLinkage(), + F->getName(), &M); + + // Loop over the arguments, copying the names of arguments over. + Function::arg_iterator dest_iterator = F_CPU->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); // Copy the name over... + // Increment dest iterator + ++dest_iterator; + } + + // Add a basic block to this empty function + BasicBlock *BB = BasicBlock::Create(F_CPU->getContext(), "entry", F_CPU); + ReturnInst *RI = ReturnInst::Create( + F_CPU->getContext(), UndefValue::get(F_CPU->getReturnType()), BB); + + // Add Index and Dim arguments except for the root node and the child graph + // of parent node is not streaming + if (!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + F_CPU = addIdxDimArgs(F_CPU); + + BB = &*F_CPU->begin(); + RI = cast<ReturnInst>(BB->getTerminator()); + + // Add generated function info to DFNode + // N->setGenFunc(F_CPU, hpvm::CPU_TARGET); + N->addGenFunc(F_CPU, hpvm::CPU_TARGET, true); + + // Loop over the arguments, to create the VMap. + dest_iterator = F_CPU->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + // Add mapping and increment dest iterator + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + + // Iterate over children in topological order + for (DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); + ci != ce; ++ci) { + DFNode *C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + // Create calls to CPU function of child node + invokeChild(C, F_CPU, VMap, RI, C->getTag()); + } + + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); + // Generate code for output bindings + // Get Exit node + DFNode *C = N->getChildGraph()->getExit(); + // Get OutputType of this node + StructType *OutTy = N->getOutputType(); + Value *retVal = UndefValue::get(F_CPU->getReturnType()); + // Find all the input edges to exit node + for (unsigned i = 0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); + // Find the incoming edge at the requested input port + DFEdge *E = C->getInDFEdgeAt(i); + + assert(E && "No Binding for output element!"); + // Find the Source DFNode associated with the incoming edge + DFNode *SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() + << "\n"); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value *inputVal; + if (SrcDF->isEntryNode()) { + inputVal = getArgumentAt(F_CPU, i); + DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); + } else { + if (SrcDF->getTag() == hpvm::CPU_TARGET) { + // edge is from a internal node + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) && + "Source node call not found. Dependency violation!"); + + // Find Output Value associated with the Source DFNode using OutputMap + Value *CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI + << "\n"); + ExtractValueInst *EI = + ExtractValueInst::Create(CI, IndexList, "", RI); + inputVal = EI; + } else if (SrcDF->getTag() == hpvm::EPOCHS_TARGET) { + // TODO: Check how to handle internal node source + // Assumption: source can only be a leaf node + assert(isa<DFLeafNode>(SrcDF) && + "Source can only be a leaf node for now!"); + DFLeafNode *SrcLeaf = dyn_cast<DFLeafNode>(SrcDF); + + // Issue the call to finish() + finishNodeEPOCHS(SrcLeaf, RI, F_CPU); + + // Get the inputVal + auto OutPos = E->getSourcePosition(); + // The returnArgMap tells us which argument in the leaf node + // corresponds to this output. We need to get the corresponding + // argument in the wrapper. + auto LeafArgNum = returnArgMap[SrcLeaf][OutPos]; + auto *InEdge = SrcLeaf->getInDFEdgeAt(LeafArgNum); + auto ParentArgNum = InEdge->getSourcePosition(); + inputVal = getArgumentAt(F_CPU, ParentArgNum); + // inputVal = getArgumentAt( + // F_CPU, returnArgMap[SrcLeaf][E->getSourcePosition()]); + } + } + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + } + DEBUG(errs() << "Extracted all\n"); + retVal->setName("output"); + ReturnInst *newRI = ReturnInst::Create(F_CPU->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); + + // Increment the dagid + IRBuilder<> Builder(newRI); + auto *DagID = Builder.CreateLoad(DagIDGlobal, "DagID"); + auto *Increment = Builder.CreateAdd( + DagID, ConstantInt::get(Type::getInt32Ty(M.getContext()), 1), + "DagID.increment"); + Builder.CreateStore(Increment, DagIDGlobal); + } + + //-------------------------------------------------------------------------// + // Here, we need to check if this node (N) has more than one versions + // If so, we query the policy and have a call to each version + // If not, we see which version exists, check that it is in fact an cpu + // function and save it as the CPU_TARGET function + + // TODO: hpvm_id per node, so we can use this for id for policies + // For now, use node function name and change it later + Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); + + bool CFcpu = N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET); + bool GFcpu = N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET); + + DEBUG(errs() << "Before editing\n"); + DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " + << N->getTag() << "\n"); + DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n"); + DEBUG(errs() << "hascpuGenFuncForCPU : " << CFcpu << "\n"); + DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n"); + DEBUG(errs() << "hascpuGenFuncForGPU : " << GFcpu << "\n"); + + if (N->getTag() == hpvm::None) { + // No code is available for this node. This (usually) means that this + // node is a node that + // - from the accelerator backends has been mapped to an intermediate + // node, and thus they have not produced a genFunc + // - a child node had no CPU hint, thus no code gen for CPU could + // take place + DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node " + << N->getFuncPointer()->getName() << "\n"); + } else if (hpvmUtils::isSingleTargetTag(N->getTag())) { + // There is a single version for this node according to code gen hints. + // Therefore, we do not need to check the policy, we simply use the + // available implementation, whichever target it is for. + + // Sanity check - to be removed TODO + switch (N->getTag()) { + case hpvm::CPU_TARGET: + assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) && ""); + assert(N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(hpvm::GPU_TARGET)) && ""); + assert(!(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET)) && ""); + break; + case hpvm::GPU_TARGET: + assert(!(N->getGenFuncForTarget(hpvm::CPU_TARGET)) && ""); + assert(!(N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) && ""); + assert(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET) && ""); + break; + default: + assert(false && "Unreachable: we checked that tag was single target!\n"); + break; + } + + N->addGenFunc(N->getGenFuncForTarget(N->getTag()), hpvm::CPU_TARGET, true); + N->removeGenFuncForTarget(hpvm::GPU_TARGET); + N->setTag(hpvm::CPU_TARGET); + + // Sanity checks - to be removed TODO + CF = N->getGenFuncForTarget(hpvm::CPU_TARGET); + GF = N->getGenFuncForTarget(hpvm::GPU_TARGET); + + CFcpu = N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET); + GFcpu = N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET); + + DEBUG(errs() << "After editing\n"); + DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag " + << N->getTag() << "\n"); + DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n"); + DEBUG(errs() << "hascpuGenFuncForCPU : " << CFcpu << "\n"); + DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n"); + DEBUG(errs() << "hascpuGenFuncForGPU : " << GFcpu << "\n"); + + } else { + assert(false && "Multiple tags unsupported!"); + } +} + +// Code generation for leaf nodes +void CGT_EPOCHS::codeGen(DFLeafNode *N) { + // Skip code generation if it is a dummy node + if (N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Allocation nodes are skipped + if (N->isAllocationNode()) { + DEBUG(errs() << "Skipping allocation node\n"); + return; + } + + // If this is a regular CPU node, perform regular CPU codegen. + if (N->getTargetHint() == hpvm::CPU_TARGET) { + if (N->getGenFuncForTarget(hpvm::CPU_TARGET) != NULL) { + DEBUG(errs() << "Already generated CPU code for this node!\n"); + return; + } + // assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL && + // "Error: Visiting a node for which code already generated\n"); + + std::vector<IntrinsicInst *> IItoRemove; + std::vector<std::pair<IntrinsicInst *, Value *>> IItoReplace; + BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; + + // Get the function associated woth the dataflow node + Function *F = N->getFuncPointer(); + DEBUG(errs() << "Generating CPU code for function " << F->getName() + << "\n"); + + // Clone the function, if we are seeing this function for the first time. + Function *F_CPU; + ValueToValueMapTy VMap; + F_CPU = CloneFunction(F, VMap); + F_CPU->removeFromParent(); + // Insert the cloned function into the module + M.getFunctionList().push_back(F_CPU); + + // Add the new argument to the argument list. Add arguments only if the cild + // graph of parent node is not streaming + if (!N->getParent()->isChildGraphStreaming()) + F_CPU = addIdxDimArgs(F_CPU); + + // Add generated function info to DFNode + // N->setGenFunc(F_CPU, hpvm::CPU_TARGET); + N->addGenFunc(F_CPU, hpvm::CPU_TARGET, true); + + // Go through the arguments, and any pointer arguments with in attribute + // need to have cpu_argument_ptr call to get the cpu ptr of the argument + // Insert these calls in a new BB which would dominate all other BBs + // Create new BB + BasicBlock *EntryBB = &*F_CPU->begin(); + BasicBlock *BB = + BasicBlock::Create(M.getContext(), "getHPVMPtrArgs", F_CPU, EntryBB); + BranchInst *Terminator = BranchInst::Create(EntryBB, BB); + // Insert calls + for (Function::arg_iterator ai = F_CPU->arg_begin(), ae = F_CPU->arg_end(); + ai != ae; ++ai) { + if (F_CPU->getAttributes().hasAttribute(ai->getArgNo() + 1, + Attribute::In)) { + assert(ai->getType()->isPointerTy() && + "Only pointer arguments can have hpvm in/out attributes "); + Function::arg_iterator aiNext = ai; + ++aiNext; + Argument *size = &*aiNext; + assert(size->getType() == Type::getInt64Ty(M.getContext()) && + "Next argument after a pointer should be an i64 type"); + CastInst *BI = BitCastInst::CreatePointerCast( + &*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr", + Terminator); + Value *ArgPtrCallArgs[] = {BI, size}; + CallInst::Create(llvm_hpvm_cpu_argument_ptr, + ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator); + } + } + DEBUG(errs() << *BB << "\n"); + + // Go through all the instructions + for (inst_iterator i = inst_begin(F_CPU), e = inst_end(F_CPU); i != e; + ++i) { + Instruction *I = &(*i); + DEBUG(errs() << *I << "\n"); + // Leaf nodes should not contain HPVM graph intrinsics or launch + assert(!BuildDFG::isHPVMLaunchIntrinsic(I) && + "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isHPVMGraphIntrinsic(I) && + "HPVM graph intrinsic within a leaf dataflow node!"); + + if (BuildDFG::isHPVMQueryIntrinsic(I)) { + IntrinsicInst *II = cast<IntrinsicInst>(I); + IntrinsicInst *ArgII; + DFNode *ArgDFNode; + + /*********************************************************************** + * Handle HPVM Query intrinsics * + ***********************************************************************/ + switch (II->getIntrinsicID()) { + /**************************** llvm.hpvm.getNode() *******************/ + case Intrinsic::hpvm_getNode: { + // add mapping <intrinsic, this node> to the node-specific map + Leaf_HandleToDFNodeMap[II] = N; + IItoRemove.push_back(II); + break; + } + /************************* llvm.hpvm.getParentNode() ****************/ + case Intrinsic::hpvm_getParentNode: { + // get the parent node of the arg node + // get argument node + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + // get the parent node of the arg node + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // Add mapping <intrinsic, parent node> to the node-specific map + // the argument node must have been added to the map, orelse the + // code could not refer to it + Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); + IItoRemove.push_back(II); + break; + } + /*************************** llvm.hpvm.getNumDims() *****************/ + case Intrinsic::hpvm_getNumDims: { + // get node from map + // get the appropriate field + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim(); + IntegerType *IntTy = Type::getInt32Ty(M.getContext()); + ConstantInt *numOfDimConstant = + ConstantInt::getSigned(IntTy, (int64_t)numOfDim); + + II->replaceAllUsesWith(numOfDimConstant); + IItoRemove.push_back(II); + break; + } + /*********************** llvm.hpvm.getNodeInstanceID() **************/ + case Intrinsic::hpvm_getNodeInstanceID_x: + case Intrinsic::hpvm_getNodeInstanceID_y: + case Intrinsic::hpvm_getNodeInstanceID_z: { + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) && + "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = + (int)(II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x); + assert( + (dim >= 0) && (dim < 3) && + "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic " + "ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_CPU + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_CPU = F_CPU->getFunctionType()->getNumParams(); + assert((numParamsF_CPU - numParamsF == 6) && + "Difference of arguments between function and its clone is " + "not 6!"); + + if (parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 + (3 - dim); + // Traverse argument list of F_CPU in reverse order to find the + // correct index or dim argument. + Argument *indexVal = getArgumentFromEnd(F_CPU, offset); + assert(indexVal && "Index argument not found. Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *indexVal << "\n"); + + II->replaceAllUsesWith(indexVal); + IItoRemove.push_back(II); + } else { + // Case when query is for an ancestor + Value *args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), + parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; + CallInst *CI = CallInst::Create(llvm_hpvm_cpu_getDimInstance, + ArrayRef<Value *>(args, 2), + "nodeInstanceID", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + break; + } + /********************** llvm.hpvm.getNumNodeInstances() *************/ + case Intrinsic::hpvm_getNumNodeInstances_x: + case Intrinsic::hpvm_getNumNodeInstances_y: + case Intrinsic::hpvm_getNumNodeInstances_z: { + + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) && + "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = (int)(II->getIntrinsicID() - + Intrinsic::hpvm_getNumNodeInstances_x); + assert((dim >= 0) && (dim < 3) && + "Invalid dimension for getNumNodeInstances_[xyz]. Check " + "Intrinsic ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_CPU + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_CPU = F_CPU->getFunctionType()->getNumParams(); + assert((numParamsF_CPU - numParamsF == 6) && + "Difference of arguments between function and its clone is " + "not 6!"); + + if (parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 - dim; + // Traverse argument list of F_CPU in reverse order to find the + // correct index or dim argument. + Argument *limitVal = getArgumentFromEnd(F_CPU, offset); + assert(limitVal && "Limit argument not found. Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *limitVal << "\n"); + + II->replaceAllUsesWith(limitVal); + IItoRemove.push_back(II); + } else { + // Case when query is from the ancestor + Value *args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), + parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)}; + CallInst *CI = CallInst::Create(llvm_hpvm_cpu_getDimLimit, + ArrayRef<Value *>(args, 2), + "numNodeInstances", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + + break; + } + default: + DEBUG(errs() << "Found unknown intrinsic with ID = " + << II->getIntrinsicID() << "\n"); + assert(false && "Unknown HPVM Intrinsic!"); + break; + } + + } else if (BuildDFG::isHPVMIntrinsic(I)) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (II->getIntrinsicID() == Intrinsic::hpvm_nz_loop) { + IItoRemove.push_back(II); + } + } + } + + // Remove them in reverse order + for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin(); + i != IItoRemove.end(); ++i) { + (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType())); + (*i)->eraseFromParent(); + } + + DEBUG(errs() << *F_CPU); + + } else if (N->getTargetHint() == hpvm::EPOCHS_TARGET) { + // Do codegen for EPOCHS scheduler + // Only if the task hasn't been registered yet. + IRBuilder<> Builder(InitCall); + + // Get Task Info + int TaskID = N->getTaskID(); + TaskInfo &TI = getTaskInfo(TaskID); + // Check if task hasn't already been registered, then register it + if (RegisteredTasks.find(TI) == RegisteredTasks.end()) { + std::vector<Value *> RegisterTaskArgs; + + // Load the Schedule Pointer + auto *SchedulerPtr = Builder.CreateLoad(SchedulerHandleAddress); + DEBUG(errs() << *SchedulerPtr << "\n"); + auto *SchedulerPtrBC = Builder.CreateBitCast( + SchedulerPtr, Type::getInt8PtrTy(M.getContext()), "scheduler_handle"); + DEBUG(errs() << *SchedulerPtrBC << "\n"); + RegisterTaskArgs.push_back(SchedulerPtrBC); + + std::string TaskName = TI.TaskName; + Value *TaskNameStrPtr = Builder.CreateGlobalStringPtr(TaskName); + DEBUG(errs() << TaskName << ": " << *TaskNameStrPtr << "\n"); + RegisterTaskArgs.push_back(TaskNameStrPtr); + + std::string TaskDescription = TI.TaskDescription; + Value *TaskDescriptionStrPtr = + Builder.CreateGlobalStringPtr(TaskDescription); + DEBUG(errs() << TaskDescription << ": " << *TaskDescriptionStrPtr + << "\n"); + RegisterTaskArgs.push_back(TaskDescriptionStrPtr); + + FunctionCallee TaskSetupFunction = getTaskSetupFunction(TI); + Value *TaskSetupFuncPtr = TaskSetupFunction.getCallee(); + DEBUG(errs() << *TaskSetupFuncPtr << "\n"); + RegisterTaskArgs.push_back(TaskSetupFuncPtr); + + FunctionCallee TaskFinishExecutionFunction = + getTaskFinishExecutionFunction(TI); + Value *TaskFinishExecutionFuncPtr = + TaskFinishExecutionFunction.getCallee(); + DEBUG(errs() << *TaskFinishExecutionFuncPtr << "\n"); + RegisterTaskArgs.push_back(TaskFinishExecutionFuncPtr); + + FunctionCallee TaskAutoFinishFunction = getTaskAutoFinishFunction(TI); + Value *TaskAutoFinishFuncPtr = TaskAutoFinishFunction.getCallee(); + DEBUG(errs() << *TaskAutoFinishFuncPtr << "\n"); + RegisterTaskArgs.push_back(TaskAutoFinishFuncPtr); + + FunctionCallee TaskPrintFunction = getTaskPrintFunction(TI); + Value *TaskPrintFuncPtr = TaskPrintFunction.getCallee(); + DEBUG(errs() << *TaskPrintFuncPtr << "\n"); + RegisterTaskArgs.push_back(TaskPrintFuncPtr); + + FunctionCallee TaskRunStatsFunction = getTaskRunStatsFunction(TI); + Value *TaskRunStatsFuncPtr = TaskRunStatsFunction.getCallee(); + DEBUG(errs() << *TaskRunStatsFuncPtr << "\n"); + RegisterTaskArgs.push_back(TaskRunStatsFuncPtr); + + Value *NumAccels = ConstantInt::get(Type::getInt32Ty(M.getContext()), + TI.getNumDevices()); + DEBUG(errs() << *NumAccels << "\n"); + RegisterTaskArgs.push_back(NumAccels); + + for (auto TDI : TI.Devices) { + hpvm::EPOCHS_DEVICES TaskDevice = TDI.TaskDeviceTy; + Value *TaskDeviceInt = + ConstantInt::get(Type::getInt64Ty(M.getContext()), TaskDevice); + + FunctionCallee TaskDeviceExecutionFunction = + getTaskDeviceFunction(TI, TaskDevice); + Value *TaskDeviceExecutionFuncPtr = + TaskDeviceExecutionFunction.getCallee(); + + DEBUG(errs() << *TaskDeviceInt << ": " << *TaskDeviceExecutionFuncPtr + << "\n"); + RegisterTaskArgs.push_back(TaskDeviceInt); + RegisterTaskArgs.push_back(TaskDeviceExecutionFuncPtr); + } + + DEBUG(errs() << "Calling: " << *register_task_type.getCallee() + << "\n\tType: " << *register_task_type.getFunctionType() + << "\n"); + + CallInst *RegisterTaskInst = Builder.CreateCall( + register_task_type, ArrayRef<Value *>(RegisterTaskArgs), + "reg_task_" + TaskName); + DEBUG(errs() << *RegisterTaskInst << "\n"); + + auto *TaskHandleAddr = new GlobalVariable( + M, RegisterTaskInst->getType(), //->getPointerTo(), + false, GlobalValue::CommonLinkage, + Constant::getNullValue(RegisterTaskInst->getType()), + "task_" + TaskName + ".addr"); + DEBUG(errs() << *TaskHandleAddr << "\n"); + auto *SI = Builder.CreateStore(RegisterTaskInst, TaskHandleAddr); + DEBUG(errs() << *SI << "\n"); + TI.TaskHandleAddr = TaskHandleAddr; + RegisteredTasks.insert(TI); + } + + TaskHandleMap[N] = TI; + + N->addGenFunc(NULL, hpvm::EPOCHS_TARGET, true); + + // Build the returnArgMap to handle the edges and insert finish() calls + Function *F = N->getFuncPointer(); + StructType *FRetTy = cast<StructType>(F->getReturnType()); + std::vector<ReturnInst *> RIvec; + findReturnInst(F, RIvec); + assert(RIvec.size() < 2 && "Cannot have more than 1 return!"); + + ReturnInst *RI = RIvec[0]; + + Value *RV = RI->getReturnValue(); + DEBUG(errs() << *RV << "\n"); + assert(isa<InsertValueInst>(RV) && + "Expecting an InsertValueInst for the return"); + InsertValueInst *IVI = dyn_cast<InsertValueInst>(RV); + Value *insertedValue = IVI->getInsertedValueOperand(); + if (Argument *insertedArgument = dyn_cast<Argument>(insertedValue)) { + DEBUG(errs() << "Returned value is an argument! Treat it differently!\n"); + assert(IVI->getIndices().size() == 1 && + "Invalid indeces, expecting only 1."); + unsigned index = IVI->getIndices()[0]; + returnArgMap[N][index] = insertedArgument->getArgNo(); + } else { + llvm_unreachable("Unhandled return value!\n"); + } + for (unsigned i = 1; i < FRetTy->getNumElements(); ++i) { + RV = IVI->getAggregateOperand(); + assert(isa<InsertValueInst>(RV) && + "Expecting an InsertValueInst for the return"); + IVI = dyn_cast<InsertValueInst>(RV); + insertedValue = IVI->getInsertedValueOperand(); + if (Argument *insertedArgument = dyn_cast<Argument>(insertedValue)) { + DEBUG( + errs() << "Returned value is an argument! Treat it differently!\n"); + assert(IVI->getIndices().size() == 1 && + "Invalid indeces, expecting only 1."); + unsigned index = IVI->getIndices()[0]; + returnArgMap[N][index] = insertedArgument->getArgNo(); + } else { + llvm_unreachable("Unhandled return value!\n"); + } + } + + } else { + // if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) { + // + // Do we need this??? + DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() + << " : skipping it\n"); + + switch (N->getTag()) { + case hpvm::GPU_TARGET: + // A leaf node should not have an cpu function for GPU + // by design of DFG2LLVM_OpenCL backend + assert(!(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET)) && + "Leaf node not expected to have GPU GenFunc"); + break; + default: + break; + } + + // return; + //} + } +} + +// Helper function, populate a vector with all return statements in a function +static void findReturnInst(Function *F, + std::vector<ReturnInst *> &ReturnInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + ReturnInst *RI = dyn_cast<ReturnInst>(I); + if (RI) { + ReturnInstVec.push_back(RI); + } + } +} +// Read the task details described in the Task configuration file +// and populate the TaskTypes vector. +void CGT_EPOCHS::parseTaskFile(std::string TaskPath) { + DEBUG(errs() << "parseTaskFile invoked on file path: " << TaskPath << "\n"); + std::ifstream TaskFile(TaskPath); + + TaskInfo TI; + TaskDeviceInfo TDI; + + bool isTIEmpty = true; + + for (std::string line; getline(TaskFile, line);) { + // std::cout << "Line: "<<line<<"\n"; + + std::vector<std::string> Tokens; + + strSplit(line, ' ', Tokens); + + if (Tokens.empty()) + continue; + + if (Tokens[0] == "TaskID") { + if (!isTIEmpty) + TaskTypes.push_back(TI); + + TI.clearTaskInfo(); + assert(Tokens.size() == 2 && + "TaskID in Config File must provide integer TaskID"); + + TI.TaskID = std::stoi(Tokens[1]); + isTIEmpty = false; + + } else if (Tokens[0] == "TaskName") { + assert(Tokens.size() == 2 && + "TaskName in Config File must provide string TaskName"); + + TI.TaskName = Tokens[1]; + + } else if (Tokens[0] == "TaskDescription") { + assert( + Tokens.size() >= 2 && + "TaskDescription in Config File must provide string TaskDescription"); + + std::string Description = ""; + for (unsigned i = 1; i < Tokens.size(); i++) { + Description = Description + Tokens[i] + " "; + } + + TI.TaskDescription = Description; + + } else if (Tokens[0] == "TaskFinishExecution") { + assert(Tokens.size() == 2 && "TaskFinishExecution in Config File must " + "provide string Task Finish function name"); + + TI.TaskFinishExecution = Tokens[1]; + + } else if (Tokens[0] == "TaskAutoFinish") { + assert(Tokens.size() == 2 && "TaskAutoFinish in Config File must provide " + "string Task AutoFinish function name"); + + TI.TaskAutoFinish = Tokens[1]; + + } else if (Tokens[0] == "TaskSetup") { + assert(Tokens.size() == 2 && "TaskSetup in Config File must provide " + "string Task Setup function name"); + + TI.TaskSetup = Tokens[1]; + + } else if (Tokens[0] == "TaskPrint") { + assert(Tokens.size() == 2 && "TaskPrint in Config File must provide " + "string Task Print function name"); + + TI.TaskPrint = Tokens[1]; + + } else if (Tokens[0] == "TaskRunStats") { + assert(Tokens.size() == 2 && "TaskRunStats in Config File must provide " + "string Task Run Stats function name"); + + TI.TaskRunStats = Tokens[1]; + + } else if (Tokens[0].rfind("TaskDevice") == 0) { + assert(Tokens.size() == 2 && + "TaskDevice* in Config File must provide string Task Device name"); + + TDI.clearTaskDeviceInfo(); + + // Check which device Ty + hpvm::EPOCHS_DEVICES DevTy = hpvm::CPU_ACCEL; + + if (Tokens[1] == "CPU_ACCEL") { + DevTy = hpvm::CPU_ACCEL; + } else if (Tokens[1] == "1D_FFT_ACCEL") { + DevTy = hpvm::OneD_FFT_ACCEL; + } else if (Tokens[1] == "CV_CNN_ACCEL") { + DevTy = hpvm::CV_CNN_ACCEL; + } else if (Tokens[1] == "VITDEC_ACCEL") { + DevTy = hpvm::VITDEC_ACCEL; + } else { + assert(false && "Unknown Device Type encountered in Task Config file"); + } + + TDI.TaskDeviceTy = DevTy; + + } else if (Tokens[0].rfind("TaskExecuteOnDevice") == 0) { + assert(Tokens.size() == 2 && + "TaskExecuteOnDevice* in Config File must provide string Task " + "Execute on Device function name"); + TDI.TaskDeviceFn = Tokens[1]; + TI.Devices.push_back(TDI); + + } else { + assert(false && "Unknown Token Type!"); + } + } + if (!isTIEmpty) { + TaskTypes.push_back(TI); + } +} + +TaskInfo &CGT_EPOCHS::getTaskInfo(int TaskID) { + for (TaskInfo &TI : TaskTypes) { + if (TI.TaskID == TaskID) { + return TI; + } + } + + llvm_unreachable("Requested TaskInfo with unknown TaskID"); +} + +FunctionCallee CGT_EPOCHS::getTaskSetupFunction(TaskInfo &TI) { + return getTaskFunctionHelper(TI.TaskSetup); +} + +FunctionCallee CGT_EPOCHS::getTaskPrintFunction(TaskInfo &TI) { + return getTaskFunctionHelper(TI.TaskPrint); +} + +FunctionCallee CGT_EPOCHS::getTaskRunStatsFunction(TaskInfo &TI) { + return getTaskFunctionHelper(TI.TaskRunStats); +} + +FunctionCallee CGT_EPOCHS::getTaskDeviceFunction(TaskInfo &TI, + hpvm::EPOCHS_DEVICES DevTy) { + + std::string DeviceFunctionName = ""; + + for (TaskDeviceInfo TDI : TI.Devices) { + if (TDI.TaskDeviceTy == DevTy) { + DeviceFunctionName = TDI.TaskDeviceFn; + break; + } + } + + assert(DeviceFunctionName != "" && "Device function name not found!"); + + return getTaskFunctionHelper(DeviceFunctionName); +} + +FunctionCallee CGT_EPOCHS::getTaskAutoFinishFunction(TaskInfo &TI) { + return getTaskFunctionHelper(TI.TaskAutoFinish); +} + +FunctionCallee CGT_EPOCHS::getTaskFinishExecutionFunction(TaskInfo &TI) { + return getTaskFunctionHelper(TI.TaskFinishExecution); +} + +FunctionCallee CGT_EPOCHS::getTaskFunctionHelper(std::string TaskFunctionName) { + Function *TaskFn; + + DEBUG(errs() << "Requesting Task Function " << TaskFunctionName << " ...\n"); + if (!(TaskFn = M.getFunction(TaskFunctionName))) { + TaskFn = TaskModule->getFunction(TaskFunctionName); + + assert(TaskFn && "Requested Task function does not exist in task module"); + } + + return M.getOrInsertFunction(TaskFunctionName, TaskFn->getFunctionType()); +} + +} // namespace dfg2llvm + +char DFG2LLVM_EPOCHS::ID = 0; +static RegisterPass<DFG2LLVM_EPOCHS> + X("dfg2llvm-epochs", "Dataflow Graph to LLVM for EPOCHS backend", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.exports b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/DFG2LLVM_EPOCHS.exports new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt new file mode 100644 index 0000000000000000000000000000000000000000..c620bd6030aee7e3742c09b7213baad9626a0ac5 --- /dev/null +++ b/hpvm/lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/DFG2LLVM_EPOCHS/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_EPOCHS +parent = Transforms diff --git a/hpvm/llvm_patches/include/llvm/Analysis/DDG.h b/hpvm/llvm_patches/include/llvm/Analysis/DDG.h deleted file mode 100644 index 165efc97a480e06b8ea5e8031e97da469d1f6418..0000000000000000000000000000000000000000 --- a/hpvm/llvm_patches/include/llvm/Analysis/DDG.h +++ /dev/null @@ -1,623 +0,0 @@ -//===- llvm/Analysis/DDG.h --------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the Data-Dependence Graph (DDG). -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ANALYSIS_DDG_H -#define LLVM_ANALYSIS_DDG_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DirectedGraph.h" -#include "llvm/Analysis/DependenceAnalysis.h" -#include "llvm/Analysis/DependenceGraphBuilder.h" -#include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Support/GraphWriter.h" -#include <sstream> - - -namespace llvm { -class DDGNode; -class DDGEdge; -using DDGNodeBase = DGNode<DDGNode, DDGEdge>; -using DDGEdgeBase = DGEdge<DDGNode, DDGEdge>; -using DDGBase = DirectedGraph<DDGNode, DDGEdge>; -class LPMUpdater; - -/// Data Dependence Graph Node -/// The graph can represent the following types of nodes: -/// 1. Single instruction node containing just one instruction. -/// 2. Multiple instruction node where two or more instructions from -/// the same basic block are merged into one node. -/// 3. Pi-block node which is a group of other DDG nodes that are part of a -/// strongly-connected component of the graph. -/// A pi-block node contains more than one single or multiple instruction -/// nodes. The root node cannot be part of a pi-block. -/// 4. Root node is a special node that connects to all components such that -/// there is always a path from it to any node in the graph. -class DDGNode : public DDGNodeBase { -public: - using InstructionListType = SmallVectorImpl<Instruction *>; - - enum class NodeKind { - Unknown, - SingleInstruction, - MultiInstruction, - PiBlock, - Root, - }; - - DDGNode() = delete; - DDGNode(const NodeKind K) : DDGNodeBase(), Kind(K) {} - DDGNode(const DDGNode &N) : DDGNodeBase(N), Kind(N.Kind) {} - DDGNode(DDGNode &&N) : DDGNodeBase(std::move(N)), Kind(N.Kind) {} - virtual ~DDGNode() = 0; - - DDGNode &operator=(const DDGNode &N) { - DGNode::operator=(N); - Kind = N.Kind; - return *this; - } - - DDGNode &operator=(DDGNode &&N) { - DGNode::operator=(std::move(N)); - Kind = N.Kind; - return *this; - } - - /// Getter for the kind of this node. - NodeKind getKind() const { return Kind; } - - /// Collect a list of instructions, in \p IList, for which predicate \p Pred - /// evaluates to true when iterating over instructions of this node. Return - /// true if at least one instruction was collected, and false otherwise. - bool collectInstructions(llvm::function_ref<bool(Instruction *)> const &Pred, - InstructionListType &IList) const; - -protected: - /// Setter for the kind of this node. - void setKind(NodeKind K) { Kind = K; } - -private: - NodeKind Kind; -}; - -/// Subclass of DDGNode representing the root node of the graph. -/// There should only be one such node in a given graph. -class RootDDGNode : public DDGNode { -public: - RootDDGNode() : DDGNode(NodeKind::Root) {} - RootDDGNode(const RootDDGNode &N) = delete; - RootDDGNode(RootDDGNode &&N) : DDGNode(std::move(N)) {} - ~RootDDGNode() {} - - /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc. - static bool classof(const DDGNode *N) { - return N->getKind() == NodeKind::Root; - } - static bool classof(const RootDDGNode *N) { return true; } -}; - -/// Subclass of DDGNode representing single or multi-instruction nodes. -class SimpleDDGNode : public DDGNode { - friend class DDGBuilder; - -public: - SimpleDDGNode() = delete; - SimpleDDGNode(Instruction &I); - SimpleDDGNode(const SimpleDDGNode &N); - SimpleDDGNode(SimpleDDGNode &&N); - ~SimpleDDGNode(); - - SimpleDDGNode &operator=(const SimpleDDGNode &N) { - DDGNode::operator=(N); - InstList = N.InstList; - return *this; - } - - SimpleDDGNode &operator=(SimpleDDGNode &&N) { - DDGNode::operator=(std::move(N)); - InstList = std::move(N.InstList); - return *this; - } - - /// Get the list of instructions in this node. - const InstructionListType &getInstructions() const { - assert(!InstList.empty() && "Instruction List is empty."); - return InstList; - } - InstructionListType &getInstructions() { - return const_cast<InstructionListType &>( - static_cast<const SimpleDDGNode *>(this)->getInstructions()); - } - - /// Get the first/last instruction in the node. - Instruction *getFirstInstruction() const { return getInstructions().front(); } - Instruction *getLastInstruction() const { return getInstructions().back(); } - - /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc. - static bool classof(const DDGNode *N) { - return N->getKind() == NodeKind::SingleInstruction || - N->getKind() == NodeKind::MultiInstruction; - } - static bool classof(const SimpleDDGNode *N) { return true; } - -private: - /// Append the list of instructions in \p Input to this node. - void appendInstructions(const InstructionListType &Input) { - setKind((InstList.size() == 0 && Input.size() == 1) - ? NodeKind::SingleInstruction - : NodeKind::MultiInstruction); - InstList.insert(InstList.end(), Input.begin(), Input.end()); - } - void appendInstructions(const SimpleDDGNode &Input) { - appendInstructions(Input.getInstructions()); - } - - /// List of instructions associated with a single or multi-instruction node. - SmallVector<Instruction *, 2> InstList; -}; - -/// Subclass of DDGNode representing a pi-block. A pi-block represents a group -/// of DDG nodes that are part of a strongly-connected component of the graph. -/// Replacing all the SCCs with pi-blocks results in an acyclic representation -/// of the DDG. For example if we have: -/// {a -> b}, {b -> c, d}, {c -> a} -/// the cycle a -> b -> c -> a is abstracted into a pi-block "p" as follows: -/// {p -> d} with "p" containing: {a -> b}, {b -> c}, {c -> a} -class PiBlockDDGNode : public DDGNode { -public: - using PiNodeList = SmallVector<DDGNode *, 4>; - - PiBlockDDGNode() = delete; - PiBlockDDGNode(const PiNodeList &List); - PiBlockDDGNode(const PiBlockDDGNode &N); - PiBlockDDGNode(PiBlockDDGNode &&N); - ~PiBlockDDGNode(); - - PiBlockDDGNode &operator=(const PiBlockDDGNode &N) { - DDGNode::operator=(N); - NodeList = N.NodeList; - return *this; - } - - PiBlockDDGNode &operator=(PiBlockDDGNode &&N) { - DDGNode::operator=(std::move(N)); - NodeList = std::move(N.NodeList); - return *this; - } - - /// Get the list of nodes in this pi-block. - const PiNodeList &getNodes() const { - assert(!NodeList.empty() && "Node list is empty."); - return NodeList; - } - PiNodeList &getNodes() { - return const_cast<PiNodeList &>( - static_cast<const PiBlockDDGNode *>(this)->getNodes()); - } - - /// Define classof to be able to use isa<>, cast<>, dyn_cast<>, etc. - static bool classof(const DDGNode *N) { - return N->getKind() == NodeKind::PiBlock; - } - -private: - /// List of nodes in this pi-block. - PiNodeList NodeList; -}; - -/// Data Dependency Graph Edge. -/// An edge in the DDG can represent a def-use relationship or -/// a memory dependence based on the result of DependenceAnalysis. -/// A rooted edge connects the root node to one of the components -/// of the graph. -class DDGEdge : public DDGEdgeBase { -public: - /// The kind of edge in the DDG - enum class EdgeKind { - Unknown, - RegisterDefUse, - MemoryDependence, - Rooted, - Last = Rooted // Must be equal to the largest enum value. - }; - - explicit DDGEdge(DDGNode &N) = delete; - DDGEdge(DDGNode &N, EdgeKind K) : DDGEdgeBase(N), Kind(K) {} - DDGEdge(const DDGEdge &E) : DDGEdgeBase(E), Kind(E.getKind()) {} - DDGEdge(DDGEdge &&E) : DDGEdgeBase(std::move(E)), Kind(E.Kind) {} - DDGEdge &operator=(const DDGEdge &E) { - DDGEdgeBase::operator=(E); - Kind = E.Kind; - return *this; - } - - DDGEdge &operator=(DDGEdge &&E) { - DDGEdgeBase::operator=(std::move(E)); - Kind = E.Kind; - return *this; - } - - /// Get the edge kind - EdgeKind getKind() const { return Kind; }; - - /// Return true if this is a def-use edge, and false otherwise. - bool isDefUse() const { return Kind == EdgeKind::RegisterDefUse; } - - /// Return true if this is a memory dependence edge, and false otherwise. - bool isMemoryDependence() const { return Kind == EdgeKind::MemoryDependence; } - - /// Return true if this is an edge stemming from the root node, and false - /// otherwise. - bool isRooted() const { return Kind == EdgeKind::Rooted; } - -private: - EdgeKind Kind; -}; - -/// Encapsulate some common data and functionality needed for different -/// variations of data dependence graphs. -template <typename NodeType> class DependenceGraphInfo { -public: - using DependenceList = SmallVector<std::unique_ptr<Dependence>, 1>; - - DependenceGraphInfo() = delete; - DependenceGraphInfo(const DependenceGraphInfo &G) = delete; - DependenceGraphInfo(const std::string &N, const DependenceInfo &DepInfo) - : Name(N), DI(DepInfo), Root(nullptr) {} - DependenceGraphInfo(DependenceGraphInfo &&G) - : Name(std::move(G.Name)), DI(std::move(G.DI)), Root(G.Root) {} - virtual ~DependenceGraphInfo() {} - - /// Return the label that is used to name this graph. - const StringRef getName() const { return Name; } - - /// Return the root node of the graph. - NodeType &getRoot() const { - assert(Root && "Root node is not available yet. Graph construction may " - "still be in progress\n"); - return *Root; - } - - /// Collect all the data dependency infos coming from any pair of memory - /// accesses from \p Src to \p Dst, and store them into \p Deps. Return true - /// if a dependence exists, and false otherwise. - bool getDependencies(const NodeType &Src, const NodeType &Dst, - DependenceList &Deps) const; - -protected: - // Name of the graph. - std::string Name; - - // Store a copy of DependenceInfo in the graph, so that individual memory - // dependencies don't need to be stored. Instead when the dependence is - // queried it is recomputed using @DI. - const DependenceInfo DI; - - // A special node in the graph that has an edge to every connected component of - // the graph, to ensure all nodes are reachable in a graph walk. - NodeType *Root = nullptr; -}; - -//===--------------------------------------------------------------------===// -// DependenceGraphInfo Implementation -//===--------------------------------------------------------------------===// - -template <typename NodeType> -bool DependenceGraphInfo<NodeType>::getDependencies( - const NodeType &Src, const NodeType &Dst, DependenceList &Deps) const { - assert(Deps.empty() && "Expected empty output list at the start."); - - // List of memory access instructions from src and dst nodes. - SmallVector<Instruction *, 8> SrcIList, DstIList; - auto isMemoryAccess = [](const Instruction *I) { - return I->mayReadOrWriteMemory(); - }; - Src.collectInstructions(isMemoryAccess, SrcIList); - Dst.collectInstructions(isMemoryAccess, DstIList); - - for (auto *SrcI : SrcIList) - for (auto *DstI : DstIList) - if (auto Dep = - const_cast<DependenceInfo *>(&DI)->depends(SrcI, DstI, true)) - Deps.push_back(std::move(Dep)); - - return !Deps.empty(); -} - -using DDGInfo = DependenceGraphInfo<DDGNode>; - -/// Data Dependency Graph -class DataDependenceGraph : public DDGBase, public DDGInfo { - friend AbstractDependenceGraphBuilder<DataDependenceGraph>; - friend class DDGBuilder; - -public: - using NodeType = DDGNode; - using EdgeType = DDGEdge; - - DataDependenceGraph() = delete; - DataDependenceGraph(const DataDependenceGraph &G) = delete; - DataDependenceGraph(DataDependenceGraph &&G) - : DDGBase(std::move(G)), DDGInfo(std::move(G)) {} - DataDependenceGraph(Function &F, DependenceInfo &DI); - DataDependenceGraph(Loop &L, LoopInfo &LI, DependenceInfo &DI); - ~DataDependenceGraph(); - - /// If node \p N belongs to a pi-block return a pointer to the pi-block, - /// otherwise return null. - const PiBlockDDGNode *getPiBlock(const NodeType &N) const; - -protected: - /// Add node \p N to the graph, if it's not added yet, and keep track of the - /// root node as well as pi-blocks and their members. Return true if node is - /// successfully added. - bool addNode(NodeType &N); - -private: - using PiBlockMapType = DenseMap<const NodeType *, const PiBlockDDGNode *>; - - /// Mapping from graph nodes to their containing pi-blocks. If a node is not - /// part of a pi-block, it will not appear in this map. - PiBlockMapType PiBlockMap; -}; - -/// Concrete implementation of a pure data dependence graph builder. This class -/// provides custom implementation for the pure-virtual functions used in the -/// generic dependence graph build algorithm. -/// -/// For information about time complexity of the build algorithm see the -/// comments near the declaration of AbstractDependenceGraphBuilder. -class DDGBuilder : public AbstractDependenceGraphBuilder<DataDependenceGraph> { -public: - DDGBuilder(DataDependenceGraph &G, DependenceInfo &D, - const BasicBlockListType &BBs) - : AbstractDependenceGraphBuilder(G, D, BBs) {} - DDGNode &createRootNode() final override { - auto *RN = new RootDDGNode(); - assert(RN && "Failed to allocate memory for DDG root node."); - Graph.addNode(*RN); - return *RN; - } - DDGNode &createFineGrainedNode(Instruction &I) final override { - auto *SN = new SimpleDDGNode(I); - assert(SN && "Failed to allocate memory for simple DDG node."); - Graph.addNode(*SN); - return *SN; - } - DDGNode &createPiBlock(const NodeListType &L) final override { - auto *Pi = new PiBlockDDGNode(L); - assert(Pi && "Failed to allocate memory for pi-block node."); - Graph.addNode(*Pi); - return *Pi; - } - DDGEdge &createDefUseEdge(DDGNode &Src, DDGNode &Tgt) final override { - auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::RegisterDefUse); - assert(E && "Failed to allocate memory for edge"); - Graph.connect(Src, Tgt, *E); - return *E; - } - DDGEdge &createMemoryEdge(DDGNode &Src, DDGNode &Tgt) final override { - auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::MemoryDependence); - assert(E && "Failed to allocate memory for edge"); - Graph.connect(Src, Tgt, *E); - return *E; - } - DDGEdge &createRootedEdge(DDGNode &Src, DDGNode &Tgt) final override { - auto *E = new DDGEdge(Tgt, DDGEdge::EdgeKind::Rooted); - assert(E && "Failed to allocate memory for edge"); - assert(isa<RootDDGNode>(Src) && "Expected root node"); - Graph.connect(Src, Tgt, *E); - return *E; - } - - const NodeListType &getNodesInPiBlock(const DDGNode &N) final override { - auto *PiNode = dyn_cast<const PiBlockDDGNode>(&N); - assert(PiNode && "Expected a pi-block node."); - return PiNode->getNodes(); - } - - /// Return true if the two nodes \pSrc and \pTgt are both simple nodes and - /// the consecutive instructions after merging belong to the same basic block. - bool areNodesMergeable(const DDGNode &Src, - const DDGNode &Tgt) const final override; - void mergeNodes(DDGNode &Src, DDGNode &Tgt) final override; - bool shouldSimplify() const final override; - bool shouldCreatePiBlocks() const final override; -}; - -raw_ostream &operator<<(raw_ostream &OS, const DDGNode &N); -raw_ostream &operator<<(raw_ostream &OS, const DDGNode::NodeKind K); -raw_ostream &operator<<(raw_ostream &OS, const DDGEdge &E); -raw_ostream &operator<<(raw_ostream &OS, const DDGEdge::EdgeKind K); -raw_ostream &operator<<(raw_ostream &OS, const DataDependenceGraph &G); - -//===--------------------------------------------------------------------===// -// DDG Analysis Passes -//===--------------------------------------------------------------------===// - -/// Analysis pass that builds the DDG for a loop. -class DDGAnalysis : public AnalysisInfoMixin<DDGAnalysis> { -public: - using Result = std::unique_ptr<DataDependenceGraph>; - Result run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR); - -private: - friend AnalysisInfoMixin<DDGAnalysis>; - static AnalysisKey Key; -}; - -/// Textual printer pass for the DDG of a loop. -class DDGAnalysisPrinterPass : public PassInfoMixin<DDGAnalysisPrinterPass> { -public: - explicit DDGAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); - -private: - raw_ostream &OS; -}; - - -//===--------------------------------------------------------------------===// -// GraphTraits specializations for the DDG -//===--------------------------------------------------------------------===// - - - -/// non-const versions of the grapth trait specializations for DDG -template <> struct GraphTraits<DDGNode *> { - using NodeRef = DDGNode *; - - static DDGNode *DDGGetTargetNode(DGEdge<DDGNode, DDGEdge> *P) { - return &P->getTargetNode(); - } - - // Provide a mapped iterator so that the GraphTrait-based implementations can - // find the target nodes without having to explicitly go through the edges. - using ChildIteratorType = - mapped_iterator<DDGNode::iterator, decltype(&DDGGetTargetNode)>; - using ChildEdgeIteratorType = DDGNode::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - static ChildIteratorType child_begin(NodeRef N) { - return ChildIteratorType(N->begin(), &DDGGetTargetNode); - } - static ChildIteratorType child_end(NodeRef N) { - return ChildIteratorType(N->end(), &DDGGetTargetNode); - } - - static ChildEdgeIteratorType child_edge_begin(NodeRef N) { - return N->begin(); - } - static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); } -}; - -template <> -struct GraphTraits<DataDependenceGraph *> : public GraphTraits<DDGNode *> { - using nodes_iterator = DataDependenceGraph::iterator; - static NodeRef getEntryNode(DataDependenceGraph *DG) { - return &DG->getRoot(); - } - static nodes_iterator nodes_begin(DataDependenceGraph *DG) { - return DG->begin(); - } - static nodes_iterator nodes_end(DataDependenceGraph *DG) { return DG->end(); } -}; - -template <> struct DOTGraphTraits<DataDependenceGraph*> : public DefaultDOTGraphTraits { - DOTGraphTraits (bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {} - - static std::string getGraphName(DataDependenceGraph *Graph) { return "DDG";} - - static std::string getGraphProperties(DataDependenceGraph *Graph) { - return "\tcompound=true;"; - } - - std::string getNodeLabel(DDGNode *Node, DataDependenceGraph *Graph) { - std::string Str; - raw_string_ostream ss(Str); - ss << Node << " : "; - switch(Node->getKind()) { - case DDGNode::NodeKind::Root : - ss << "Root Node\n"; - break; - case DDGNode::NodeKind::SingleInstruction : - case DDGNode::NodeKind::MultiInstruction : - { - ss << "Simple Node\n"; - SimpleDDGNode *SN = dyn_cast<SimpleDDGNode>(Node); - DDGNode::InstructionListType &instructions = SN->getInstructions(); - for (auto i : instructions) { - ss << *i << "\n"; - } - break; - } - case DDGNode::NodeKind::PiBlock : - { - ss << "Pi Block\n"; - PiBlockDDGNode *PBN = dyn_cast<PiBlockDDGNode>(Node); - PiBlockDDGNode::PiNodeList &nodes = PBN->getNodes(); - for (auto n : nodes) { - ss << n << "\n"; - } - break; - } - default : - ss << "Unknown\n"; - } - return ss.str(); - } - - std::string getEdgeLabel(DDGEdge *Edge, DataDependenceGraph *Graph) { - std::string Str; - raw_string_ostream ss(Str); - switch(Edge->getKind()) { - case DDGEdge::EdgeKind::RegisterDefUse : - ss << "def-use"; - break; - case DDGEdge::EdgeKind::MemoryDependence : - ss << "memory" ; - break; - default : - ss << ""; - } - - return ss.str(); - } -}; - -/// const versions of the grapth trait specializations for DDG -template <> struct GraphTraits<const DDGNode *> { - using NodeRef = const DDGNode *; - - static const DDGNode *DDGGetTargetNode(const DGEdge<DDGNode, DDGEdge> *P) { - return &P->getTargetNode(); - } - - // Provide a mapped iterator so that the GraphTrait-based implementations can - // find the target nodes without having to explicitly go through the edges. - using ChildIteratorType = - mapped_iterator<DDGNode::const_iterator, decltype(&DDGGetTargetNode)>; - using ChildEdgeIteratorType = DDGNode::const_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - static ChildIteratorType child_begin(NodeRef N) { - return ChildIteratorType(N->begin(), &DDGGetTargetNode); - } - static ChildIteratorType child_end(NodeRef N) { - return ChildIteratorType(N->end(), &DDGGetTargetNode); - } - - static ChildEdgeIteratorType child_edge_begin(NodeRef N) { - return N->begin(); - } - static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); } -}; - -template <> -struct GraphTraits<const DataDependenceGraph *> - : public GraphTraits<const DDGNode *> { - using nodes_iterator = DataDependenceGraph::const_iterator; - static NodeRef getEntryNode(const DataDependenceGraph *DG) { - return &DG->getRoot(); - } - static nodes_iterator nodes_begin(const DataDependenceGraph *DG) { - return DG->begin(); - } - static nodes_iterator nodes_end(const DataDependenceGraph *DG) { - return DG->end(); - } -}; - -} // namespace llvm - -#endif // LLVM_ANALYSIS_DDG_H diff --git a/hpvm/llvm_patches/include/llvm/Analysis/DependenceGraphBuilder.h b/hpvm/llvm_patches/include/llvm/Analysis/DependenceGraphBuilder.h deleted file mode 100644 index 6f4e1be94164f797e4fc053dac754f8e28839e9e..0000000000000000000000000000000000000000 --- a/hpvm/llvm_patches/include/llvm/Analysis/DependenceGraphBuilder.h +++ /dev/null @@ -1,203 +0,0 @@ -//===- llvm/Analysis/DependenceGraphBuilder.h -------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines a builder interface that can be used to populate dependence -// graphs such as DDG and PDG. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H -#define LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" -#include "llvm/ADT/SmallVector.h" - -namespace llvm { - -class BasicBlock; -class DependenceInfo; -class Instruction; - -/// This abstract builder class defines a set of high-level steps for creating -/// DDG-like graphs. The client code is expected to inherit from this class and -/// define concrete implementation for each of the pure virtual functions used -/// in the high-level algorithm. -template <class GraphType> class AbstractDependenceGraphBuilder { -protected: - using BasicBlockListType = SmallVectorImpl<BasicBlock *>; - -private: - using NodeType = typename GraphType::NodeType; - using EdgeType = typename GraphType::EdgeType; - -public: - using ClassesType = EquivalenceClasses<BasicBlock *>; - using NodeListType = SmallVector<NodeType *, 4>; - - AbstractDependenceGraphBuilder(GraphType &G, DependenceInfo &D, - const BasicBlockListType &BBs) - : Graph(G), DI(D), BBList(BBs) {} - virtual ~AbstractDependenceGraphBuilder() {} - - /// The main entry to the graph construction algorithm. It starts by - /// creating nodes in increasing order of granularity and then - /// adds def-use and memory edges. As one of the final stages, it - /// also creates pi-block nodes to facilitate codegen in transformations - /// that use dependence graphs. - /// - /// The algorithmic complexity of this implementation is O(V^2 * I^2), where V - /// is the number of vertecies (nodes) and I is the number of instructions in - /// each node. The total number of instructions, N, is equal to V * I, - /// therefore the worst-case time complexity is O(N^2). The average time - /// complexity is O((N^2)/2). - void populate() { - computeInstructionOrdinals(); - createFineGrainedNodes(); - createDefUseEdges(); - createMemoryDependencyEdges(); - simplify(); - createAndConnectRootNode(); - createPiBlocks(); - sortNodesTopologically(); - } - - /// Compute ordinal numbers for each instruction and store them in a map for - /// future look up. These ordinals are used to compute node ordinals which are - /// in turn used to order nodes that are part of a cycle. - /// Instruction ordinals are assigned based on lexical program order. - void computeInstructionOrdinals(); - - /// Create fine grained nodes. These are typically atomic nodes that - /// consist of a single instruction. - void createFineGrainedNodes(); - - /// Analyze the def-use chains and create edges from the nodes containing - /// definitions to the nodes containing the uses. - void createDefUseEdges(); - - /// Analyze data dependencies that exist between memory loads or stores, - /// in the graph nodes and create edges between them. - void createMemoryDependencyEdges(); - - /// Create a root node and add edges such that each node in the graph is - /// reachable from the root. - void createAndConnectRootNode(); - - /// Apply graph abstraction to groups of nodes that belong to a strongly - /// connected component of the graph to create larger compound nodes - /// called pi-blocks. The purpose of this abstraction is to isolate sets of - /// program elements that need to stay together during codegen and turn - /// the dependence graph into an acyclic graph. - void createPiBlocks(); - - /// Go through all the nodes in the graph and collapse any two nodes - /// 'a' and 'b' if all of the following are true: - /// - the only edge from 'a' is a def-use edge to 'b' and - /// - the only edge to 'b' is a def-use edge from 'a' and - /// - there is no cyclic edge from 'b' to 'a' and - /// - all instructions in 'a' and 'b' belong to the same basic block and - /// - both 'a' and 'b' are simple (single or multi instruction) nodes. - void simplify(); - - /// Topologically sort the graph nodes. - void sortNodesTopologically(); - -protected: - /// Create the root node of the graph. - virtual NodeType &createRootNode() = 0; - - /// Create an atomic node in the graph given a single instruction. - virtual NodeType &createFineGrainedNode(Instruction &I) = 0; - - /// Create a pi-block node in the graph representing a group of nodes in an - /// SCC of the graph. - virtual NodeType &createPiBlock(const NodeListType &L) = 0; - - /// Create a def-use edge going from \p Src to \p Tgt. - virtual EdgeType &createDefUseEdge(NodeType &Src, NodeType &Tgt) = 0; - - /// Create a memory dependence edge going from \p Src to \p Tgt. - virtual EdgeType &createMemoryEdge(NodeType &Src, NodeType &Tgt) = 0; - - /// Create a rooted edge going from \p Src to \p Tgt . - virtual EdgeType &createRootedEdge(NodeType &Src, NodeType &Tgt) = 0; - - /// Given a pi-block node, return a vector of all the nodes contained within - /// it. - virtual const NodeListType &getNodesInPiBlock(const NodeType &N) = 0; - - /// Deallocate memory of edge \p E. - virtual void destroyEdge(EdgeType &E) { delete &E; } - - /// Deallocate memory of node \p N. - virtual void destroyNode(NodeType &N) { delete &N; } - - /// Return true if creation of pi-blocks are supported and desired, - /// and false otherwise. - virtual bool shouldCreatePiBlocks() const { return true; } - - /// Return true if graph simplification step is requested, and false - /// otherwise. - virtual bool shouldSimplify() const { return true; } - - /// Return true if it's safe to merge the two nodes. - virtual bool areNodesMergeable(const NodeType &A, - const NodeType &B) const = 0; - - /// Append the content of node \p B into node \p A and remove \p B and - /// the edge between \p A and \p B from the graph. - virtual void mergeNodes(NodeType &A, NodeType &B) = 0; - - /// Given an instruction \p I return its associated ordinal number. - size_t getOrdinal(Instruction &I) { - assert(InstOrdinalMap.find(&I) != InstOrdinalMap.end() && - "No ordinal computed for this instruction."); - return InstOrdinalMap[&I]; - } - - /// Given a node \p N return its associated ordinal number. - size_t getOrdinal(NodeType &N) { - assert(NodeOrdinalMap.find(&N) != NodeOrdinalMap.end() && - "No ordinal computed for this node."); - return NodeOrdinalMap[&N]; - } - - /// Map types to map instructions to nodes used when populating the graph. - using InstToNodeMap = DenseMap<Instruction *, NodeType *>; - - /// Map Types to map instruction/nodes to an ordinal number. - using InstToOrdinalMap = DenseMap<Instruction *, size_t>; - using NodeToOrdinalMap = DenseMap<NodeType *, size_t>; - - /// Reference to the graph that gets built by a concrete implementation of - /// this builder. - GraphType &Graph; - - /// Dependence information used to create memory dependence edges in the - /// graph. - DependenceInfo &DI; - - /// The list of basic blocks to consider when building the graph. - const BasicBlockListType &BBList; - - /// A mapping from instructions to the corresponding nodes in the graph. - InstToNodeMap IMap; - - /// A mapping from each instruction to an ordinal number. This map is used to - /// populate the \p NodeOrdinalMap. - InstToOrdinalMap InstOrdinalMap; - - /// A mapping from nodes to an ordinal number. This map is used to sort nodes - /// in a pi-block based on program order. - NodeToOrdinalMap NodeOrdinalMap; -}; - -} // namespace llvm - -#endif // LLVM_ANALYSIS_DEPENDENCE_GRAPH_BUILDER_H diff --git a/hpvm/llvm_patches/lib/Analysis/CMakeLists.txt b/hpvm/llvm_patches/lib/Analysis/CMakeLists.txt deleted file mode 100644 index 0f1c6014a329fab214630a8f102f51d47279750c..0000000000000000000000000000000000000000 --- a/hpvm/llvm_patches/lib/Analysis/CMakeLists.txt +++ /dev/null @@ -1,105 +0,0 @@ -add_llvm_library(LLVMAnalysis - AliasAnalysis.cpp - AliasAnalysisEvaluator.cpp - AliasAnalysisSummary.cpp - AliasSetTracker.cpp - Analysis.cpp - AssumptionCache.cpp - BasicAliasAnalysis.cpp - BlockFrequencyInfo.cpp - BlockFrequencyInfoImpl.cpp - BranchProbabilityInfo.cpp - CFG.cpp - CFGPrinter.cpp - CFLAndersAliasAnalysis.cpp - CFLSteensAliasAnalysis.cpp - CGSCCPassManager.cpp - CallGraph.cpp - CallGraphSCCPass.cpp - CallPrinter.cpp - CaptureTracking.cpp - CmpInstAnalysis.cpp - CostModel.cpp - CodeMetrics.cpp - ConstantFolding.cpp - DDG.cpp - Delinearization.cpp - DemandedBits.cpp - DependenceAnalysis.cpp - DependenceGraphBuilder.cpp - DivergenceAnalysis.cpp - DomPrinter.cpp - DomTreeUpdater.cpp - DominanceFrontier.cpp - EHPersonalities.cpp - GlobalsModRef.cpp - GuardUtils.cpp - IVDescriptors.cpp - IVUsers.cpp - IndirectCallPromotionAnalysis.cpp - InlineCost.cpp - InstCount.cpp - InstructionPrecedenceTracking.cpp - InstructionSimplify.cpp - Interval.cpp - IntervalPartition.cpp - LazyBranchProbabilityInfo.cpp - LazyBlockFrequencyInfo.cpp - LazyCallGraph.cpp - LazyValueInfo.cpp - LegacyDivergenceAnalysis.cpp - Lint.cpp - Loads.cpp - LoopAccessAnalysis.cpp - LoopAnalysisManager.cpp - LoopUnrollAnalyzer.cpp - LoopInfo.cpp - LoopPass.cpp - MemDepPrinter.cpp - MemDerefPrinter.cpp - MemoryBuiltins.cpp - MemoryDependenceAnalysis.cpp - MemoryLocation.cpp - MemorySSA.cpp - MemorySSAUpdater.cpp - ModuleDebugInfoPrinter.cpp - ModuleSummaryAnalysis.cpp - MustExecute.cpp - ObjCARCAliasAnalysis.cpp - ObjCARCAnalysisUtils.cpp - ObjCARCInstKind.cpp - OptimizationRemarkEmitter.cpp - OrderedBasicBlock.cpp - OrderedInstructions.cpp - PHITransAddr.cpp - PhiValues.cpp - PostDominators.cpp - ProfileSummaryInfo.cpp - PtrUseVisitor.cpp - RegionInfo.cpp - RegionPass.cpp - RegionPrinter.cpp - ScalarEvolution.cpp - ScalarEvolutionAliasAnalysis.cpp - ScalarEvolutionExpander.cpp - ScalarEvolutionNormalization.cpp - StackSafetyAnalysis.cpp - SyncDependenceAnalysis.cpp - SyntheticCountsUtils.cpp - TargetLibraryInfo.cpp - TargetTransformInfo.cpp - Trace.cpp - TypeBasedAliasAnalysis.cpp - TypeMetadataUtils.cpp - ScopedNoAliasAA.cpp - ValueLattice.cpp - ValueLatticeUtils.cpp - ValueTracking.cpp - VectorUtils.cpp - - ADDITIONAL_HEADER_DIRS - ${LLVM_MAIN_INCLUDE_DIR}/llvm/Analysis - - DEPENDS - intrinsics_gen - ) diff --git a/hpvm/llvm_patches/lib/Analysis/DDG.cpp b/hpvm/llvm_patches/lib/Analysis/DDG.cpp deleted file mode 100644 index 7bd3044918492ee29aae21599c77f6e66f38804c..0000000000000000000000000000000000000000 --- a/hpvm/llvm_patches/lib/Analysis/DDG.cpp +++ /dev/null @@ -1,326 +0,0 @@ -//===- DDG.cpp - Data Dependence Graph -------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The implementation for the data dependence graph. -//===----------------------------------------------------------------------===// -#include "llvm/Analysis/DDG.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Support/CommandLine.h" -#include <memory> - -using namespace llvm; - -static cl::opt<bool> SimplifyDDG( - "ddg-simplify", cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc( - "Simplify DDG by merging nodes that have less interesting edges.")); - -static cl::opt<bool> - CreatePiBlocks("ddg-pi-blocks", cl::init(true), cl::Hidden, cl::ZeroOrMore, - cl::desc("Create pi-block nodes.")); - -#define DEBUG_TYPE "ddg" - -template class llvm::DGEdge<DDGNode, DDGEdge>; -template class llvm::DGNode<DDGNode, DDGEdge>; -template class llvm::DirectedGraph<DDGNode, DDGEdge>; - -//===--------------------------------------------------------------------===// -// DDGNode implementation -//===--------------------------------------------------------------------===// -DDGNode::~DDGNode() {} - -bool DDGNode::collectInstructions( - llvm::function_ref<bool(Instruction *)> const &Pred, - InstructionListType &IList) const { - assert(IList.empty() && "Expected the IList to be empty on entry."); - if (isa<SimpleDDGNode>(this)) { - for (Instruction *I : cast<const SimpleDDGNode>(this)->getInstructions()) - if (Pred(I)) - IList.push_back(I); - } else if (isa<PiBlockDDGNode>(this)) { - for (const DDGNode *PN : cast<const PiBlockDDGNode>(this)->getNodes()) { - assert(!isa<PiBlockDDGNode>(PN) && "Nested PiBlocks are not supported."); - SmallVector<Instruction *, 8> TmpIList; - PN->collectInstructions(Pred, TmpIList); - IList.insert(IList.end(), TmpIList.begin(), TmpIList.end()); - } - } else - llvm_unreachable("unimplemented type of node"); - return !IList.empty(); -} - -raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode::NodeKind K) { - const char *Out; - switch (K) { - case DDGNode::NodeKind::SingleInstruction: - Out = "single-instruction"; - break; - case DDGNode::NodeKind::MultiInstruction: - Out = "multi-instruction"; - break; - case DDGNode::NodeKind::PiBlock: - Out = "pi-block"; - break; - case DDGNode::NodeKind::Root: - Out = "root"; - break; - case DDGNode::NodeKind::Unknown: - Out = "?? (error)"; - break; - } - OS << Out; - return OS; -} - -raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode &N) { - OS << "Node Address:" << &N << ":" << N.getKind() << "\n"; - if (isa<SimpleDDGNode>(N)) { - OS << " Instructions:\n"; - for (const Instruction *I : cast<const SimpleDDGNode>(N).getInstructions()) - OS.indent(2) << *I << "\n"; - } else if (isa<PiBlockDDGNode>(&N)) { - OS << "--- start of nodes in pi-block ---\n"; - auto &Nodes = cast<const PiBlockDDGNode>(&N)->getNodes(); - unsigned Count = 0; - for (const DDGNode *N : Nodes) - OS << *N << (++Count == Nodes.size() ? "" : "\n"); - OS << "--- end of nodes in pi-block ---\n"; - } else if (!isa<RootDDGNode>(N)) - llvm_unreachable("unimplemented type of node"); - - OS << (N.getEdges().empty() ? " Edges:none!\n" : " Edges:\n"); - for (auto &E : N.getEdges()) - OS.indent(2) << *E; - return OS; -} - -//===--------------------------------------------------------------------===// -// SimpleDDGNode implementation -//===--------------------------------------------------------------------===// - -SimpleDDGNode::SimpleDDGNode(Instruction &I) - : DDGNode(NodeKind::SingleInstruction), InstList() { - assert(InstList.empty() && "Expected empty list."); - InstList.push_back(&I); -} - -SimpleDDGNode::SimpleDDGNode(const SimpleDDGNode &N) - : DDGNode(N), InstList(N.InstList) { - assert(((getKind() == NodeKind::SingleInstruction && InstList.size() == 1) || - (getKind() == NodeKind::MultiInstruction && InstList.size() > 1)) && - "constructing from invalid simple node."); -} - -SimpleDDGNode::SimpleDDGNode(SimpleDDGNode &&N) - : DDGNode(std::move(N)), InstList(std::move(N.InstList)) { - assert(((getKind() == NodeKind::SingleInstruction && InstList.size() == 1) || - (getKind() == NodeKind::MultiInstruction && InstList.size() > 1)) && - "constructing from invalid simple node."); -} - -SimpleDDGNode::~SimpleDDGNode() { InstList.clear(); } - -//===--------------------------------------------------------------------===// -// PiBlockDDGNode implementation -//===--------------------------------------------------------------------===// - -PiBlockDDGNode::PiBlockDDGNode(const PiNodeList &List) - : DDGNode(NodeKind::PiBlock), NodeList(List) { - assert(!NodeList.empty() && "pi-block node constructed with an empty list."); -} - -PiBlockDDGNode::PiBlockDDGNode(const PiBlockDDGNode &N) - : DDGNode(N), NodeList(N.NodeList) { - assert(getKind() == NodeKind::PiBlock && !NodeList.empty() && - "constructing from invalid pi-block node."); -} - -PiBlockDDGNode::PiBlockDDGNode(PiBlockDDGNode &&N) - : DDGNode(std::move(N)), NodeList(std::move(N.NodeList)) { - assert(getKind() == NodeKind::PiBlock && !NodeList.empty() && - "constructing from invalid pi-block node."); -} - -PiBlockDDGNode::~PiBlockDDGNode() { NodeList.clear(); } - -//===--------------------------------------------------------------------===// -// DDGEdge implementation -//===--------------------------------------------------------------------===// - -raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGEdge::EdgeKind K) { - const char *Out; - switch (K) { - case DDGEdge::EdgeKind::RegisterDefUse: - Out = "def-use"; - break; - case DDGEdge::EdgeKind::MemoryDependence: - Out = "memory"; - break; - case DDGEdge::EdgeKind::Rooted: - Out = "rooted"; - break; - case DDGEdge::EdgeKind::Unknown: - Out = "?? (error)"; - break; - } - OS << Out; - return OS; -} - -raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGEdge &E) { - OS << "[" << E.getKind() << "] to " << &E.getTargetNode() << "\n"; - return OS; -} - -//===--------------------------------------------------------------------===// -// DataDependenceGraph implementation -//===--------------------------------------------------------------------===// -using BasicBlockListType = SmallVector<BasicBlock *, 8>; - -DataDependenceGraph::DataDependenceGraph(Function &F, DependenceInfo &D) - : DependenceGraphInfo(F.getName().str(), D) { - // Put the basic blocks in program order for correct dependence - // directions. - BasicBlockListType BBList; - for (auto &SCC : make_range(scc_begin(&F), scc_end(&F))) - for (BasicBlock * BB : SCC) - BBList.push_back(BB); - std::reverse(BBList.begin(), BBList.end()); - DDGBuilder(*this, D, BBList).populate(); -} - -DataDependenceGraph::DataDependenceGraph(Loop &L, LoopInfo &LI, - DependenceInfo &D) - : DependenceGraphInfo(Twine(L.getHeader()->getParent()->getName() + "." + - L.getHeader()->getName()) - .str(), - D) { - // Put the basic blocks in program order for correct dependence - // directions. - LoopBlocksDFS DFS(&L); - DFS.perform(&LI); - BasicBlockListType BBList; - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - BBList.push_back(BB); - DDGBuilder(*this, D, BBList).populate(); -} - -DataDependenceGraph::~DataDependenceGraph() { - for (auto *N : Nodes) { - for (auto *E : *N) - delete E; - delete N; - } -} - -bool DataDependenceGraph::addNode(DDGNode &N) { - if (!DDGBase::addNode(N)) - return false; - - // In general, if the root node is already created and linked, it is not safe - // to add new nodes since they may be unreachable by the root. However, - // pi-block nodes need to be added after the root node is linked, and they are - // always reachable by the root, because they represent components that are - // already reachable by root. - auto *Pi = dyn_cast<PiBlockDDGNode>(&N); - assert((!Root || Pi) && - "Root node is already added. No more nodes can be added."); - - if (isa<RootDDGNode>(N)) - Root = &N; - - if (Pi) - for (DDGNode *NI : Pi->getNodes()) - PiBlockMap.insert(std::make_pair(NI, Pi)); - - return true; -} - -const PiBlockDDGNode *DataDependenceGraph::getPiBlock(const NodeType &N) const { - if (PiBlockMap.find(&N) == PiBlockMap.end()) - return nullptr; - auto *Pi = PiBlockMap.find(&N)->second; - assert(PiBlockMap.find(Pi) == PiBlockMap.end() && - "Nested pi-blocks detected."); - return Pi; -} - -raw_ostream &llvm::operator<<(raw_ostream &OS, const DataDependenceGraph &G) { - for (DDGNode *Node : G) - // Avoid printing nodes that are part of a pi-block twice. They will get - // printed when the pi-block is printed. - if (!G.getPiBlock(*Node)) - OS << *Node << "\n"; - OS << "\n"; - return OS; -} - -//===--------------------------------------------------------------------===// -// DDGBuilder implementation -//===--------------------------------------------------------------------===// - -bool DDGBuilder::areNodesMergeable(const DDGNode &Src, - const DDGNode &Tgt) const { - // Only merge two nodes if they are both simple nodes and the consecutive - // instructions after merging belong to the same BB. - const auto *SimpleSrc = dyn_cast<const SimpleDDGNode>(&Src); - const auto *SimpleTgt = dyn_cast<const SimpleDDGNode>(&Tgt); - if (!SimpleSrc || !SimpleTgt) - return false; - - return SimpleSrc->getLastInstruction()->getParent() == - SimpleTgt->getFirstInstruction()->getParent(); -} - -void DDGBuilder::mergeNodes(DDGNode &A, DDGNode &B) { - DDGEdge &EdgeToFold = A.back(); - assert(A.getEdges().size() == 1 && EdgeToFold.getTargetNode() == B && - "Expected A to have a single edge to B."); - assert(isa<SimpleDDGNode>(&A) && isa<SimpleDDGNode>(&B) && - "Expected simple nodes"); - - // Copy instructions from B to the end of A. - cast<SimpleDDGNode>(&A)->appendInstructions(*cast<SimpleDDGNode>(&B)); - - // Move to A any outgoing edges from B. - for (DDGEdge *BE : B) - Graph.connect(A, BE->getTargetNode(), *BE); - - A.removeEdge(EdgeToFold); - destroyEdge(EdgeToFold); - Graph.removeNode(B); - destroyNode(B); -} - -bool DDGBuilder::shouldSimplify() const { return SimplifyDDG; } - -bool DDGBuilder::shouldCreatePiBlocks() const { return CreatePiBlocks; } - -//===--------------------------------------------------------------------===// -// DDG Analysis Passes -//===--------------------------------------------------------------------===// - -/// DDG as a loop pass. -DDGAnalysis::Result DDGAnalysis::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR) { - Function *F = L.getHeader()->getParent(); - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - return std::make_unique<DataDependenceGraph>(L, AR.LI, DI); -} -AnalysisKey DDGAnalysis::Key; - -PreservedAnalyses DDGAnalysisPrinterPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &U) { - OS << "'DDG' for loop '" << L.getHeader()->getName() << "':\n"; - OS << *AM.getResult<DDGAnalysis>(L, AR); - return PreservedAnalyses::all(); -} diff --git a/hpvm/llvm_patches/lib/Analysis/DependenceGraphBuilder.cpp b/hpvm/llvm_patches/lib/Analysis/DependenceGraphBuilder.cpp deleted file mode 100644 index 95a39f984d6378af4d7a8042954a5913bbf1a971..0000000000000000000000000000000000000000 --- a/hpvm/llvm_patches/lib/Analysis/DependenceGraphBuilder.cpp +++ /dev/null @@ -1,535 +0,0 @@ -//===- DependenceGraphBuilder.cpp ------------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This file implements common steps of the build algorithm for construction -// of dependence graphs such as DDG and PDG. -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/DependenceGraphBuilder.h" -#include "llvm/ADT/EnumeratedArray.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/DDG.h" - -using namespace llvm; - -#define DEBUG_TYPE "dgb" - -STATISTIC(TotalGraphs, "Number of dependence graphs created."); -STATISTIC(TotalDefUseEdges, "Number of def-use edges created."); -STATISTIC(TotalMemoryEdges, "Number of memory dependence edges created."); -STATISTIC(TotalFineGrainedNodes, "Number of fine-grained nodes created."); -STATISTIC(TotalPiBlockNodes, "Number of pi-block nodes created."); -STATISTIC(TotalConfusedEdges, - "Number of confused memory dependencies between two nodes."); -STATISTIC(TotalEdgeReversals, - "Number of times the source and sink of dependence was reversed to " - "expose cycles in the graph."); - -using InstructionListType = SmallVector<Instruction *, 2>; - -//===--------------------------------------------------------------------===// -// AbstractDependenceGraphBuilder implementation -//===--------------------------------------------------------------------===// - -template <class G> -void AbstractDependenceGraphBuilder<G>::computeInstructionOrdinals() { - // The BBList is expected to be in program order. - size_t NextOrdinal = 1; - for (auto *BB : BBList) - for (auto &I : *BB) - InstOrdinalMap.insert(std::make_pair(&I, NextOrdinal++)); -} - -template <class G> -void AbstractDependenceGraphBuilder<G>::createFineGrainedNodes() { - ++TotalGraphs; - assert(IMap.empty() && "Expected empty instruction map at start"); - for (BasicBlock *BB : BBList) - for (Instruction &I : *BB) { - auto &NewNode = createFineGrainedNode(I); - IMap.insert(std::make_pair(&I, &NewNode)); - NodeOrdinalMap.insert(std::make_pair(&NewNode, getOrdinal(I))); - ++TotalFineGrainedNodes; - } -} - -template <class G> -void AbstractDependenceGraphBuilder<G>::createAndConnectRootNode() { - // Create a root node that connects to every connected component of the graph. - // This is done to allow graph iterators to visit all the disjoint components - // of the graph, in a single walk. - // - // This algorithm works by going through each node of the graph and for each - // node N, do a DFS starting from N. A rooted edge is established between the - // root node and N (if N is not yet visited). All the nodes reachable from N - // are marked as visited and are skipped in the DFS of subsequent nodes. - // - // Note: This algorithm tries to limit the number of edges out of the root - // node to some extent, but there may be redundant edges created depending on - // the iteration order. For example for a graph {A -> B}, an edge from the - // root node is added to both nodes if B is visited before A. While it does - // not result in minimal number of edges, this approach saves compile-time - // while keeping the number of edges in check. - auto &RootNode = createRootNode(); - df_iterator_default_set<const NodeType *, 4> Visited; - for (auto *N : Graph) { - if (*N == RootNode) - continue; - for (auto I : depth_first_ext(N, Visited)) - if (I == N) - createRootedEdge(RootNode, *N); - } -} - -template <class G> void AbstractDependenceGraphBuilder<G>::createPiBlocks() { - if (!shouldCreatePiBlocks()) - return; - - LLVM_DEBUG(dbgs() << "==== Start of Creation of Pi-Blocks ===\n"); - - // The overall algorithm is as follows: - // 1. Identify SCCs and for each SCC create a pi-block node containing all - // the nodes in that SCC. - // 2. Identify incoming edges incident to the nodes inside of the SCC and - // reconnect them to the pi-block node. - // 3. Identify outgoing edges from the nodes inside of the SCC to nodes - // outside of it and reconnect them so that the edges are coming out of the - // SCC node instead. - - // Adding nodes as we iterate through the SCCs cause the SCC - // iterators to get invalidated. To prevent this invalidation, we first - // collect a list of nodes that are part of an SCC, and then iterate over - // those lists to create the pi-block nodes. Each element of the list is a - // list of nodes in an SCC. Note: trivial SCCs containing a single node are - // ignored. - SmallVector<NodeListType, 4> ListOfSCCs; - for (auto &SCC : make_range(scc_begin(&Graph), scc_end(&Graph))) { - if (SCC.size() > 1) - ListOfSCCs.emplace_back(SCC.begin(), SCC.end()); - } - - for (NodeListType &NL : ListOfSCCs) { - LLVM_DEBUG(dbgs() << "Creating pi-block node with " << NL.size() - << " nodes in it.\n"); - - // SCC iterator may put the nodes in an order that's different from the - // program order. To preserve original program order, we sort the list of - // nodes based on ordinal numbers computed earlier. - llvm::sort(NL, [&](NodeType *LHS, NodeType *RHS) { - return getOrdinal(*LHS) < getOrdinal(*RHS); - }); - - NodeType &PiNode = createPiBlock(NL); - ++TotalPiBlockNodes; - - // Build a set to speed up the lookup for edges whose targets - // are inside the SCC. - SmallPtrSet<NodeType *, 4> NodesInSCC(NL.begin(), NL.end()); - - // We have the set of nodes in the SCC. We go through the set of nodes - // that are outside of the SCC and look for edges that cross the two sets. - for (NodeType *N : Graph) { - - // Skip the SCC node and all the nodes inside of it. - if (*N == PiNode || NodesInSCC.count(N)) - continue; - - for (NodeType *SCCNode : NL) { - - enum Direction { - Incoming, // Incoming edges to the SCC - Outgoing, // Edges going ot of the SCC - DirectionCount // To make the enum usable as an array index. - }; - - // Use these flags to help us avoid creating redundant edges. If there - // are more than one edges from an outside node to inside nodes, we only - // keep one edge from that node to the pi-block node. Similarly, if - // there are more than one edges from inside nodes to an outside node, - // we only keep one edge from the pi-block node to the outside node. - // There is a flag defined for each direction (incoming vs outgoing) and - // for each type of edge supported, using a two-dimensional boolean - // array. - using EdgeKind = typename EdgeType::EdgeKind; - EnumeratedArray<bool, EdgeKind> EdgeAlreadyCreated[DirectionCount]{ - false, false}; - - auto createEdgeOfKind = [this](NodeType &Src, NodeType &Dst, - const EdgeKind K) { - switch (K) { - case EdgeKind::RegisterDefUse: - createDefUseEdge(Src, Dst); - break; - case EdgeKind::MemoryDependence: - createMemoryEdge(Src, Dst); - break; - case EdgeKind::Rooted: - createRootedEdge(Src, Dst); - break; - default: - llvm_unreachable("Unsupported type of edge."); - } - }; - - auto reconnectEdges = [&](NodeType *Src, NodeType *Dst, NodeType *New, - const Direction Dir) { - if (!Src->hasEdgeTo(*Dst)) - return; - LLVM_DEBUG(dbgs() - << "reconnecting(" - << (Dir == Direction::Incoming ? "incoming)" : "outgoing)") - << ":\nSrc:" << *Src << "\nDst:" << *Dst - << "\nNew:" << *New << "\n"); - assert((Dir == Direction::Incoming || Dir == Direction::Outgoing) && - "Invalid direction."); - - SmallVector<EdgeType *, 10> EL; - Src->findEdgesTo(*Dst, EL); - for (EdgeType *OldEdge : EL) { - EdgeKind Kind = OldEdge->getKind(); - if (!EdgeAlreadyCreated[Dir][Kind]) { - if (Dir == Direction::Incoming) { - createEdgeOfKind(*Src, *New, Kind); - LLVM_DEBUG(dbgs() << "created edge from Src to New.\n"); - } else if (Dir == Direction::Outgoing) { - createEdgeOfKind(*New, *Dst, Kind); - LLVM_DEBUG(dbgs() << "created edge from New to Dst.\n"); - } - EdgeAlreadyCreated[Dir][Kind] = true; - } - Src->removeEdge(*OldEdge); - destroyEdge(*OldEdge); - LLVM_DEBUG(dbgs() << "removed old edge between Src and Dst.\n\n"); - } - }; - - // Process incoming edges incident to the pi-block node. - reconnectEdges(N, SCCNode, &PiNode, Direction::Incoming); - - // Process edges that are coming out of the pi-block node. - reconnectEdges(SCCNode, N, &PiNode, Direction::Outgoing); - } - } - } - - // Ordinal maps are no longer needed. - InstOrdinalMap.clear(); - NodeOrdinalMap.clear(); - - LLVM_DEBUG(dbgs() << "==== End of Creation of Pi-Blocks ===\n"); -} - -template <class G> void AbstractDependenceGraphBuilder<G>::createDefUseEdges() { - for (NodeType *N : Graph) { - InstructionListType SrcIList; - N->collectInstructions([](const Instruction *I) { return true; }, SrcIList); - - // Use a set to mark the targets that we link to N, so we don't add - // duplicate def-use edges when more than one instruction in a target node - // use results of instructions that are contained in N. - SmallPtrSet<NodeType *, 4> VisitedTargets; - - for (Instruction *II : SrcIList) { - for (User *U : II->users()) { - Instruction *UI = dyn_cast<Instruction>(U); - if (!UI) - continue; - NodeType *DstNode = nullptr; - if (IMap.find(UI) != IMap.end()) - DstNode = IMap.find(UI)->second; - - // In the case of loops, the scope of the subgraph is all the - // basic blocks (and instructions within them) belonging to the loop. We - // simply ignore all the edges coming from (or going into) instructions - // or basic blocks outside of this range. - if (!DstNode) { - LLVM_DEBUG( - dbgs() - << "skipped def-use edge since the sink" << *UI - << " is outside the range of instructions being considered.\n"); - continue; - } - - // Self dependencies are ignored because they are redundant and - // uninteresting. - if (DstNode == N) { - LLVM_DEBUG(dbgs() - << "skipped def-use edge since the sink and the source (" - << N << ") are the same.\n"); - continue; - } - - if (VisitedTargets.insert(DstNode).second) { - createDefUseEdge(*N, *DstNode); - ++TotalDefUseEdges; - } - } - } - } -} - -template <class G> -void AbstractDependenceGraphBuilder<G>::createMemoryDependencyEdges() { - using DGIterator = typename G::iterator; - auto isMemoryAccess = [](const Instruction *I) { - return I->mayReadOrWriteMemory(); - }; - for (DGIterator SrcIt = Graph.begin(), E = Graph.end(); SrcIt != E; ++SrcIt) { - LLVM_DEBUG(errs() << "Src Node: " << *SrcIt << "\n"); - InstructionListType SrcIList; - (*SrcIt)->collectInstructions(isMemoryAccess, SrcIList); - if (SrcIList.empty()) - continue; - - for (DGIterator DstIt = SrcIt; DstIt != E; ++DstIt) { - if (**SrcIt == **DstIt) - continue; - InstructionListType DstIList; - (*DstIt)->collectInstructions(isMemoryAccess, DstIList); - if (DstIList.empty()) - continue; - bool ForwardEdgeCreated = false; - bool BackwardEdgeCreated = false; - LLVM_DEBUG(errs() << "***********************************************\n"); - for (Instruction *ISrc : SrcIList) { - LLVM_DEBUG(errs() << "Src: " << *ISrc << "\n"); - for (Instruction *IDst : DstIList) { - LLVM_DEBUG(errs() << "Dst: " << *IDst << "\n"); - auto D = DI.depends(ISrc, IDst, true); - if (!D) { - LLVM_DEBUG(errs() << "--> No Dependence, moving on!\n"); - continue; - - } - LLVM_DEBUG(D->dump(errs())); - - // If we have a dependence with its left-most non-'=' direction - // being '>' we need to reverse the direction of the edge, because - // the source of the dependence cannot occur after the sink. For - // confused dependencies, we will create edges in both directions to - // represent the possibility of a cycle. - - auto createConfusedEdges = [&](NodeType &Src, NodeType &Dst) { - if (!ForwardEdgeCreated) { - createMemoryEdge(Src, Dst); - ++TotalMemoryEdges; - } - if (!BackwardEdgeCreated) { - createMemoryEdge(Dst, Src); - ++TotalMemoryEdges; - } - ForwardEdgeCreated = BackwardEdgeCreated = true; - ++TotalConfusedEdges; - }; - - auto createForwardEdge = [&](NodeType &Src, NodeType &Dst) { - if (!ForwardEdgeCreated) { - createMemoryEdge(Src, Dst); - ++TotalMemoryEdges; - } - ForwardEdgeCreated = true; - }; - - auto createBackwardEdge = [&](NodeType &Src, NodeType &Dst) { - if (!BackwardEdgeCreated) { - createMemoryEdge(Dst, Src); - ++TotalMemoryEdges; - } - BackwardEdgeCreated = true; - }; - - if (D->isConfused()) { - LLVM_DEBUG(errs() << "--> Confused Dependence: creating Confused Edge\n"); - createConfusedEdges(**SrcIt, **DstIt); - } else if (D->isOrdered() && !D->isLoopIndependent()) { - LLVM_DEBUG(errs() << "--> Ordered, Loop-Dependent Dependence:\n"); - bool ReversedEdge = false; - for (unsigned Level = 1; Level <= D->getLevels(); ++Level) { - LLVM_DEBUG(errs() << "----> Lvl: " << Level << ": "); - if (D->getDirection(Level) == Dependence::DVEntry::EQ) { - LLVM_DEBUG(errs() << "EQ\n"); - continue; - } else if (D->getDirection(Level) == Dependence::DVEntry::GT) { - LLVM_DEBUG(errs() << "GT\n"); - LLVM_DEBUG(errs() << "------> Invalid Dependence. Creating Backward Edge!\n"); - createBackwardEdge(**SrcIt, **DstIt); - ReversedEdge = true; - ++TotalEdgeReversals; - break; - } else if (D->getDirection(Level) == Dependence::DVEntry::LT){ - LLVM_DEBUG(errs() << "LT\n"); - break; - } else { - LLVM_DEBUG(errs() << " Confused\n"); - createConfusedEdges(**SrcIt, **DstIt); - break; - } - } - if (!ReversedEdge) { - LLVM_DEBUG(errs() << "------> Creating Forward Edge!\n"); - createForwardEdge(**SrcIt, **DstIt); - } - } else { - LLVM_DEBUG(errs() << "--> Creating Forward Edge!\n"); - createForwardEdge(**SrcIt, **DstIt); - } - // Avoid creating duplicate edges. - if (ForwardEdgeCreated && BackwardEdgeCreated) { - LLVM_DEBUG(errs() << "--> Created all possible edges between Src and Dst!\n"); - break; - } - } - - // If we've created edges in both directions, there is no more - // unique edge that we can create between these two nodes, so we - // can exit early. - if (ForwardEdgeCreated && BackwardEdgeCreated) { - LLVM_DEBUG(errs() << "No more unique edges possible!\n"); - break; - } - } - } - } -} - -template <class G> void AbstractDependenceGraphBuilder<G>::simplify() { - if (!shouldSimplify()) - return; - LLVM_DEBUG(dbgs() << "==== Start of Graph Simplification ===\n"); - - // This algorithm works by first collecting a set of candidate nodes that have - // an out-degree of one (in terms of def-use edges), and then ignoring those - // whose targets have an in-degree more than one. Each node in the resulting - // set can then be merged with its corresponding target and put back into the - // worklist until no further merge candidates are available. - SmallPtrSet<NodeType *, 32> CandidateSourceNodes; - - // A mapping between nodes and their in-degree. To save space, this map - // only contains nodes that are targets of nodes in the CandidateSourceNodes. - DenseMap<NodeType *, unsigned> TargetInDegreeMap; - - for (NodeType *N : Graph) { - if (N->getEdges().size() != 1) - continue; - EdgeType &Edge = N->back(); - if (!Edge.isDefUse()) - continue; - CandidateSourceNodes.insert(N); - - // Insert an element into the in-degree map and initialize to zero. The - // count will get updated in the next step. - TargetInDegreeMap.insert({&Edge.getTargetNode(), 0}); - } - - LLVM_DEBUG({ - dbgs() << "Size of candidate src node list:" << CandidateSourceNodes.size() - << "\nNode with single outgoing def-use edge:\n"; - for (NodeType *N : CandidateSourceNodes) { - dbgs() << N << "\n"; - } - }); - - for (NodeType *N : Graph) { - for (EdgeType *E : *N) { - NodeType *Tgt = &E->getTargetNode(); - auto TgtIT = TargetInDegreeMap.find(Tgt); - if (TgtIT != TargetInDegreeMap.end()) - ++(TgtIT->second); - } - } - - LLVM_DEBUG({ - dbgs() << "Size of target in-degree map:" << TargetInDegreeMap.size() - << "\nContent of in-degree map:\n"; - for (auto &I : TargetInDegreeMap) { - dbgs() << I.first << " --> " << I.second << "\n"; - } - }); - - SmallVector<NodeType *, 32> Worklist(CandidateSourceNodes.begin(), - CandidateSourceNodes.end()); - while (!Worklist.empty()) { - NodeType &Src = *Worklist.pop_back_val(); - // As nodes get merged, we need to skip any node that has been removed from - // the candidate set (see below). - if (CandidateSourceNodes.find(&Src) == CandidateSourceNodes.end()) - continue; - CandidateSourceNodes.erase(&Src); - - assert(Src.getEdges().size() == 1 && - "Expected a single edge from the candidate src node."); - NodeType &Tgt = Src.back().getTargetNode(); - assert(TargetInDegreeMap.find(&Tgt) != TargetInDegreeMap.end() && - "Expected target to be in the in-degree map."); - - if (TargetInDegreeMap[&Tgt] != 1) - continue; - - if (!areNodesMergeable(Src, Tgt)) - continue; - - // Do not merge if there is also an edge from target to src (immediate - // cycle). - if (Tgt.hasEdgeTo(Src)) - continue; - - LLVM_DEBUG(dbgs() << "Merging:" << Src << "\nWith:" << Tgt << "\n"); - - mergeNodes(Src, Tgt); - - // If the target node is in the candidate set itself, we need to put the - // src node back into the worklist again so it gives the target a chance - // to get merged into it. For example if we have: - // {(a)->(b), (b)->(c), (c)->(d), ...} and the worklist is initially {b, a}, - // then after merging (a) and (b) together, we need to put (a,b) back in - // the worklist so that (c) can get merged in as well resulting in - // {(a,b,c) -> d} - // We also need to remove the old target (b), from the worklist. We first - // remove it from the candidate set here, and skip any item from the - // worklist that is not in the set. - if (CandidateSourceNodes.find(&Tgt) != CandidateSourceNodes.end()) { - Worklist.push_back(&Src); - CandidateSourceNodes.insert(&Src); - CandidateSourceNodes.erase(&Tgt); - LLVM_DEBUG(dbgs() << "Putting " << &Src << " back in the worklist.\n"); - } - } - LLVM_DEBUG(dbgs() << "=== End of Graph Simplification ===\n"); -} - -template <class G> -void AbstractDependenceGraphBuilder<G>::sortNodesTopologically() { - - // If we don't create pi-blocks, then we may not have a DAG. - if (!shouldCreatePiBlocks()) - return; - - SmallVector<NodeType *, 64> NodesInPO; - using NodeKind = typename NodeType::NodeKind; - for (NodeType *N : post_order(&Graph)) { - if (N->getKind() == NodeKind::PiBlock) { - // Put members of the pi-block right after the pi-block itself, for - // convenience. - const NodeListType &PiBlockMembers = getNodesInPiBlock(*N); - NodesInPO.insert(NodesInPO.end(), PiBlockMembers.begin(), - PiBlockMembers.end()); - } - NodesInPO.push_back(N); - } - - size_t OldSize = Graph.Nodes.size(); - Graph.Nodes.clear(); - for (NodeType *N : reverse(NodesInPO)) - Graph.Nodes.push_back(N); - if (Graph.Nodes.size() != OldSize) - assert(false && - "Expected the number of nodes to stay the same after the sort"); -} - -template class llvm::AbstractDependenceGraphBuilder<DataDependenceGraph>; -template class llvm::DependenceGraphInfo<DDGNode>; diff --git a/hpvm/projects/hpvm-rt/hpvm-rt.cpp b/hpvm/projects/hpvm-rt/hpvm-rt.cpp index f0716378fe8c62639555396a3d268aae051534ba..0ff234e952f6aec93377ec33d51fa2a1a9f64c4e 100644 --- a/hpvm/projects/hpvm-rt/hpvm-rt.cpp +++ b/hpvm/projects/hpvm-rt/hpvm-rt.cpp @@ -1,6 +1,3 @@ - -//#define HPVM_USE_OPENCL 1 - #include <algorithm> #include <cassert> #include <cstdio> @@ -10,13 +7,8 @@ #include <map> #include <pthread.h> #include <string> -#include <unistd.h> -#ifdef HPVM_USE_OPENCL - -#include <CL/cl.h> - -#endif +#include <unistd.h> #if _POSIX_VERSION >= 200112L #include <sys/time.h> @@ -48,22 +40,6 @@ typedef struct { } DFNodeContext_CPU; -#ifdef HPVM_USE_OPENCL - -typedef struct { - cl_context clOCLContext; - cl_command_queue clCommandQue; - cl_program clProgram; - cl_kernel clKernel; -} DFNodeContext_OCL; - -cl_context globalOCLContext; -cl_device_id *clDevices; -cl_command_queue globalCommandQue; - -#endif - - MemTracker MTracker; vector<DFGDepth> DStack; // Mutex to prevent concurrent access by multiple thereads in pipeline @@ -72,167 +48,6 @@ pthread_mutex_t ocl_mtx; #define NUM_TESTS 1 hpvm_TimerSet kernel_timer; -#ifdef HPVM_USE_OPENCL - -static const char *getErrorString(cl_int error) { - switch (error) { - // run-time and JIT compiler errors - case 0: - return "CL_SUCCESS"; - case -1: - return "CL_DEVICE_NOT_FOUND"; - case -2: - return "CL_DEVICE_NOT_AVAILABLE"; - case -3: - return "CL_COMPILER_NOT_AVAILABLE"; - case -4: - return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case -5: - return "CL_OUT_OF_RESOURCES"; - case -6: - return "CL_OUT_OF_HOST_MEMORY"; - case -7: - return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case -8: - return "CL_MEM_COPY_OVERLAP"; - case -9: - return "CL_IMAGE_FORMAT_MISMATCH"; - case -10: - return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case -11: - return "CL_BUILD_PROGRAM_FAILURE"; - case -12: - return "CL_MAP_FAILURE"; - case -13: - return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case -14: - return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; - case -15: - return "CL_COMPILE_PROGRAM_FAILURE"; - case -16: - return "CL_LINKER_NOT_AVAILABLE"; - case -17: - return "CL_LINK_PROGRAM_FAILURE"; - case -18: - return "CL_DEVICE_PARTITION_FAILED"; - case -19: - return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - - // compile-time errors - case -30: - return "CL_INVALID_VALUE"; - case -31: - return "CL_INVALID_DEVICE_TYPE"; - case -32: - return "CL_INVALID_PLATFORM"; - case -33: - return "CL_INVALID_DEVICE"; - case -34: - return "CL_INVALID_CONTEXT"; - case -35: - return "CL_INVALID_QUEUE_PROPERTIES"; - case -36: - return "CL_INVALID_COMMAND_QUEUE"; - case -37: - return "CL_INVALID_HOST_PTR"; - case -38: - return "CL_INVALID_MEM_OBJECT"; - case -39: - return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case -40: - return "CL_INVALID_IMAGE_SIZE"; - case -41: - return "CL_INVALID_SAMPLER"; - case -42: - return "CL_INVALID_BINARY"; - case -43: - return "CL_INVALID_BUILD_OPTIONS"; - case -44: - return "CL_INVALID_PROGRAM"; - case -45: - return "CL_INVALID_PROGRAM_EXECUTABLE"; - case -46: - return "CL_INVALID_KERNEL_NAME"; - case -47: - return "CL_INVALID_KERNEL_DEFINITION"; - case -48: - return "CL_INVALID_KERNEL"; - case -49: - return "CL_INVALID_ARG_INDEX"; - case -50: - return "CL_INVALID_ARG_VALUE"; - case -51: - return "CL_INVALID_ARG_SIZE"; - case -52: - return "CL_INVALID_KERNEL_ARGS"; - case -53: - return "CL_INVALID_WORK_DIMENSION"; - case -54: - return "CL_INVALID_WORK_GROUP_SIZE"; - case -55: - return "CL_INVALID_WORK_ITEM_SIZE"; - case -56: - return "CL_INVALID_GLOBAL_OFFSET"; - case -57: - return "CL_INVALID_EVENT_WAIT_LIST"; - case -58: - return "CL_INVALID_EVENT"; - case -59: - return "CL_INVALID_OPERATION"; - case -60: - return "CL_INVALID_GL_OBJECT"; - case -61: - return "CL_INVALID_BUFFER_SIZE"; - case -62: - return "CL_INVALID_MIP_LEVEL"; - case -63: - return "CL_INVALID_GLOBAL_WORK_SIZE"; - case -64: - return "CL_INVALID_PROPERTY"; - case -65: - return "CL_INVALID_IMAGE_DESCRIPTOR"; - case -66: - return "CL_INVALID_COMPILER_OPTIONS"; - case -67: - return "CL_INVALID_LINKER_OPTIONS"; - case -68: - return "CL_INVALID_DEVICE_PARTITION_COUNT"; - - // extension errors - case -1000: - return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; - case -1001: - return "CL_PLATFORM_NOT_FOUND_KHR"; - case -1002: - return "CL_INVALID_D3D10_DEVICE_KHR"; - case -1003: - return "CL_INVALID_D3D10_RESOURCE_KHR"; - case -1004: - return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; - case -1005: - return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; - default: - return "Unknown OpenCL error"; - } -} - -static inline void checkErr(cl_int err, cl_int success, const char *name) { - if (err != success) { - cout << "ERROR: " << name << flush << "\n"; - cout << "ErrorCode: " << getErrorString(err) << flush << "\n"; - exit(EXIT_FAILURE); - } -} - -#endif - - -void openCLAbort(){ - cout <<" ERROR: OpenCL NOT found!. Please Recompile with OpenCL - Make sure to have OpenCL on System \n "; - abort(); -} - - /************************* Depth Stack Routines ***************************/ void llvm_hpvm_cpu_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, @@ -284,9 +99,6 @@ uint64_t llvm_hpvm_cpu_getDimInstance(unsigned level, unsigned dim) { /********************** Memory Tracking Routines **************************/ void llvm_hpvm_track_mem(void *ptr, size_t size) { - -#ifdef HPVM_USE_OPENCL - DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); if (MTE != NULL) { @@ -296,19 +108,9 @@ void llvm_hpvm_track_mem(void *ptr, size_t size) { DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n"); MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr); DEBUG(MTracker.print()); - -#else - - openCLAbort(); - -#endif - } void llvm_hpvm_untrack_mem(void *ptr) { - -#ifdef HPVM_USE_OPENCL - DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); if (MTE == NULL) { @@ -317,94 +119,16 @@ void llvm_hpvm_untrack_mem(void *ptr) { return; } DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n"); - if (MTE->getLocation() == MemTrackerEntry::DEVICE) - clReleaseMemObject((cl_mem)MTE->getAddress()); MTracker.remove(ptr); DEBUG(MTracker.print()); - -#else - - openCLAbort(); - -#endif - -} - - -#ifdef HPVM_USE_OPENCL - -static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size, - DFNodeContext_OCL *Context, bool isInput, - bool isOutput) { - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "[OCL] Request memory: " << ptr - << " for context: " << Context->clOCLContext << flush << "\n"); - MemTrackerEntry *MTE = MTracker.lookup(ptr); - if (MTE == NULL) { - MTracker.print(); - cout << "ERROR: Requesting memory not present in Table\n"; - exit(EXIT_FAILURE); - } - // If already on device - if (MTE->getLocation() == MemTrackerEntry::DEVICE && - ((DFNodeContext_OCL *)MTE->getContext())->clOCLContext == - Context->clOCLContext) { - DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush - << "\n"); - pthread_mutex_unlock(&ocl_mtx); - return MTE->getAddress(); - } - - DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush - << "\n"); - DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n"); - // Else copy and update the latest copy - cl_mem_flags clFlags; - cl_int errcode; - - if (isInput && isOutput) - clFlags = CL_MEM_READ_WRITE; - else if (isInput) - clFlags = CL_MEM_READ_ONLY; - else if (isOutput) - clFlags = CL_MEM_WRITE_ONLY; - else - clFlags = CL_MEM_READ_ONLY; - - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY); - cl_mem d_input = - clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); - DEBUG(cout << "\nMemory allocated on device: " << d_input << flush << "\n"); - if (isInput) { - DEBUG(cout << "\tCopying ..."); - errcode = clEnqueueWriteBuffer(Context->clCommandQue, d_input, CL_TRUE, 0, - size, MTE->getAddress(), 0, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device"); - } - - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); - DEBUG(cout << " done\n"); - MTE->update(MemTrackerEntry::DEVICE, (void *)d_input, Context); - DEBUG(cout << "Updated Table\n"); - DEBUG(MTracker.print()); - pthread_mutex_unlock(&ocl_mtx); - return d_input; - } -#endif - void *llvm_hpvm_cpu_argument_ptr(void *ptr, size_t size) { return llvm_hpvm_request_mem(ptr, size); } void *llvm_hpvm_request_mem(void *ptr, size_t size) { - -#ifdef HPVM_USE_OPENCL - pthread_mutex_lock(&ocl_mtx); DEBUG(cout << "[CPU] Request memory: " << ptr << flush << "\n"); MemTrackerEntry *MTE = MTracker.lookup(ptr); @@ -422,32 +146,8 @@ void *llvm_hpvm_request_mem(void *ptr, size_t size) { } // Else copy from device and update table - DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush - << "\n"); - DEBUG(cout << "\tCopying ..."); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COPY); - // pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clEnqueueReadBuffer( - ((DFNodeContext_OCL *)MTE->getContext())->clCommandQue, - (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL); - // pthread_mutex_unlock(&ocl_mtx); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); - DEBUG(cout << " done\n"); - checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output"); - DEBUG(cout << "Free mem object on device\n"); - clReleaseMemObject((cl_mem)MTE->getAddress()); - DEBUG(cout << "Updated Table\n"); - MTE->update(MemTrackerEntry::HOST, ptr); - DEBUG(MTracker.print()); - pthread_mutex_unlock(&ocl_mtx); - return ptr; - -#else - - openCLAbort(); - -#endif - + assert(0 && "Should not reach this point with CPU-only code-gen"); + return NULL; } /*************************** Timer Routines **********************************/ @@ -481,134 +181,17 @@ get_last_async(struct hpvm_TimerSet *timers) { } static void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) { - -#ifdef HPVM_USE_OPENCL - - cl_int ciErrNum = CL_SUCCESS; - struct hpvm_async_time_marker_list **new_event = &(tset->async_markers); - - while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { - new_event = &((*new_event)->next); - } - - if (*new_event == NULL) { - *new_event = (struct hpvm_async_time_marker_list *)malloc( - sizeof(struct hpvm_async_time_marker_list)); - (*new_event)->marker = calloc(1, sizeof(cl_event)); - (*new_event)->next = NULL; - } - - /* valid event handle now aquired: insert the event record */ - (*new_event)->label = NULL; - (*new_event)->timerID = timer; - ciErrNum = - clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); - } - -#else - - openCLAbort(); - -#endif - + return; } static void insert_submarker(struct hpvm_TimerSet *tset, char *label, enum hpvm_TimerID timer) { - -#ifdef HPVM_USE_OPENCL - - cl_int ciErrNum = CL_SUCCESS; - struct hpvm_async_time_marker_list **new_event = &(tset->async_markers); - - while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { - new_event = &((*new_event)->next); - } - - if (*new_event == NULL) { - *new_event = (struct hpvm_async_time_marker_list *)malloc( - sizeof(struct hpvm_async_time_marker_list)); - (*new_event)->marker = calloc(1, sizeof(cl_event)); - (*new_event)->next = NULL; - } - - /* valid event handle now aquired: insert the event record */ - (*new_event)->label = label; - (*new_event)->timerID = timer; - ciErrNum = - clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); - } - -#else - - openCLAbort(); - -#endif - + return; } /* Assumes that all recorded events have completed */ static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) { - -#ifdef HPVM_USE_OPENCL - - struct hpvm_async_time_marker_list *next_interval = NULL; - struct hpvm_async_time_marker_list *last_marker = get_last_async(tset); - hpvm_Timestamp total_async_time = 0; - - for (next_interval = tset->async_markers; next_interval != last_marker; - next_interval = next_interval->next) { - cl_ulong command_start = 0, command_end = 0; - cl_int ciErrNum = CL_SUCCESS; - - ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), - CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), &command_start, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); - } - - ciErrNum = clGetEventProfilingInfo( - *((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), &command_end, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", - ciErrNum); - } - - hpvm_Timestamp interval = - (hpvm_Timestamp)(((double)(command_end - command_start))); - tset->timers[next_interval->timerID].elapsed += interval; - if (next_interval->label != NULL) { - struct hpvm_SubTimer *subtimer = - tset->sub_timer_list[next_interval->timerID]->subtimer_list; - while (subtimer != NULL) { - if (strcmp(subtimer->label, next_interval->label) == 0) { - subtimer->timer.elapsed += interval; - break; - } - subtimer = subtimer->next; - } - } - total_async_time += interval; - next_interval->timerID = INVALID_TIMERID; - } - - if (next_interval != NULL) - next_interval->timerID = INVALID_TIMERID; - - return total_async_time; - -#else - - openCLAbort(); - -#endif - + return 0; } static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start, @@ -825,295 +408,10 @@ void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label, } void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) { - -#ifdef HPVM_USE_OPENCL - - // cerr << "Switch to timer: " << timer << flush << "\n"; - /* Stop the currently running timer */ - if (timers->current != hpvm_TimerID_NONE) { - struct hpvm_SubTimerList *subtimerlist = - timers->sub_timer_list[timers->current]; - struct hpvm_SubTimer *currSubTimer = - (subtimerlist != NULL) ? subtimerlist->current : NULL; - - if (!is_async(timers->current)) { - if (timers->current != timer) { - if (currSubTimer != NULL) { - hpvm_StopTimerAndSubTimer(&timers->timers[timers->current], - &currSubTimer->timer); - } else { - hpvm_StopTimer(&timers->timers[timers->current]); - } - } else { - if (currSubTimer != NULL) { - hpvm_StopTimer(&currSubTimer->timer); - } - } - } else { - insert_marker(timers, timer); - if (!is_async(timer)) { // if switching to async too, keep driver going - hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]); - } - } - } - - hpvm_Timestamp currentTime = get_time(); - - /* The only cases we check for asynchronous task completion is - * when an overlapping CPU operation completes, or the next - * segment blocks on completion of previous async operations */ - if (asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(timer))) { - - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); - /* CL_COMPLETE if completed */ - - cl_int ciErrNum = CL_SUCCESS; - cl_int async_done = CL_COMPLETE; - - ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), - CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), - &async_done, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stdout, "Error Querying EventInfo1!\n"); - } - - if (is_blocking(timer)) { - /* Async operations completed after previous CPU operations: - * overlapped time is the total CPU time since this set of async - * operations were first issued */ - - // timer to switch to is COPY or NONE - if (async_done != CL_COMPLETE) { - accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed), - timers->async_begin, currentTime); - } - - /* Wait on async operation completion */ - ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Waiting for Events!\n"); - } - - hpvm_Timestamp total_async_time = record_async_times(timers); - - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - if (async_done == CL_COMPLETE) { - // fprintf(stderr, "Async_done: total_async_type = %lld\n", - // total_async_time); - timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time; - } - - } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are - // outstanding so something is deeper in stack - if (async_done == CL_COMPLETE) { - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - timers->timers[hpvm_TimerID_OVERLAP].elapsed += - record_async_times(timers); - } - } - - /* Start the new timer */ - if (timer != hpvm_TimerID_NONE) { - if (!is_async(timer)) { - hpvm_StartTimer(&timers->timers[timer]); - } else { - // toSwitchTo Is Async (KERNEL/COPY_ASYNC) - if (!asyncs_outstanding(timers)) { - /* No asyncs outstanding, insert a fresh async marker */ - - insert_marker(timers, timer); - timers->async_begin = currentTime; - } else if (!is_async(timers->current)) { - /* Previous asyncs still in flight, but a previous SwitchTo - * already marked the end of the most recent async operation, - * so we can rename that marker as the beginning of this async - * operation */ - - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); - last_event->label = NULL; - last_event->timerID = timer; - } - if (!is_async(timers->current)) { - hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]); - } - } - } - timers->current = timer; - -#else - - openCLAbort(); - -#endif - - } void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label, enum hpvm_TimerID category) { - -#ifdef HPVM_USE_OPENCL - - struct hpvm_SubTimerList *subtimerlist = - timers->sub_timer_list[timers->current]; - struct hpvm_SubTimer *curr = - (subtimerlist != NULL) ? subtimerlist->current : NULL; - - if (timers->current != hpvm_TimerID_NONE) { - if (!is_async(timers->current)) { - if (timers->current != category) { - if (curr != NULL) { - hpvm_StopTimerAndSubTimer(&timers->timers[timers->current], - &curr->timer); - } else { - hpvm_StopTimer(&timers->timers[timers->current]); - } - } else { - if (curr != NULL) { - hpvm_StopTimer(&curr->timer); - } - } - } else { - insert_submarker(timers, label, category); - if (!is_async(category)) { // if switching to async too, keep driver going - hpvm_StopTimer(&timers->timers[hpvm_TimerID_DRIVER]); - } - } - } - - hpvm_Timestamp currentTime = get_time(); - - /* The only cases we check for asynchronous task completion is - * when an overlapping CPU operation completes, or the next - * segment blocks on completion of previous async operations */ - if (asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(category))) { - - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); - /* CL_COMPLETE if completed */ - - cl_int ciErrNum = CL_SUCCESS; - cl_int async_done = CL_COMPLETE; - - ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), - CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), - &async_done, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stdout, "Error Querying EventInfo2!\n"); - } - - if (is_blocking(category)) { - /* Async operations completed after previous CPU operations: - * overlapped time is the total CPU time since this set of async - * operations were first issued */ - - // timer to switch to is COPY or NONE - // if it hasn't already finished, then just take now and use that as the - // elapsed time in OVERLAP anything happening after now isn't OVERLAP - // because everything is being stopped to wait for synchronization it - // seems that the extra sync wall time isn't being recorded anywhere - if (async_done != CL_COMPLETE) - accumulate_time(&(timers->timers[hpvm_TimerID_OVERLAP].elapsed), - timers->async_begin, currentTime); - - /* Wait on async operation completion */ - ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Waiting for Events!\n"); - } - hpvm_Timestamp total_async_time = record_async_times(timers); - - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - // If it did finish, then accumulate all the async time that did happen - // into OVERLAP the immediately preceding EventSynchronize theoretically - // didn't have any effect since it was already completed. - if (async_done == CL_COMPLETE /*cudaSuccess*/) - timers->timers[hpvm_TimerID_OVERLAP].elapsed += total_async_time; - - } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are - // outstanding so something is deeper in stack - if (async_done == CL_COMPLETE /*cudaSuccess*/) { - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - timers->timers[hpvm_TimerID_OVERLAP].elapsed += - record_async_times(timers); - } - // else, this isn't blocking, so just check the next time around - } - - subtimerlist = timers->sub_timer_list[category]; - struct hpvm_SubTimer *subtimer = NULL; - - if (label != NULL) { - subtimer = subtimerlist->subtimer_list; - while (subtimer != NULL) { - if (strcmp(subtimer->label, label) == 0) { - break; - } else { - subtimer = subtimer->next; - } - } - } - - /* Start the new timer */ - if (category != hpvm_TimerID_NONE) { - if (!is_async(category)) { - if (subtimerlist != NULL) { - subtimerlist->current = subtimer; - } - - if (category != timers->current && subtimer != NULL) { - hpvm_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); - } else if (subtimer != NULL) { - hpvm_StartTimer(&subtimer->timer); - } else { - hpvm_StartTimer(&timers->timers[category]); - } - } else { - if (subtimerlist != NULL) { - subtimerlist->current = subtimer; - } - - // toSwitchTo Is Async (KERNEL/COPY_ASYNC) - if (!asyncs_outstanding(timers)) { - /* No asyncs outstanding, insert a fresh async marker */ - insert_submarker(timers, label, category); - timers->async_begin = currentTime; - } else if (!is_async(timers->current)) { - /* Previous asyncs still in flight, but a previous SwitchTo - * already marked the end of the most recent async operation, - * so we can rename that marker as the beginning of this async - * operation */ - - struct hpvm_async_time_marker_list *last_event = get_last_async(timers); - last_event->timerID = category; - last_event->label = label; - } // else, marker for switchToThis was already inserted - - // toSwitchto is already asynchronous, but if current/prev state is async - // too, then DRIVER is already running - if (!is_async(timers->current)) { - hpvm_StartTimer(&timers->timers[hpvm_TimerID_DRIVER]); - } - } - } - - timers->current = category; - -#else - - openCLAbort(); - -#endif - } void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) { @@ -1182,53 +480,6 @@ void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) { } void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) { - -#ifdef HPVM_USE_OPENCL - - /* clean up all of the async event markers */ - struct hpvm_async_time_marker_list *event = timers->async_markers; - while (event != NULL) { - - cl_int ciErrNum = CL_SUCCESS; - ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); - if (ciErrNum != CL_SUCCESS) { - // fprintf(stderr, "Error Waiting for Events!\n"); - } - - ciErrNum = clReleaseEvent(*((cl_event *)(event)->marker)); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Release Events!\n"); - } - - free((event)->marker); - struct hpvm_async_time_marker_list *next = ((event)->next); - - free(event); - - event = next; - } - - int i = 0; - for (i = 0; i < hpvm_TimerID_LAST; ++i) { - if (timers->sub_timer_list[i] != NULL) { - struct hpvm_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; - struct hpvm_SubTimer *prev = NULL; - while (subtimer != NULL) { - free(subtimer->label); - prev = subtimer; - subtimer = subtimer->next; - free(prev); - } - free(timers->sub_timer_list[i]); - } - } - -#else - - openCLAbort(); - -#endif - } /**************************** Pipeline API ************************************/ @@ -1427,541 +678,6 @@ void llvm_hpvm_cpu_wait(void *graphID) { DEBUG(cout << "\t... pthread Done!\n"); } - -#ifdef HPVM_USE_OPENCL - -// Returns the platform name. -std::string getPlatformName(cl_platform_id pid) { - - cl_int status; - size_t sz; - status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); - checkErr(status, CL_SUCCESS, "Query for platform name size failed"); - - char *name = new char[sz]; - status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); - checkErr(status, CL_SUCCESS, "Query for platform name failed"); - - const auto &tmp = std::string(name, name + sz); - delete[] name; - return tmp; -} - -#endif - - -#ifdef HPVM_USE_OPENCL - -// Searches all platforms for the first platform whose name -// contains the search string (case-insensitive). -cl_platform_id findPlatform(const char *platform_name_search) { - - cl_int status; - - std::string search = platform_name_search; - std::transform(search.begin(), search.end(), search.begin(), ::tolower); - - // Get number of platforms. - cl_uint num_platforms; - status = clGetPlatformIDs(0, NULL, &num_platforms); - checkErr(status, CL_SUCCESS, "Query for number of platforms failed"); - - // Get a list of all platform ids. - cl_platform_id *pids = - (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); - status = clGetPlatformIDs(num_platforms, pids, NULL); - checkErr(status, CL_SUCCESS, "Query for all platform ids failed"); - - // For each platform, get name and compare against the search string. - for (unsigned i = 0; i < num_platforms; ++i) { - std::string name = getPlatformName(pids[i]); - - // Convert to lower case. - std::transform(name.begin(), name.end(), name.begin(), ::tolower); - - if (name.find(search) != std::string::npos) { - // Found! - cl_platform_id pid = pids[i]; - free(pids); - return pid; - } - } - - free(pids); - // No platform found. - assert(false && "No matching platform found!"); -} - -#endif - - -void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(std::string Target = T == hpvm::GPU_TARGET ? "GPU" : "SPIR"); - DEBUG(cout << "Initializing Context for " << Target << " device\n"); - cl_uint numPlatforms; - cl_int errcode; - errcode = clGetPlatformIDs(0, NULL, &numPlatforms); - checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); - - // now get all the platform IDs - cl_platform_id *platforms = - (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms); - errcode = clGetPlatformIDs(numPlatforms, platforms, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs"); - - for (unsigned i = 0; i < numPlatforms; i++) { - char buffer[10240]; - DEBUG(cout << "Device " << i << " Info -->\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL); - DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL); - DEBUG(cout << "\tVERSION = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL); - DEBUG(cout << "\tNAME = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL); - DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, - NULL); - DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n"); - } - cl_platform_id platformId; - if (T == hpvm::GPU_TARGET) { - platformId = findPlatform("nvidia"); - char buffer[10240]; - DEBUG(cout << "Found NVIDIA Device \n"); - clGetPlatformInfo(platformId, CL_PLATFORM_PROFILE, 10240, buffer, NULL); - DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 10240, buffer, NULL); - DEBUG(cout << "\tVERSION = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_NAME, 10240, buffer, NULL); - DEBUG(cout << "\tNAME = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_VENDOR, 10240, buffer, NULL); - DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); - DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n"); - } else { - platformId = findPlatform("intel"); - char buffer[10240]; - DEBUG(cout << "Found Intel Device \n"); - clGetPlatformInfo(platformId, CL_PLATFORM_PROFILE, 10240, buffer, NULL); - DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 10240, buffer, NULL); - DEBUG(cout << "\tVERSION = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_NAME, 10240, buffer, NULL); - DEBUG(cout << "\tNAME = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_VENDOR, 10240, buffer, NULL); - DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n"); - clGetPlatformInfo(platformId, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); - DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n"); - } - DEBUG(cout << "Found plarform with id: " << platformId << "\n"); - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (long)platformId, - 0}; - globalOCLContext = clCreateContextFromType( - properties, - T == hpvm::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL, - NULL, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create context"); - // get the list of OCL devices associated with context - size_t dataBytes; - errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0, NULL, - &dataBytes); - checkErr(errcode, CL_SUCCESS, "Failure to get context info length"); - - DEBUG(cout << "Got databytes: " << dataBytes << "\n"); - - clDevices = (cl_device_id *)malloc(dataBytes); - errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes, - clDevices, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to get context info"); - - free(platforms); - DEBUG(cout << "\tContext " << globalOCLContext << flush << "\n"); - checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); - - DEBUG(cout << "Initialize Kernel Timer\n"); - hpvm_InitializeTimerSet(&kernel_timer); - - pthread_mutex_unlock(&ocl_mtx); - return globalOCLContext; - -#else - - openCLAbort(); - -#endif - -} - -void llvm_hpvm_ocl_clearContext(void *graphID) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Clear Context\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - // FIXME: Have separate function to release command queue and clear context. - // Would be useful when a context has multiple command queues - clReleaseKernel(Context->clKernel); - free(Context); - DEBUG(cout << "Done with OCL kernel\n"); - cout << "Printing HPVM Timer: KernelTimer\n"; - hpvm_PrintTimerSet(&kernel_timer); - pthread_mutex_unlock(&ocl_mtx); - -#else - - openCLAbort(); - -#endif - -} - -void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set Shared Memory Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size - << flush << "\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - DEBUG(cout << "Using Context: " << Context << flush << "\n"); - DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); - cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument"); - pthread_mutex_unlock(&ocl_mtx); - -#else - - openCLAbort(); - -#endif - -} - -void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index, - size_t size) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set Scalar Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size - << flush << "\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - DEBUG(cout << "Using Context: " << Context << flush << "\n"); - DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); - cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); - checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); - pthread_mutex_unlock(&ocl_mtx); - -#else - - openCLAbort(); - -#endif - -} - -void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index, - size_t size, bool isInput, bool isOutput) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set Pointer Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input - << ", Size = " << size << flush << "\n"); - // Size should be non-zero - assert(size != 0 && "Size of data pointed to has to be non-zero!"); - DEBUG(cout << "\tInput = " << isInput << "\tOutput = " << isOutput << flush - << "\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - - pthread_mutex_unlock(&ocl_mtx); - // Check with runtime the location of this memory - cl_mem d_input = (cl_mem)llvm_hpvm_ocl_request_mem(input, size, Context, - isInput, isOutput); - - pthread_mutex_lock(&ocl_mtx); - // Set Kernel Argument - cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), - (void *)&d_input); - checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); - DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); - return d_input; - -#else - - openCLAbort(); - -#endif - -} - -void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set device memory for Output Struct:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size - << flush << "\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - cl_int errcode; - cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, - size, NULL, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device"); - errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), - (void *)&d_output); - checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); - DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); - return d_output; - -#else - - openCLAbort(); - -#endif - -} - -void llvm_hpvm_ocl_free(void *ptr) {} - -void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output, - size_t size) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Get Output:\n"); - DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output - << ", Size = " << size << flush << "\n"); - if (h_output == NULL) - h_output = malloc(size); - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - cl_int errcode = - clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, - size, h_output, 0, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output"); - pthread_mutex_unlock(&ocl_mtx); - return h_output; - -#else - - openCLAbort(); - -#endif - -} - -void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim, - const size_t *localWorkSize, - const size_t *globalWorkSize) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - - size_t GlobalWG[3]; - size_t LocalWG[3]; - - // OpenCL EnqeueNDRangeKernel function results in segementation fault if we - // directly use local and global work groups arguments. Hence, allocating it - // on stack and copying. - for (unsigned i = 0; i < workDim; i++) { - GlobalWG[i] = globalWorkSize[i]; - } - - // OpenCL allows local workgroup to be null. - if (localWorkSize != NULL) { - for (unsigned i = 0; i < workDim; i++) { - LocalWG[i] = localWorkSize[i]; - } - } - - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - // TODO: Would like to use event to ensure better scheduling of kernels. - // Currently passing the event paratemeter results in seg fault with - // clEnqueueNDRangeKernel. - DEBUG(cout << "Enqueuing kernel:\n"); - DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n"); - DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n"); - DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n"); - DEBUG(cout << "\tGlobal Work Group: ( "); - for (unsigned i = 0; i < workDim; i++) { - DEBUG(cout << GlobalWG[i] << " "); - } - DEBUG(cout << ")\n"); - if (localWorkSize != NULL) { - DEBUG(cout << "\tLocal Work Group: ( "); - for (unsigned i = 0; i < workDim; i++) { - DEBUG(cout << LocalWG[i] << " "); - } - DEBUG(cout << ")\n"); - } - clFinish(Context->clCommandQue); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_COMPUTATION); - cl_int errcode = clEnqueueNDRangeKernel( - Context->clCommandQue, Context->clKernel, workDim, NULL, GlobalWG, - (localWorkSize == NULL) ? NULL : LocalWG, 0, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); - clFinish(Context->clCommandQue); - hpvm_SwitchToTimer(&kernel_timer, hpvm_TimerID_NONE); - - pthread_mutex_unlock(&ocl_mtx); - return NULL; - -#else - - openCLAbort(); - -#endif - -} - -////////////////////////////////////////////////////////////////////////////// -//! Loads a Program binary file. -//! -//! @return the source string if succeeded, 0 otherwise -//! @param Filename program filename -//! @param szFinalLength returned length of the code string -////////////////////////////////////////////////////////////////////////////// -static char *LoadProgSource(const char *Filename, size_t *szFinalLength) { - DEBUG(cout << "Load Prog Source\n"); - // locals - FILE *pFileStream = NULL; - size_t szSourceLength; - - // open the OpenCL source code file - pFileStream = fopen(Filename, "rb"); - if (pFileStream == 0) { - return NULL; - } - - // get the length of the source code - fseek(pFileStream, 0, SEEK_END); - szSourceLength = ftell(pFileStream); - fseek(pFileStream, 0, SEEK_SET); - - // allocate a buffer for the source code string and read it in - char *cSourceString = (char *)malloc(szSourceLength + 1); - if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) { - fclose(pFileStream); - free(cSourceString); - return 0; - } - - // close the file and return the total length of the combined (preamble + - // source) string - fclose(pFileStream); - if (szFinalLength != 0) { - *szFinalLength = szSourceLength; - } - cSourceString[szSourceLength] = '\0'; - - return cSourceString; -} - -void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Launch OCL Kernel\n"); - // Initialize OpenCL - - // OpenCL specific variables - DFNodeContext_OCL *Context = - (DFNodeContext_OCL *)malloc(sizeof(DFNodeContext_OCL)); - - size_t kernelLength; - cl_int errcode; - - // For a single context for all kernels - Context->clOCLContext = globalOCLContext; - - // Create a command-queue - Context->clCommandQue = clCreateCommandQueue( - Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); - globalCommandQue = Context->clCommandQue; - checkErr(errcode, CL_SUCCESS, "Failure to create command queue"); - - DEBUG(cout << "Loading program binary: " << FileName << flush << "\n"); - char *programSource = LoadProgSource(FileName, &kernelLength); - checkErr(programSource != NULL, 1 /*bool true*/, - "Failure to load Program Binary"); - - Context->clProgram = clCreateProgramWithSource( - Context->clOCLContext, 1, (const char **)&programSource, NULL, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); - - DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName - << flush << "\n"); - errcode = - clBuildProgram(Context->clProgram, 1, &clDevices[0], "", NULL, NULL); - // If build fails, get build log from device - if (errcode != CL_SUCCESS) { - cout << "ERROR: Failure to build program\n"; - size_t len = 0; - errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], - CL_PROGRAM_BUILD_LOG, 0, NULL, &len); - cout << "LOG LENGTH: " << len << flush << "\n"; - checkErr(errcode, CL_SUCCESS, - "Failure to collect program build log length"); - char *log = (char *)malloc(len * sizeof(char)); - errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], - CL_PROGRAM_BUILD_LOG, len, log, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); - - cout << "Device Build Log:\n" << log << flush << "\n"; - free(log); - pthread_mutex_unlock(&ocl_mtx); - exit(EXIT_FAILURE); - } - - Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); - - DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n"); - free(programSource); - - pthread_mutex_unlock(&ocl_mtx); - return Context; - -#else - - openCLAbort(); - -#endif - -} - -void llvm_hpvm_ocl_wait(void *graphID) { - -#ifdef HPVM_USE_OPENCL - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Wait\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID; - clFinish(Context->clCommandQue); - pthread_mutex_unlock(&ocl_mtx); - -#else - - openCLAbort(); - -#endif - -} - void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID timer) { pthread_mutex_lock(&ocl_mtx); pthread_mutex_unlock(&ocl_mtx); @@ -1984,5 +700,3 @@ void *llvm_hpvm_initializeTimerSet() { pthread_mutex_unlock(&ocl_mtx); return TS; } - -