From 86a02888b4a94e3aaf4fcda01bcb2e2cb133bf34 Mon Sep 17 00:00:00 2001 From: Akash Kothari <akashk4@tyler.cs.illinois.edu> Date: Sun, 20 Dec 2020 15:45:06 -0600 Subject: [PATCH] Copy out the ApproxHPVM headers and passes --- include/BuildDFG/BuildDFG.h | 82 + include/DFGraph.h | 415 +++ .../ExtractHPVMLeafNodes.h | 25 + .../FuseHPVMTensorNodes/FuseHPVMTensorNodes.h | 178 ++ include/GenVISC/GenVISC.h | 52 + include/InPlaceDFG/InPlaceDFGAnalysis.h | 52 + include/SupportVISC/DFG2LLVM.h | 497 ++++ include/SupportVISC/DFGTreeTraversal.h | 64 + include/SupportVISC/VISCHint.h | 35 + include/SupportVISC/VISCTimer.h | 159 ++ include/SupportVISC/VISCUtils.h | 601 +++++ lib/BuildDFG/BuildDFG.cpp | 395 +++ lib/BuildDFG/BuildDFG.exports | 0 lib/BuildDFG/CMakeLists.txt | 12 + lib/BuildDFG/LLVMBuild.txt | 21 + lib/ClearDFG/CMakeLists.txt | 12 + lib/ClearDFG/ClearDFG.cpp | 172 ++ lib/ClearDFG/ClearDFG.exports | 0 lib/ClearDFG/LLVMBuild.txt | 21 + lib/DFG2LLVM_CUDNN/CMakeLists.txt | 12 + lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp | 645 +++++ lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports | 0 lib/DFG2LLVM_CUDNN/LLVMBuild.txt | 21 + lib/DFG2LLVM_NVPTX/CMakeLists.txt | 12 + lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp | 2075 +++++++++++++++ lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports | 0 lib/DFG2LLVM_NVPTX/LLVMBuild.txt | 21 + lib/DFG2LLVM_PROMISE/CMakeLists.txt | 12 + lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp | 1283 +++++++++ lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports | 0 lib/DFG2LLVM_PROMISE/LLVMBuild.txt | 21 + lib/DFG2LLVM_SPIR/CMakeLists.txt | 12 + lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp | 2010 ++++++++++++++ lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports | 0 lib/DFG2LLVM_SPIR/LLVMBuild.txt | 21 + lib/DFG2LLVM_WrapperAPI/CMakeLists.txt | 12 + .../DFG2LLVM_WrapperAPI.cpp | 1532 +++++++++++ .../DFG2LLVM_WrapperAPI.exports | 0 lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt | 21 + lib/DFG2LLVM_X86/CMakeLists.txt | 11 + lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp | 2082 +++++++++++++++ lib/DFG2LLVM_X86/DFG2LLVM_X86.exports | 0 
lib/DFG2LLVM_X86/LLVMBuild.txt | 21 + lib/DFG2LLVM_X86_dsoc/CMakeLists.txt | 13 + lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports | 0 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp | 2128 +++++++++++++++ lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt | 22 + lib/ExtractHPVMLeafNodes/CMakeLists.txt | 13 + .../ExtractHPVMLeafNodes.cpp | 246 ++ .../ExtractHPVMLeafNodes.exports | 0 lib/ExtractHPVMLeafNodes/LLVMBuild.txt | 22 + lib/FuseHPVMTensorNodes/CMakeLists.txt | 12 + .../FuseHPVMTensorNodes.cpp | 1007 +++++++ .../FuseHPVMTensorNodes.exports | 0 lib/FuseHPVMTensorNodes/LLVMBuild.txt | 21 + lib/GenVISC/CMakeLists.txt | 12 + lib/GenVISC/GenVISC.cpp | 1590 +++++++++++ lib/GenVISC/GenVISC.exports | 0 lib/GenVISC/LLVMBuild.txt | 21 + lib/InPlaceDFG/CMakeLists.txt | 12 + lib/InPlaceDFG/InPlaceDFGAnalysis.cpp | 318 +++ lib/InPlaceDFG/InPlaceDFGAnalysis.exports | 0 lib/InPlaceDFG/LLVMBuild.txt | 21 + lib/InlineTensorCalls/CMakeLists.txt | 13 + lib/InlineTensorCalls/InlineTensorCalls.cpp | 77 + .../InlineTensorCalls.exports | 0 lib/InlineTensorCalls/LLVMBuild.txt | 22 + lib/InsertApproxInfo/CMakeLists.txt | 12 + lib/InsertApproxInfo/InsertApproxInfo.cpp | 498 ++++ lib/InsertApproxInfo/LLVMBuild.txt | 21 + lib/LocalMem/CMakeLists.txt | 12 + lib/LocalMem/LLVMBuild.txt | 21 + lib/LocalMem/LocalMem.cpp | 224 ++ lib/LocalMem/LocalMem.exports | 0 lib/MergeDFN/CMakeLists.txt | 12 + lib/MergeDFN/LLVMBuild.txt | 21 + lib/MergeDFN/MergeDFN.cpp | 2338 +++++++++++++++++ lib/MergeDFN/MergeDFN.exports | 0 lib/ReplaceIntrinsics/CMakeLists.txt | 13 + lib/ReplaceIntrinsics/LLVMBuild.txt | 22 + lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp | 516 ++++ .../ReplaceIntrinsics.exports | 0 82 files changed, 21897 insertions(+) create mode 100644 include/BuildDFG/BuildDFG.h create mode 100644 include/DFGraph.h create mode 100644 include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h create mode 100644 include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h create mode 100644 include/GenVISC/GenVISC.h create mode 100644 
include/InPlaceDFG/InPlaceDFGAnalysis.h create mode 100644 include/SupportVISC/DFG2LLVM.h create mode 100644 include/SupportVISC/DFGTreeTraversal.h create mode 100644 include/SupportVISC/VISCHint.h create mode 100644 include/SupportVISC/VISCTimer.h create mode 100644 include/SupportVISC/VISCUtils.h create mode 100644 lib/BuildDFG/BuildDFG.cpp create mode 100644 lib/BuildDFG/BuildDFG.exports create mode 100644 lib/BuildDFG/CMakeLists.txt create mode 100644 lib/BuildDFG/LLVMBuild.txt create mode 100644 lib/ClearDFG/CMakeLists.txt create mode 100644 lib/ClearDFG/ClearDFG.cpp create mode 100644 lib/ClearDFG/ClearDFG.exports create mode 100644 lib/ClearDFG/LLVMBuild.txt create mode 100644 lib/DFG2LLVM_CUDNN/CMakeLists.txt create mode 100644 lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp create mode 100644 lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports create mode 100644 lib/DFG2LLVM_CUDNN/LLVMBuild.txt create mode 100644 lib/DFG2LLVM_NVPTX/CMakeLists.txt create mode 100644 lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp create mode 100644 lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports create mode 100644 lib/DFG2LLVM_NVPTX/LLVMBuild.txt create mode 100644 lib/DFG2LLVM_PROMISE/CMakeLists.txt create mode 100644 lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp create mode 100644 lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports create mode 100644 lib/DFG2LLVM_PROMISE/LLVMBuild.txt create mode 100644 lib/DFG2LLVM_SPIR/CMakeLists.txt create mode 100644 lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp create mode 100644 lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports create mode 100644 lib/DFG2LLVM_SPIR/LLVMBuild.txt create mode 100644 lib/DFG2LLVM_WrapperAPI/CMakeLists.txt create mode 100644 lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp create mode 100644 lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.exports create mode 100644 lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt create mode 100644 lib/DFG2LLVM_X86/CMakeLists.txt create mode 100644 lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp create mode 100644 lib/DFG2LLVM_X86/DFG2LLVM_X86.exports create mode 
100644 lib/DFG2LLVM_X86/LLVMBuild.txt create mode 100644 lib/DFG2LLVM_X86_dsoc/CMakeLists.txt create mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports create mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp create mode 100644 lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt create mode 100644 lib/ExtractHPVMLeafNodes/CMakeLists.txt create mode 100644 lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp create mode 100644 lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.exports create mode 100644 lib/ExtractHPVMLeafNodes/LLVMBuild.txt create mode 100644 lib/FuseHPVMTensorNodes/CMakeLists.txt create mode 100644 lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp create mode 100644 lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.exports create mode 100644 lib/FuseHPVMTensorNodes/LLVMBuild.txt create mode 100644 lib/GenVISC/CMakeLists.txt create mode 100644 lib/GenVISC/GenVISC.cpp create mode 100644 lib/GenVISC/GenVISC.exports create mode 100644 lib/GenVISC/LLVMBuild.txt create mode 100644 lib/InPlaceDFG/CMakeLists.txt create mode 100644 lib/InPlaceDFG/InPlaceDFGAnalysis.cpp create mode 100644 lib/InPlaceDFG/InPlaceDFGAnalysis.exports create mode 100644 lib/InPlaceDFG/LLVMBuild.txt create mode 100644 lib/InlineTensorCalls/CMakeLists.txt create mode 100644 lib/InlineTensorCalls/InlineTensorCalls.cpp create mode 100644 lib/InlineTensorCalls/InlineTensorCalls.exports create mode 100644 lib/InlineTensorCalls/LLVMBuild.txt create mode 100644 lib/InsertApproxInfo/CMakeLists.txt create mode 100644 lib/InsertApproxInfo/InsertApproxInfo.cpp create mode 100644 lib/InsertApproxInfo/LLVMBuild.txt create mode 100644 lib/LocalMem/CMakeLists.txt create mode 100644 lib/LocalMem/LLVMBuild.txt create mode 100644 lib/LocalMem/LocalMem.cpp create mode 100644 lib/LocalMem/LocalMem.exports create mode 100644 lib/MergeDFN/CMakeLists.txt create mode 100644 lib/MergeDFN/LLVMBuild.txt create mode 100644 lib/MergeDFN/MergeDFN.cpp create mode 100644 lib/MergeDFN/MergeDFN.exports create mode 100644 
lib/ReplaceIntrinsics/CMakeLists.txt create mode 100644 lib/ReplaceIntrinsics/LLVMBuild.txt create mode 100644 lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp create mode 100644 lib/ReplaceIntrinsics/ReplaceIntrinsics.exports diff --git a/include/BuildDFG/BuildDFG.h b/include/BuildDFG/BuildDFG.h new file mode 100644 index 0000000000..7d51d32022 --- /dev/null +++ b/include/BuildDFG/BuildDFG.h @@ -0,0 +1,82 @@ +#ifndef __BUILD_DFG_H__ +#define __BUILD_DFG_H__ + +//== BuildDFG.h - Header file for "Hierarchical Dataflow Graph Builder Pass" =// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/ValueMap.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/DFGraph.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace builddfg { +// BuildDFG - The first implementation. 
+struct BuildDFG : public ModulePass { + static char ID; // Pass identification, replacement for typeid + BuildDFG() : ModulePass(ID) {} + + typedef ValueMap<Value*, DFNode*> HandleToDFNode; + typedef ValueMap<Value*, DFEdge*> HandleToDFEdge; + +private: + // Member variables + DFInternalNode *Root; + std::vector<DFInternalNode*> Roots; + + HandleToDFNode HandleToDFNodeMap; // This map associates the i8* pointer + // with the DFNode structure that it + // represents + HandleToDFEdge HandleToDFEdgeMap; // This map associates the i8* pointer + // with the DFEdge structure that it + // represents + + + // Functions +public: + void handleCreateNode (DFInternalNode* N, IntrinsicInst* II); +private: + void handleCreateEdge (DFInternalNode* N, IntrinsicInst* II); + void handleGetParentNode (DFInternalNode* N, IntrinsicInst* II); + void handleBindInput (DFInternalNode* N, IntrinsicInst* II); + void handleBindOutput (DFInternalNode* N, IntrinsicInst* II); + + void BuildGraph (DFInternalNode* N, Function* F); + +public: + // Functions + virtual bool runOnModule(Module &M); + + static bool isViscLaunchIntrinsic(Instruction * I); + static bool isViscGraphIntrinsic(Instruction * I); + static bool isViscQueryIntrinsic(Instruction* I); + static bool isViscIntrinsic(Instruction* I); + static bool isTypeCongruent(Type *L, Type *R); + + //TODO: Maybe make these fields const + DFInternalNode *getRoot() const; + std::vector<DFInternalNode*> &getRoots(); + HandleToDFNode &getHandleToDFNodeMap(); + HandleToDFEdge &getHandleToDFEdgeMap(); + void addElementToHandleToDFNodeMap(Value* V, DFNode* N); + void removeElementFromHandleToDFNodeMap(Value* V); + void addElementToHandleToDFEdgeMap(Value* V, DFEdge* E); + void removeElementFromHandleToDFEdgeMap(Value* V); + +}; + +} // End of namespace + +#endif + diff --git a/include/DFGraph.h b/include/DFGraph.h new file mode 100644 index 0000000000..8307e56889 --- /dev/null +++ b/include/DFGraph.h @@ -0,0 +1,415 @@ +//===----- llvm/IR/DFGraph.h - 
Classes to represent a Dataflow Graph ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the definition of the following classes: +// 1. DFNode +// 2. DFGraph +// 3. DFInternalNode +// 4. DFLeafNode +// 5. DFEdge. +// +// FIXME : We still need to figure out whether these functions are independent +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_DFGRAPH_H +#define LLVM_IR_DFGRAPH_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Compiler.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/GraphWriter.h" + + +namespace llvm { + +class DFNode; +class DFInternalNode; +class DFLeafNode; +class DFEdge; +class DFNodeVisitor; +class DFTreeTraversal; +class DFEdgeVisitor; +class DFGraph; + +//template<> struct GraphTraits + +typedef std::vector<DFNode*> DFNodeListType; + +class DFGraph { + +private: + typedef std::vector<DFEdge*> DFEdgeListType; + + // Important things that make up a Dataflow graph + // DFLeafNode* Entry; + DFInternalNode* Parent; + DFNodeListType ChildrenList; ///< List of children Dataflow Nodes + DFEdgeListType DFEdgeList; ///< List of Dataflow edges among children + +public: + DFGraph(DFInternalNode* P) { + //ChildrenList.push_back(llvm::DFLeafNode::Create(NULL, NULL, NULL)); + Parent = P; + } + + void addChildDFNode(DFNode* child) { + ChildrenList.push_back(child); + } + + void addDFEdge(DFEdge* E) { + DFEdgeList.push_back(E); + } + + // Iterators + typedef DFNodeListType::iterator children_iterator; + typedef DFNodeListType::const_iterator const_children_iterator; + + typedef DFEdgeListType::iterator dfedge_iterator; + typedef 
DFEdgeListType::const_iterator const_dfedge_iterator; + + //===--------------------------------------------------------------------===// + // DFNodeList iterator forwarding functions + // + children_iterator begin() { return ChildrenList.begin(); } + const_children_iterator begin() const { return ChildrenList.begin(); } + children_iterator end () { return ChildrenList.end(); } + const_children_iterator end () const { return ChildrenList.end(); } + + size_t size() const { return ChildrenList.size(); } + bool empty() const { return ChildrenList.empty(); } + const DFNode *front() const { return ChildrenList.front(); } + DFNode *front() { return ChildrenList.front(); } + const DFNode *back() const { return ChildrenList.back(); } + DFNode *back() { return ChildrenList.back(); } + + //===--------------------------------------------------------------------===// + + //===--------------------------------------------------------------------===// + // DFEdgeList iterator forwarding functions + // + dfedge_iterator dfedge_begin() { return DFEdgeList.begin(); } + const_dfedge_iterator dfedge_begin() const { return DFEdgeList.begin(); } + dfedge_iterator dfedge_end () { return DFEdgeList.end(); } + const_dfedge_iterator dfedge_end () const { return DFEdgeList.end(); } + + size_t dfedge_size() const { return DFEdgeList.size(); } + bool dfedge_empty() const { return DFEdgeList.empty(); } + const DFEdge *dfedge_front() const { return DFEdgeList.front(); } + DFEdge *dfedge_front() { return DFEdgeList.front(); } + const DFEdge *dfedge_back() const { return DFEdgeList.back(); } + DFEdge *dfedge_back() { return DFEdgeList.back(); } + + //===--------------------------------------------------------------------===// + + DFInternalNode* getParent() { + return Parent; + } + +}; + +// DFNode represents a single VISC Dataflow Node in LLVM. +// This is an abstract class. +// +// A Dataflow Node basically consists of +// 1. Pointer to a function describing this dataflow node +// 2. 
Number of dimensions in which the node is replicated +// 3. Number of instances in each dimension +// 4. Pointer to parent Dataflow Node +// 5. List of children Dataflow Nodes (empty if it is a leaf node) +// 6. List of Dataflow Edges among children + +class DFNode { + + public: + enum DFNodeKind { + Internal, + Leaf + }; + + private: + + const DFNodeKind Kind; + + // Important things that make up a Dataflow Node + IntrinsicInst* II; ///< Associated IntrinsicInst/Value + Function* FuncPointer; ///< Associated Function + DFNode* Parent; ///< Pointer to parent dataflow Node + int NumOfDim; ///< Number of dimensions + std::vector<Value*> DimLimits; ///< Number of instances in each dimension + DFNodeListType Successors; ///< List of successors i.e., + ///< destination DFNodes to DFEdges + ///< originating from this DFNode + + public: + DFNodeKind getKind() const {return Kind;} + + // Iterators + typedef DFNodeListType::iterator successor_iterator; + typedef DFNodeListType::const_iterator const_successor_iterator; + + //===--------------------------------------------------------------------===// + // DFNodeList iterator forwarding functions + // + successor_iterator successors_begin() { return Successors.begin(); } + const_successor_iterator successors_begin() const { return Successors.begin(); } + successor_iterator successors_end () { return Successors.end(); } + const_successor_iterator successors_end () const { return Successors.end(); } + + size_t successors_size() const { return Successors.size(); } + bool successors_empty() const { return Successors.empty(); } + const DFNode* successors_front() const { return Successors.front(); } + DFNode* successors_front() { return Successors.front(); } + const DFNode* successors_back() const { return Successors.back(); } + DFNode* successors_back() { return Successors.back(); } + + //===--------------------------------------------------------------------===// + + // Functions + DFNode(DFNodeKind _Kind, IntrinsicInst* _II, 
Function* _FuncPointer, DFNode* _Parent, + int _NumOfDim, std::vector<Value*> _DimLimits) : Kind(_Kind), II(_II), + FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim), + DimLimits(_DimLimits) {} + + void addSuccessor(DFNode* N) { + Successors.push_back(N); + } + + Function* getFuncPointer() { + return FuncPointer; + } + + + + virtual void applyDFNodeVisitor(DFNodeVisitor &V, DFNodeListType *L = NULL) = 0; +// virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0; + +}; + + +class DFInternalNode : public DFNode { + + private: + DFGraph* childGraph; + + // Constructor + DFInternalNode(IntrinsicInst* II, Function* FuncPointer, DFNode* Parent, + int NumOfDim, std::vector<Value*> DimLimits) : + DFNode(Internal, II, FuncPointer, Parent, NumOfDim, DimLimits) { + childGraph = new DFGraph(this); + //childGraph->addChildDFNode(DFLeafNode::Create(NULL, NULL, this)); + } + + public: + static DFInternalNode *Create(IntrinsicInst* II, Function* FuncPointer, + DFNode* Parent = NULL, int NumOfDim = 0, + std::vector<Value*> DimLimits = std::vector<Value*>()) { + return new DFInternalNode(II, FuncPointer, Parent, NumOfDim, DimLimits); + } + + static bool classof(const DFNode *N) { + return N->getKind() == Internal; + } + + + void addChildToDFGraph(DFNode* N) { + childGraph->addChildDFNode(N); + } + + void addEdgeToDFGraph(DFEdge* E) { + childGraph->addDFEdge(E); + } + + DFGraph* getChildGraph() { + return childGraph; + } + + void applyDFNodeVisitor(DFNodeVisitor &V, DFNodeListType *L = NULL); /*virtual*/ +// void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ +}; + +class DFLeafNode : public DFNode { + private: + // Constructor + DFLeafNode(IntrinsicInst* II, Function* FuncPointer, DFNode* Parent, + int NumOfDim = 0, std::vector<Value*> DimLimits = std::vector<Value*>()) + : DFNode(Leaf, II, FuncPointer, Parent, NumOfDim, DimLimits) {} + + public: + + static DFLeafNode *Create(IntrinsicInst* II, Function* FuncPointer, + DFNode* Parent, int NumOfDim = 0, + 
std::vector<Value*> DimLimits = std::vector<Value*>()) { + return new DFLeafNode(II, FuncPointer, Parent, NumOfDim, DimLimits); + } + + static bool classof(const DFNode *N) { + return N->getKind() == Leaf; + } + + + void applyDFNodeVisitor(DFNodeVisitor &V, DFNodeListType *L = NULL); /*virtual*/ +// void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/ + +}; + +// DFEdge represents a single VISC Dataflow Edge in LLVM. +// +// A Dataflow Edge basically consists of +// 1. Pointer to the dataflow node that is the source of this edge +// 2. Pointer to the dataflow node that is the destination of this edge +// 3. Pointer to a function that describes which instances of the source +// dataflow node are connected to which instances of the destination +// dataflow node via this edge +// 4. Pointer to a function that describes which input arguments of the +// destination dataflow node are connected to which outputs of the source +// dataflow node via this edge + +class DFEdge { + private: + // Important things that make up a Dataflow Edge + DFNode* SrcDF; ///< Pointer to source dataflow Node + DFNode* DestDF; ///< Pointer to destination dataflow Node + Function* DFMapFuncPointer; ///< Function that associates the appropriate + ///< instances of source and destination + ///< dataflow nodes + Function* ArgMapFuncPointer; ///< Function that associates the input + ///< arguments of destination with the outputs + ///< of source dataflow node + // Functions + DFEdge(DFNode* _SrcDF, DFNode* _DestDF, Function* _DFMapFuncPointer, + Function* _ArgMapFuncPointer) : SrcDF(_SrcDF), DestDF(_DestDF), + DFMapFuncPointer(_DFMapFuncPointer), + ArgMapFuncPointer(_ArgMapFuncPointer) {} + + public: + + static DFEdge *Create(DFNode* SrcDF, DFNode* DestDF, Function* DFMapFuncPtr, + Function* ArgMapFuncPtr) { + return new DFEdge(SrcDF, DestDF, DFMapFuncPtr, ArgMapFuncPtr); + + } +}; + + +//===-------------------------- Visitor Classes ---------------------------===// +// Visitor for DFNode objects 
+class DFNodeVisitor { + public: + virtual void visit(DFInternalNode* N, DFNodeListType* L = NULL) = 0; + virtual void visit(DFLeafNode* N, DFNodeListType* L = NULL) = 0; +}; + +class DFTreeTraversal : public DFNodeVisitor { + + public: + virtual void visit(DFInternalNode* N, DFNodeListType *L = NULL){ + errs() << "Visted Node (I) - " << N->getFuncPointer()->getName() << "\n"; + if (L != NULL) + L->push_back(N); + for(DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); i != e; ++i) { + DFNode* child = *i; + child->applyDFNodeVisitor(*this, L); + } + } + + virtual void visit(DFLeafNode* N, DFNodeListType *L = NULL) { + errs() << "Visted Node (L) - " << N->getFuncPointer()->getName() << "\n"; + if (L != NULL) + L->push_back(N); + } + +}; + +class FollowSuccessors : public DFNodeVisitor { + + public: + virtual void visit(DFInternalNode* N, DFNodeListType *L = NULL) { + errs() << "Visted Node (I) - " << N->getFuncPointer()->getName() << "\n"; + for(DFInternalNode::successor_iterator i = N->successors_begin(), + e = N->successors_end(); i != e; ++i) { + /* Traverse the graph. + * Choose the kind of traversal we want + * Do we do a DAG kind of traversal? 
+ */ + } + } + + virtual void visit(DFLeafNode* N, DFNodeListType* L = NULL) { + errs() << "Visted Node (L) - " << N->getFuncPointer()->getName() << "\n"; + } +}; + +// Print functions +inline raw_ostream& operator<<(raw_ostream &O, DFInternalNode &N) { + O << N.getFuncPointer()->getName(); + return O; +} + +inline raw_ostream& operator<<(raw_ostream &O, DFLeafNode &N) { + O << N.getFuncPointer()->getName(); + return O; +} + +/* +// Visitor for DFEdge objects +class DFEdgeVisitor { +public: + virtual void visit(DFEdge* E) = 0; +}; + + +//===--------------------------------------------------------------------===// +// GraphTraits specializations for DFNode graph (DFG) +//===--------------------------------------------------------------------===// + +// Provide specializations of GraphTraits to be able to treat a DFNode as a +// graph of DFNodes...struct GraphTraits { + // Elements to provide: + + // typedef NodeType - Type of Node in the graph + // typedef ChildIteratorType - Type used to iterate over children in graph + + // static NodeType *getEntryNode(const GraphType &) + // Return the entry node of the graph + + // static ChildIteratorType child_begin(NodeType *) + // static ChildIteratorType child_end (NodeType *) + // Return iterators that point to the beginning and ending of the child + // node list for the specified node. + // + + + // typedef ...iterator nodes_iterator; + // static nodes_iterator nodes_begin(GraphType *G) + // static nodes_iterator nodes_end (GraphType *G) + // nodes_iterator/begin/end - Allow iteration over all nodes in the graph + + // static unsigned size (GraphType *G) + // Return total number of nodes in the graph + // + + + // If anyone tries to use this class without having an appropriate + // specialization, make an error. If you get this error, it's because you + // need to include the appropriate specialization of GraphTraits<> for your + // graph, or you need to define it for a new graph type. 
Either that or + // your argument to XXX_begin(...) is unknown or needs to have the proper .h + // file #include'd. + // + typedef typename GraphType::UnknownGraphTypeError NodeType; +//}; +*/ + +} // End llvm namespace + +#endif diff --git a/include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h b/include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h new file mode 100644 index 0000000000..dfbd09402d --- /dev/null +++ b/include/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h @@ -0,0 +1,25 @@ +#ifndef __EXTRACT_HPVM_LEAF_NODE_FUNCTIONS_H__ + #define __EXTRACT_HPVM_LEAF_NODE_FUNCTIONS_H__ + + //===-------------------- ExtractHPVMLeafNodeFunctions.h ------------------===// + // + // The LLVM Compiler Infrastructure + // + // This file is distributed under the University of Illinois Open Source + // License. See LICENSE.TXT for details. + // + //===----------------------------------------------------------------------===// + + #include "llvm/IR/Module.h" + #include "llvm/BuildDFG/BuildDFG.h" + + namespace extracthpvmleaf { + + class ExtractHPVMLeafNodeFunctions { + public: + void run(Module &M, builddfg::BuildDFG &DFG); + }; + + } // end namespace extracthpvmleaf + + #endif \ No newline at end of file diff --git a/include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h b/include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h new file mode 100644 index 0000000000..72812071a3 --- /dev/null +++ b/include/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h @@ -0,0 +1,178 @@ +#ifndef __FUSE_HPVM_TENSOR_NODES_H__ +#define __FUSE_HPVM_TENSOR_NODES_H__ + +//=== FuseHPVMTensorNodes.h ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DFGraph.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#include "llvm/BuildDFG/BuildDFG.h" +#include "llvm/SupportVISC/DFG2LLVM.h" + +using namespace llvm; + +namespace tensorfuse { + +class FuseHPVMTensorNodes { +public: + typedef std::vector< std::vector< IntrinsicInst* > > FusionTargets; +private: + // Member variables + + // Functions + +/* Create an identical bind (in or out, depending on the argument intrinsic) * + * with different src (true) or dst (false) port */ + IntrinsicInst* createIdenticalBindWithDifferentPort(IntrinsicInst* II, + unsigned port, + bool srcport); +/* Given two createNode intrinsics describing connected nodes, this function * + * returns the argument list type of the fused function */ + void createArgTypes(IntrinsicInst* II1, + IntrinsicInst* II2, + std::vector<Type*> &ArgTypes); +/* Get the return type of the function for fused node II1-II2 */ + StructType* createReturnType(IntrinsicInst* II1, IntrinsicInst* II2); +/* Copy argument names, from functions of II1 and II2 to F */ + void copyArgumentNames(IntrinsicInst* II1, + IntrinsicInst* II2, + Function* F); +/* Copy attributes, from functions of II1 and II2 to F */ + void copyAttrList(IntrinsicInst* II1, + IntrinsicInst* II2, + Function* F); +/* Creates and inserts an empty function of the rght type for the fused node */ + Function* createEmptyDFNodeFunction(IntrinsicInst* II1, + IntrinsicInst* II2, + Module &M); +/* Inline first node function, updating required mappings * + * - F1: first node function * + * - M: module containing the node function * + * - Ffused: fused node function * + * - VMap: maps values used in the body of F1 to those that mst be used in * + the body of the fused function instead * + * OutVs: This maps the output struct field index to the stored value */ + void inlineFirstNodeFunction(Module &M, + Function *F1, + Function *Ffused, + 
ValueMap<Value*, Value*> &VMap, + std::vector<Value*> &OutVs); +/* Inline second node function, updating required mappings * + * - F2: second node function * + * - M: module containing the node function * + * - Ffused: fused node function * + * - VMap: maps values used in the body of F2 to those that mst be used in * + the body of the fused function instead */ + void inlineSecondNodeFunction(Module &M, + Function *F2, + Function *Ffused, + ValueMap<Value*, Value*> &VMap); +/* Create function of leaf node after fusion * + * - create type * + * - create empty function of the type * + * - inline body of first function (applying and updating appropriate * + * mappings) * + * - inline body of second function (applying and updating appropriate * + * mappings) */ + Function* createLeafDFNodeFunction(IntrinsicInst* II1, + IntrinsicInst* II2, + Module &M); +/* Updates parent of fused nodes to use the new node intrinsic */ + void updateParentNodeFunction(IntrinsicInst* II1, + IntrinsicInst* II2, + IntrinsicInst* IInew); +/* Performs all operations required at the IR level for fusion of HPVM tensor * + * nodes with intrinsic instructions II1 and II2 * + * - Creates fused node function * + * - Creates createNode intrinsic for it and returns it * + * - Updates parent function: * + * - - adds new intrinsic * + * - - edges and binds consistently use the new intrinsic * + * - Removes old functions */ + IntrinsicInst* FuseHPVMTensorNodesStep(IntrinsicInst* II1, + IntrinsicInst* II2, + Module &M); +/* Fuse node sequence described by creaetNode intrinsics in IIs. * + * Contents of IIs are cleared. 
*/ + void FuseHPVMTensorNodeSequence(std::vector<IntrinsicInst*> &IIs, Module &M); +public: + void run(Module &M, FusionTargets &FTs); + + void printFusionTargets(FusionTargets &FTs); +}; + +// Visitor for finding nodes to fuse +class FindFusionTargetsTraversal : public dfg2llvm::CodeGenTraversal { + +private: + typedef std::map< visc::Target, std::vector< std::vector<Intrinsic::ID> > > + FusePatterns; + //Member variables + + /* Map, from HPVM target to sequences of intrinsic IDs that if found, + need to be fused */ + /* TODO: use this in the future. Current (for PLDI 2018) implementation + * - assumes only two patterns, for PROMISE + * - assumes that nodes belonging to a single pattern only, if any. */ +// FusePatterns FPs; + FuseHPVMTensorNodes::FusionTargets FTs; + //Functions + + // Virtual Functions + void init() {} + void initRuntimeAPI() {} + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + // Constructor + + FindFusionTargetsTraversal(Module &_M, builddfg::BuildDFG &_DFG) : + CodeGenTraversal(_M, _DFG) { +/* FPs[visc::PROMISE_TARGET] = { {Intrinsic::visc_tensor_conv, + Intrinsic::visc_tensor_add, + Intrinsic::visc_tensor_relu, + Intrinsic::visc_tensor_pooling + }, + {Intrinsic::visc_tensor_mul, + Intrinsic::visc_tensor_add, + Intrinsic::visc_tensor_relu + } + } +*/ + } + + FuseHPVMTensorNodes::FusionTargets &getFusionTargets() { + return FTs; + } + +}; + +struct FuseHPVMTensorNodesWrapper : public ModulePass { + static char ID; // Pass identification, replacement for typeid + FuseHPVMTensorNodesWrapper() : ModulePass(ID) {} + +private: + // Member variables + +public: + // Functions + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<builddfg::BuildDFG>(); + } + + bool runOnModule(Module &M); + +}; + +} // End of namespace + +#endif diff --git a/include/GenVISC/GenVISC.h b/include/GenVISC/GenVISC.h new file mode 100644 index 0000000000..fcdb636a05 --- /dev/null +++ b/include/GenVISC/GenVISC.h @@ -0,0 +1,52 @@ 
+//== GenVISC.h - Header file for "LLVM IR to VISC IR Pass" =// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/SupportVISC/VISCTimer.h" + +using namespace llvm; + +namespace genvisc { +// GenVISC - The first implementation. +struct GenVISC : public ModulePass { + static char ID; // Pass identification, replacement for typeid + GenVISC() : ModulePass(ID) {} + + +private: + // Member variables + Module* M; + Constant* llvm_visc_initializeTimerSet; + Constant* llvm_visc_switchToTimer; + Constant* llvm_visc_printTimerSet; + + GlobalVariable* TimerSet; + + // Functions + void initializeTimerSet(Instruction*); + void switchToTimer(enum visc_TimerID, Instruction*); + void printTimerSet(Instruction*); + Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = ""); + +public: + // Functions + virtual bool runOnModule(Module &M); + + void generateTest(CallInst* CI); + Function* genKernel(Function* KernelFunction, CallInst* CI, StructType* RetTy); + void genHost(CallInst*, Function*, unsigned, unsigned, unsigned, unsigned, StructType*); +}; + +} // End of namespace + diff --git a/include/InPlaceDFG/InPlaceDFGAnalysis.h b/include/InPlaceDFG/InPlaceDFGAnalysis.h new file mode 100644 index 0000000000..fc4c7f3ee9 --- /dev/null +++ b/include/InPlaceDFG/InPlaceDFGAnalysis.h @@ -0,0 +1,52 @@ +#ifndef __IN_PLACE_DFG_ANALYSIS_H__ +#define __IN_PLACE_DFG_ANALYSIS_H__ + +//===------------------------- InPlaceDFGAnalysis.h -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open 
Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/DFGraph.h" +#include "llvm/BuildDFG/BuildDFG.h" + +using namespace llvm; + +namespace inplacedfg { + +// InPlaceDFGAnalysis +class InPlaceDFGAnalysis{ +public: + typedef std::map<DFNode*, std::vector<bool> > InPlaceDFGParameter; + + void run(Module &M, builddfg::BuildDFG &DFG, InPlaceDFGParameter &IPP); +}; + +// InPlaceDFGAnalysisWrapper pass for ApproxHPVM - The first implementation. +struct InPlaceDFGAnalysisWrapper : public ModulePass { + static char ID; // Pass identification, replacement for typeid + InPlaceDFGAnalysisWrapper() : ModulePass(ID) {} + +private: + // Member variables + InPlaceDFGAnalysis::InPlaceDFGParameter IPP; + +public: + // Functions + bool runOnModule(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const; + + const InPlaceDFGAnalysis::InPlaceDFGParameter &getIPP(); +}; + +// Helper Functions +void printInPlaceDFGParameter(InPlaceDFGAnalysis::InPlaceDFGParameter &IPP); + +} // End of namespace + +#endif diff --git a/include/SupportVISC/DFG2LLVM.h b/include/SupportVISC/DFG2LLVM.h new file mode 100644 index 0000000000..355fb18570 --- /dev/null +++ b/include/SupportVISC/DFG2LLVM.h @@ -0,0 +1,497 @@ +#ifndef __DFG2LLVM_H__ +#define __DFG2LLVM_H__ + +//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/BuildDFG/BuildDFG.h" +#include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/VISCUtils.h" + +using namespace llvm; +using namespace builddfg; + +#define TIMER(X) do { if (VISCTimer) { X; } } while (0) +#define DECLARE(X) X = M.getOrInsertFunction(#X, \ + runtimeModule->getFunction(#X)->getFunctionType()); \ + DEBUG(errs() << *X) + +namespace dfg2llvm { +// Helper Functions +static inline ConstantInt* getTimerID(Module&, enum visc_TimerID); +static inline ConstantInt* getTimerID(Module&, enum visc::Target); + +bool hasAttribute(Function*, unsigned, Attribute::AttrKind); + +// DFG2LLVM abstract class implementation +class DFG2LLVM : public ModulePass { +protected: + DFG2LLVM(char ID) : ModulePass(ID) {} + + // Member variables + + // Functions + +public: + // Pure Virtual Functions + virtual bool runOnModule(Module &M) = 0; + + // Functions + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); + } + +}; + +// Abstract Visitor for Code generation traversal (tree traversal for now) +class CodeGenTraversal : public DFNodeVisitor { + +protected: + //Member variables + Module &M; + BuildDFG &DFG; + bool VISCTimer = false; + std::string TargetName = "None"; + + // Map from Old function associated with DFNode to new cloned function with + // extra index and dimension arguments. 
This map also serves to find out if + // we already have an index and dim extended function copy or not (i.e., + // "Have we visited this function before?") + DenseMap<DFNode*, Value*> OutputMap; + + // VISC Runtime API + std::unique_ptr<Module> runtimeModule; + + Constant* llvm_visc_initializeTimerSet; + Constant* llvm_visc_switchToTimer; + Constant* llvm_visc_printTimerSet; + GlobalVariable* TimerSet; + GlobalVariable* GraphIDAddr; + Instruction* InitCall; + Instruction* CleanupCall; + + + // Functions + Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = ""); +// void addArgument(Function*, Type*, const Twine& Name = ""); + Function *addArgument(Function*, Type*, const Twine& Name = ""); +// void addIdxDimArgs(Function* F); + Function *addIdxDimArgs(Function* F); + std::vector<Value*> extractElements(Value*, std::vector<Type*>, + std::vector<std::string>, Instruction*); + Argument* getArgumentAt(Function* F, unsigned offset); + void initTimerAPI(); + + // Pure Virtual Functions + virtual void init() = 0; + virtual void initRuntimeAPI() = 0; + virtual void codeGen(DFInternalNode* N) = 0; + virtual void codeGen(DFLeafNode* N) = 0; + + // Virtual Functions + virtual void initializeTimerSet(Instruction*); + virtual void switchToTimer(enum visc_TimerID, Instruction*); + virtual void printTimerSet(Instruction*); + + virtual ~CodeGenTraversal() {} + + +public: + + // Constructor + CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} + + static bool checkPreferredTarget(DFNode* N, visc::Target T); + static bool preferredTargetIncludes(DFNode* N, visc::Target T); + visc::Target getPreferredTarget(DFNode *N); + + virtual void visit(DFInternalNode* N) { + // If code has already been generated for this internal node, skip the + // children + if(N->getGenFunc() != NULL) + return; + + errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"; + + // Follows a bottom-up approach for code 
generation. + // First generate code for all the child nodes + for(DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); i != e; ++i) { + DFNode* child = *i; + child->applyDFNodeVisitor(*this); + } + // Generate code for this internal node now. This way all the cloned + // functions for children exist. + codeGen(N); + errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"; + } + + virtual void visit(DFLeafNode* N) { + errs() << "Start: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"; + codeGen(N); + errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"; + } +}; + +// -------------- CodeGenTraversal Implementation ----------------- + +bool CodeGenTraversal::checkPreferredTarget(DFNode* N, visc::Target T) { + Function* F = N->getFuncPointer(); + Module* M = F->getParent(); + NamedMDNode* HintNode; + switch (T) { + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::SPIR_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + break; + case visc::CUDNN_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); + break; + case visc::PROMISE_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); + break; + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + default: + llvm_unreachable("Target Not supported yet!"); + } + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return true; + } + return false; +} + +visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) { + + Function* F = N->getFuncPointer(); + Module* M = F->getParent(); + NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + for (unsigned i = 0; i < 
HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::GPU_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::SPIR_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CUDNN_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::PROMISE_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_OR_GPU_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir"); + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MetaNode = HintNode->getOperand(i); + Value* FHint = 
dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_OR_SPIR_TARGET; + } + + return visc::None; +} + +bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) { + + Function* F = N->getFuncPointer(); + Module* M = F->getParent(); + std::vector<NamedMDNode *> HintNode; + switch (T) { + case visc::GPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + break; + case visc::SPIR_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + break; + case visc::CPU_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu")); + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir")); + break; + case visc::CUDNN_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn")); + break; + case visc::PROMISE_TARGET: + HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise")); + break; + case visc::CPU_OR_GPU_TARGET: + case visc::CPU_OR_SPIR_TARGET: + assert(false && "Target should be one of CPU/GPU/SPIR\n"); + break; + default: + llvm_unreachable("Target Not supported yet!"); + } + + for (unsigned h = 0; h < HintNode.size(); h++) { + for (unsigned i = 0; i < HintNode[h]->getNumOperands(); i++) { + MDNode *MetaNode = HintNode[h]->getOperand(i); + Value *FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue(); + if (F == FHint) + return true; + } + } + + return false; +} + + +// Generate Code for declaring a constant string [L x i8] and return a pointer +// to the start of it. 
+Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { + Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true); + Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true, + GlobalValue::InternalLinkage, SConstant, Name); + Value* Zero = ConstantInt::get(Type::getInt64Ty(M.getContext()), 0); + Value* GEPArgs[] = {Zero, Zero}; + GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal, + ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); + return SPtr; +} + +// Add an argument of type Ty to the given function F +//void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) { +// // Add the argument to argument list +// new Argument(Ty, name, F); +// +// // Create the argument type list with added argument types +// std::vector<Type*> ArgTypes; +// for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); +// ai != ae; ++ai) { +// ArgTypes.push_back(ai->getType()); +// } +// // Adding new arguments to the function argument list, would not change the +// // function type. We need to change the type of this function to reflect the +// // added arguments +// FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); +// PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace()); +// +// // Change the function type +// F->mutateType(PTy); +//} + +// Creates a function with an additional argument of the specified type and +// name. The previous function is not deleted. 
+Function *CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) { + // Add the argument to argument list + new Argument(Ty, name, F); + + // Create the argument type list with added argument types + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments. So, we create a clone of this function with the correct + // type. + FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); + Function *newF = viscUtils::cloneFunction(F, FTy, false); + + // Check if the function is used by a metadata node + if(F->isUsedByMetadata()) { + viscUtils::fixHintMetadata(*F->getParent(), F, newF); + } + + return newF; +} + +// Change the argument list of function F to add index and limit arguments +//void CodeGenTraversal::addIdxDimArgs(Function* F) { +// // Add Index and Dim arguments +// std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"}; +// for (int i = 0; i < 6; ++i) { +// addArgument(F, Type::getInt32Ty(F->getContext()), names[i]); +// } +//} + +// Return new function with additional index and limit arguments. +// The original function is removed from the module and erased. 
+Function *CodeGenTraversal::addIdxDimArgs(Function* F) { + errs() << "Function Type: " << *F->getFunctionType() << "\n"; + // Add Index and Dim arguments + std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"}; + Function *newF; + for (int i = 0; i < 6; ++i) { + newF = addArgument(F, Type::getInt64Ty(F->getContext()), names[i]); + F->replaceAllUsesWith(UndefValue::get(F->getType())); + F->eraseFromParent(); + F = newF; + } + errs() << "Function Type after adding args: " << *newF->getFunctionType() << "\n"; + return newF; +} + +// Extract elements from an aggregate value. TyList contains the type of each +// element, and names vector contains a name. IB is the instruction before which +// all the generated code would be inserted. +std::vector<Value*> CodeGenTraversal::extractElements(Value* Aggregate, + std::vector<Type*> TyList, std::vector<std::string> names, Instruction* IB) { + // Extract input data from i8* Aggregate.addr and store them in a vector. + // For each argument + std::vector<Value*> Elements; + GetElementPtrInst* GEP; + unsigned argNum = 0; + for(Type* Ty: TyList) { + // BitCast: %arg.addr = bitcast i8* Aggregate.addr to <pointer-to-argType> + CastInst* BI = BitCastInst::CreatePointerCast(Aggregate, + Ty->getPointerTo(), + names[argNum]+".addr", + IB); + // Load: %arg = load <pointer-to-argType> %arg.addr + LoadInst* LI = new LoadInst(BI, names[argNum], IB); + // Patch argument to call instruction + Elements.push_back(LI); + //errs() << "Pushing element " << *LI << "\n"; + //CI->setArgOperand(argNum, LI); + + // TODO: Minor Optimization - The last GEP statement can/should be left out + // as no more arguments left + // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, i64 1 + // This essentially takes us to the next argument in memory + Constant* IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1); + if (argNum < TyList.size()-1) + GEP = GetElementPtrInst::Create(nullptr, BI, + 
ArrayRef<Value*>(IntOne), + "nextArg", + IB); + // Increment argNum and for the next iteration use result of this GEP to + // extract next argument + argNum++; + Aggregate = GEP; + } + return Elements; +} + +// Traverse the function F argument list to get argument at offset +Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) { + DEBUG(errs() << "Finding argument " << offset << ":\n"); + assert((F->getFunctionType()->getNumParams() > offset && offset >= 0) + && "Invalid offset to access arguments!"); + Argument* arg; + Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); + for(; offset != 0 && i!=e; i++) { + offset--; + } + arg = &*i; + DEBUG(errs() << "\t" << *arg <<"\n"); + return arg; +} + +void CodeGenTraversal::initTimerAPI() { + DECLARE(llvm_visc_initializeTimerSet); + DECLARE(llvm_visc_switchToTimer); + DECLARE(llvm_visc_printTimerSet); +} + +// Timer Routines +// Initialize the timer set +void CodeGenTraversal::initializeTimerSet(Instruction* InsertBefore) { + DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n"); + TIMER(TimerSet = new GlobalVariable(M, + Type::getInt8PtrTy(M.getContext()), + false, + GlobalValue::CommonLinkage, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + Twine("viscTimerSet_")+TargetName); + errs() << "New global variable: " << *TimerSet << "\n"; + + Value* TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, + None, + "", + InsertBefore); + new StoreInst(TimerSetAddr, TimerSet, InsertBefore); + ); +} + +void CodeGenTraversal::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) { + Value* switchArgs[] = {TimerSet, getTimerID(M, timer)}; + TIMER(CallInst::Create(llvm_visc_switchToTimer, + ArrayRef<Value*>(switchArgs, 2), + "", + InsertBefore)); +} + +void CodeGenTraversal::printTimerSet(Instruction* InsertBefore) { + Value* TimerName; + TIMER(TimerName = getStringPointer(TargetName+Twine("_Timer"), InsertBefore)); + Value* printArgs[] = 
{TimerSet, TimerName}; + TIMER(CallInst::Create(llvm_visc_printTimerSet, + ArrayRef<Value*>(printArgs, 2), + "", + InsertBefore)); +} + +// Implementation of Helper Functions +static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) { + return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); +} + +static inline ConstantInt* getTargetID(Module& M, enum visc::Target T) { + return ConstantInt::get(Type::getInt32Ty(M.getContext()), T); +} + +// Find if argument has the given attribute +bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) { + return F->getAttributes().hasAttribute(arg_index+1, AK); +} + +} // End of namespace + +#endif + diff --git a/include/SupportVISC/DFGTreeTraversal.h b/include/SupportVISC/DFGTreeTraversal.h new file mode 100644 index 0000000000..c031c112fe --- /dev/null +++ b/include/SupportVISC/DFGTreeTraversal.h @@ -0,0 +1,64 @@ +#ifndef __DFGTREETRAVERSAL_H__ +#define __DFGTREETRAVERSAL_H__ + +//=== DFGTreeTraversal.h - Header file for Tree Traversal of the HPVM DFG ====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/BuildDFG/BuildDFG.h" + +using namespace llvm; +using namespace builddfg; + +namespace dfg2llvm { + + class DFGTreeTraversal : public DFNodeVisitor { + + protected: + //Member variables + Module &M; + BuildDFG &DFG; + + virtual void process(DFInternalNode* N) = 0; + virtual void process(DFLeafNode* N) = 0; + + virtual ~DFGTreeTraversal() {} + + public: + // Constructor + DFGTreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {} + + void visit(DFInternalNode* N) { + // May visit a nodemore than once, there is no marking it as visited + errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName() << "\n"; + + // Follows a bottom-up approach. + for (DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); i != e; ++i) { + DFNode* child = *i; + child->applyDFNodeVisitor(*this); + } + + // Process this internal node now. + process(N); + errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName() << "\n"; + } + + void visit(DFLeafNode* N) { + errs() << "Start: In Node (L) - " << N->getFuncPointer()->getName() << "\n"; + process(N); + errs() << "DONE: In Node (L) - " << N->getFuncPointer()->getName() << "\n"; + } + }; + +} // end namespace dfg2llvm + +#endif diff --git a/include/SupportVISC/VISCHint.h b/include/SupportVISC/VISCHint.h new file mode 100644 index 0000000000..5324c0fabd --- /dev/null +++ b/include/SupportVISC/VISCHint.h @@ -0,0 +1,35 @@ +//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef VISC_HINT_HEADER +#define VISC_HINT_HEADER + +/************************** Hint Routines ***************************/ +#ifdef __cplusplus +namespace visc { +#endif + + enum Target { + None, + CPU_TARGET, + GPU_TARGET, + SPIR_TARGET, + CUDNN_TARGET, + PROMISE_TARGET, + CPU_OR_GPU_TARGET, + CPU_OR_SPIR_TARGET, +// ALL_TARGETS, + NUM_TARGETS + }; + +#ifdef __cplusplus +} +#endif + +#endif //VISC_HINT_HEADER diff --git a/include/SupportVISC/VISCTimer.h b/include/SupportVISC/VISCTimer.h new file mode 100644 index 0000000000..4dbadbd34f --- /dev/null +++ b/include/SupportVISC/VISCTimer.h @@ -0,0 +1,159 @@ +//===------------ VISCTimer.h - Header file for "VISC Timer API" ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef VISC_TIMER_HEADER +#define VISC_TIMER_HEADER + +/************************** Timer Routines ***************************/ +extern "C" { + +/* A time or duration. */ +//#if _POSIX_VERSION >= 200112L +typedef unsigned long long visc_Timestamp; /* time in microseconds */ +//#else +//# error "Timestamps not implemented" +//#endif + +enum visc_TimerState { + visc_Timer_STOPPED, + visc_Timer_RUNNING, +}; + +struct visc_Timer { + enum visc_TimerState state; + visc_Timestamp elapsed; /* Amount of time elapsed so far */ + visc_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interfal otherwise. */ +}; + +/* Reset a timer. + * Use this to initialize a timer or to clear + * its elapsed time. The reset timer is stopped. + */ +void +visc_ResetTimer(struct visc_Timer *timer); + +/* Start a timer. 
The timer is set to RUNNING mode and + * time elapsed while the timer is running is added to + * the timer. + * The timer should not already be running. + */ +void +visc_StartTimer(struct visc_Timer *timer); + +/* Stop a timer. + * This stops adding elapsed time to the timer. + * The timer should not already be stopped. + */ +void +visc_StopTimer(struct visc_Timer *timer); + +/* Get the elapsed time in seconds. */ +double +visc_GetElapsedTime(struct visc_Timer *timer); + +/* Execution time is assigned to one of these categories. */ +enum visc_TimerID { + visc_TimerID_NONE = 0, + visc_TimerID_IO, /* Time spent in input/output */ + visc_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + visc_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + visc_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + visc_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + visc_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ + // GPU FUNCTION + visc_TimerID_INIT_CTX, + visc_TimerID_CLEAR_CTX, + visc_TimerID_COPY_SCALAR, + visc_TimerID_COPY_PTR, + visc_TimerID_MEM_FREE, + visc_TimerID_READ_OUTPUT, + visc_TimerID_SETUP, + visc_TimerID_MEM_TRACK, + visc_TimerID_MEM_UNTRACK, + visc_TimerID_MISC, + // LAUNCH FUNCTION + visc_TimerID_PTHREAD_CREATE, + visc_TimerID_ARG_PACK, + visc_TimerID_ARG_UNPACK, + visc_TimerID_COMPUTATION, + visc_TimerID_OUTPUT_PACK, + visc_TimerID_OUTPUT_UNPACK, + + visc_TimerID_LAST /* Number of timer IDs */ +}; + +/* Dynamic list of asynchronously tracked times between events */ +struct visc_async_time_marker_list { + 
char *label; // actually just a pointer to a string + enum visc_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void * marker; + //cudaEvent_t marker; /* The driver event for this marker */ + struct visc_async_time_marker_list *next; +}; + +struct visc_SubTimer { + char *label; + struct visc_Timer timer; + struct visc_SubTimer *next; +}; + +struct visc_SubTimerList { + struct visc_SubTimer *current; + struct visc_SubTimer *subtimer_list; +}; + +/* A set of timers for recording execution times. */ +struct visc_TimerSet { + enum visc_TimerID current; + struct visc_async_time_marker_list* async_markers; + visc_Timestamp async_begin; + visc_Timestamp wall_begin; + struct visc_Timer timers[visc_TimerID_LAST]; + struct visc_SubTimerList *sub_timer_list[visc_TimerID_LAST]; +}; + +/* Reset all timers in the set. */ +void +visc_InitializeTimerSet(struct visc_TimerSet *timers); + +void +visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category); + +/* Select which timer the next interval of time should be accounted + * to. The selected timer is started and other timers are stopped. + * Using visc_TimerID_NONE stops all timers. */ +inline void +visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer); + +void +visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category); + +/* Print timer values to standard output. 
*/ +void +visc_PrintTimerSet(struct visc_TimerSet *timers); + +/* Release timer resources */ +void +visc_DestroyTimerSet(struct visc_TimerSet * timers); + +} +#endif //VISC_RT_HEADER diff --git a/include/SupportVISC/VISCUtils.h b/include/SupportVISC/VISCUtils.h new file mode 100644 index 0000000000..a20ce8bccd --- /dev/null +++ b/include/SupportVISC/VISCUtils.h @@ -0,0 +1,601 @@ +// +//===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef VISC_UTILS_HEADER +#define VISC_UTILS_HEADER + +#include <assert.h> + +#include "llvm/IR/Module.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/SupportVISC/VISCHint.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +using namespace llvm; + +namespace viscUtils { +// Helper Functions + +static bool isViscCreateNodeIntrinsic(Instruction* I) { + if(!isa<IntrinsicInst>(I)) + return false; + IntrinsicInst* II = cast<IntrinsicInst>(I); + return (II->getCalledFunction()->getName()).startswith("llvm.visc.createNode"); +} + +static bool isViscCreateNodeCall(Instruction* I) { + if(!isa<CallInst>(I)) + return false; + CallInst* CI = cast<CallInst>(I); + return (CI->getCalledValue()->stripPointerCasts()->getName()).startswith("__visc__createNode"); +} + +static bool isViscLaunchCall(Instruction* I) { + if(!isa<CallInst>(I)) + return false; + CallInst* CI = cast<CallInst>(I); + return 
(CI->getCalledValue()->stripPointerCasts()->getName()).startswith("__visc__launch"); +} +// Creates a new createNode intrinsic, similar to II but with different +// associated function F instead +IntrinsicInst* createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function* F, + IntrinsicInst* II) { + Module* M = F->getParent(); + + // Find which createNode intrinsic we need to create + Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID()); + Constant* Fp = ConstantExpr::getPointerCast(F, + Type::getInt8PtrTy(II->getContext())); + + ArrayRef<Value*> CreateNodeArgs; + switch (II->getIntrinsicID()) { + case Intrinsic::visc_createNode: + { + CreateNodeArgs = ArrayRef<Value*>(Fp); + break; + } + case Intrinsic::visc_createNode1D: + { + Value* CreateNode1DArgs[] = {Fp, II->getArgOperand(1)}; + CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2); + break; + } + case Intrinsic::visc_createNode2D: + { + Value* CreateNode2DArgs[] = {Fp, II->getArgOperand(1), + II->getArgOperand(2)}; + CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3); + break; + } + case Intrinsic::visc_createNode3D: + { + Value* CreateNode3DArgs[] = {Fp, II->getArgOperand(1), + II->getArgOperand(2), + II->getArgOperand(3)}; + CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4); + break; + } + default : + assert(false && "Unknown createNode intrinsic"); + break; + } + + CallInst* CI = CallInst::Create(CreateNodeF, + CreateNodeArgs, + F->getName()+".node"); + IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI); + return CreateNodeII; +} +/* +CallInst* createIdenticalCreateNodeCallWithDifferentFunction(Function* F, + CallInst* CI) { + + // Find which createNode function call we need to create + Function* CreateNodeF = CI->getCalledValue(); + + ArrayRef<Value*> CreateNodeArgs; + if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode")) { + // This is a createNode call + CreateNodeArgs = ArrayRef<Value*>(CreateNodeF); + } else if 
((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode1D")) { + // This is a createNode1D call + Value* CreateNode1DArgs[] = {CreateNodeF, CI->getArgOperand(1)}; + CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2); + } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode2D")) { + // This is a createNode2D call + Value* CreateNode2DArgs[] = {CreateNodeF, + CI->getArgOperand(1), + CI->getArgOperand(2)}; + CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3); + } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode3D")) { + // This is a createNode3D call + Value* CreateNode3DArgs[] = {CreateNodeF, + CI->getArgOperand(1), + CI->getArgOperand(2), + CI->getArgOperand(3)}; + CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4); + } else { + assert(false && "Unknown createNode call"); + } + + CallInst* newCI = CallInst::Create(CreateNodeF, + CreateNodeArgs, + F->getName()+".cncall"); + return newCI; +} +*/ + +// Fix VISC hints for this function +void fixHintMetadata(Module &M, Function* F, Function* G) { + Metadata* MD_F = ValueAsMetadata::getIfExists(F); + MDTuple* MDT_F = MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata*>(MD_F)); + DEBUG(errs() << "Associated Metadata: " << *MDT_F << "\n"); + MDTuple* MDT_G = MDNode::get(F->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(G))); + DEBUG(errs() << "New Metadata: " << *MDT_G << "\n"); + + NamedMDNode* HintNode = M.getOrInsertNamedMetadata("visc_hint_gpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + HintNode = M.getOrInsertNamedMetadata("visc_hint_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + HintNode = M.getOrInsertNamedMetadata("visc_hint_cudnn"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + 
if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + HintNode = M.getOrInsertNamedMetadata("visc_hint_promise"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + HintNode = M.getOrInsertNamedMetadata("visc_hint_cpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + HintNode = M.getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } + HintNode = M.getOrInsertNamedMetadata("visc_hint_cpu_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + if(HintNode->getOperand(i) == MDT_F) + HintNode->setOperand(i, MDT_G); + } +} + +// Assuming that the changed function is a node function, it is only used as a +// first operand of createNode*. It is enough to iterate through all createNode* +// calls in the program. +void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) { + + for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) { + Function* f = &*mi; + DEBUG(errs() << "Function: " << f->getName() << "\n"); + + std::vector<Instruction*> toBeErased; + + for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) { + Instruction* I = &*i; // Grab pointer to Instruction + + if (isViscCreateNodeIntrinsic(I)) { + IntrinsicInst* II = cast<IntrinsicInst>(I); + // The found createNode is not associated with the changed function + if (II->getArgOperand(0) != F) + continue; // skip it + + // Otherwise, create a new createNode similar to the other one, + // but with the changed function as first operand. 
+ IntrinsicInst* CreateNodeII = + createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II); + II->replaceAllUsesWith(CreateNodeII); + toBeErased.push_back(II); + } else if (isViscCreateNodeCall(I)) { + CallInst* CI = cast<CallInst>(I); + // The found createNode is not associated with the changed function + if (CI->getArgOperand(1) != F) + continue; // skip it + + DEBUG(errs() << "Fixing use: " << *CI << "\n"); + DEBUG(errs() << "in function: " << f->getName() << "\n"); + // Replace use of F with use of G + CI->setArgOperand(1, G); + DEBUG(errs() << "Fixed use: " << *CI << "\n"); + } else if(isViscLaunchCall(I)) { + CallInst* CI = cast<CallInst>(I); + // The found launch call is not associated with the changed function + if (CI->getArgOperand(1)->stripPointerCasts() != F) + continue; + + // Otherwise, replace F with G (type dumps kept under DEBUG only) + DEBUG(errs() << *G->getType() << "\n"); + DEBUG(errs() << *CI->getArgOperand(1)->getType() << "\n"); + CI->setArgOperand(1, G); + } + + } + + for(auto I: toBeErased) { + DEBUG(errs() << "\tErasing " << *I << "\n"); + I->eraseFromParent(); + } + } + + // Check if the function is used by a metadata node + if(F->isUsedByMetadata()) { + fixHintMetadata(M, F, G); + } + DEBUG(errs() << "DONE: Replacing function " << F->getName() << " with " << G->getName() << "\n"); + + // Remove replaced function from the module + //assert(F->user_empty() && "Still some uses of older function left\n"); + F->replaceAllUsesWith(UndefValue::get(F->getType())); + F->eraseFromParent(); + +} + + +// Create new function F' as a copy of old function F with a new signature. +// The following two most used cases are handled by this function. +// 1. When some extra arguments need to be added to this function +// - Here we can map the old function arguments to +// new ones +// 2. When each pointer argument needs an additional size argument +// - Here, in the absence of VMap, we map the arguments in order, skipping +// over extra pointer arguments. 
+// The function returns the list of return instructions to the caller to fix in +// case the return type is also changed. +Function* cloneFunction(Function* F, FunctionType* newFT, bool + isAddingPtrSizeArg, SmallVectorImpl<ReturnInst*>* Returns = NULL) { + + DEBUG(errs() << "Cloning Function: " << F->getName() << "\n"); + DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n"); + DEBUG(errs() << "New Function Type: " << *newFT << "\n"); + + assert(F->getFunctionType()->getNumParams() <= newFT->getNumParams() + && "This function assumes that the new function has more arguments than the old function!"); + + // Create Function of specified type + Function* newF = Function::Create(newFT, F->getLinkage(), F->getName()+"_cloned", F->getParent()); + DEBUG(errs() << "Old Function name: " << F->getName() << "\n"); + DEBUG(errs() << "New Function name: " << newF->getName() << "\n"); + ValueToValueMapTy VMap; + DEBUG(errs() << "No value map provided. Creating default value map\n"); + if(isAddingPtrSizeArg) { + DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in new function\n"); + Function::arg_iterator new_ai = newF->arg_begin(); + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n"); + assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + VMap[&*ai] = &*new_ai; + new_ai->takeName(&*ai); + if(ai->getType()->isPointerTy()) { + std::string oldName = new_ai->getName(); + // If the current argument is pointer type, the next argument in new + // function would be an i64 type containing the data size of this + // argument. Hence, skip the next argument in new function. 
+ ++new_ai; + new_ai->setName("bytes_"+oldName); + } + ++new_ai; + } + } + else { + DEBUG(errs() << "Case 2: Extra arguments are added at the end of old function\n"); + Function::arg_iterator new_ai = newF->arg_begin(); + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai, ++new_ai) { + DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n"); + assert(ai->getType() == new_ai->getType() && "Arguments type do not match!"); + VMap[&*ai] = &*new_ai; + new_ai->takeName(&*ai); + } + } + + // Clone function + if (Returns == NULL) + Returns = new SmallVector<ReturnInst*, 8>(); + CloneFunctionInto(newF, F, VMap, false, *Returns); + + return newF; +} + + //------------------- Helper Functions For Handling Hints -------------------// + +// Return true if 1st arg (tag) contains 2nd (target) +bool tagIncludesTarget(visc::Target Tag, visc::Target T) { + switch (Tag) { + case visc::None: + return false; + case visc::CPU_TARGET: + if (T == visc::CPU_TARGET) + return true; + else + return false; + case visc::GPU_TARGET: + if (T == visc::GPU_TARGET) + return true; + else + return false; + case visc::SPIR_TARGET: + if (T == visc::SPIR_TARGET) + return true; + else + return false; + case visc::CUDNN_TARGET: + if (T == visc::CUDNN_TARGET) + return true; + else + return false; + case visc::PROMISE_TARGET: + if (T == visc::PROMISE_TARGET) + return true; + else + return false; + case visc::CPU_OR_GPU_TARGET: + if ((T == visc::CPU_TARGET) || + (T == visc::GPU_TARGET) || + (T == visc::CPU_OR_GPU_TARGET)) + return true; + else + return false; + case visc::CPU_OR_SPIR_TARGET: + if ((T == visc::CPU_TARGET) || + (T == visc::SPIR_TARGET) || + (T == visc::CPU_OR_SPIR_TARGET)) + return true; + else + return false; + default: + assert(false && "Unknown Target\n"); + } +} + +bool isSingleTargetTag(visc::Target T) { + return ((T == visc::CPU_TARGET) || + (T == visc::GPU_TARGET) || + (T == visc::SPIR_TARGET) || + (T == visc::CUDNN_TARGET) || + (T == 
visc::PROMISE_TARGET)); +} + +// Add the specified single target T to the given tag and return the combined tag +visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) { + assert(((T == visc::CPU_TARGET) || + (T == visc::GPU_TARGET) || + (T == visc::SPIR_TARGET) || + (T == visc::CUDNN_TARGET) || + (T == visc::PROMISE_TARGET)) && + "The target is only allowed to be a single target: CPU, GPU, SPIR, CUDNN, PROMISE\n"); + + switch (Tag) { + case visc::None: + return T; + case visc::CPU_TARGET: + assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) && + "Unsupported target combination\n"); + if (T == visc::CPU_TARGET) + return visc::CPU_TARGET; + if (T == visc::GPU_TARGET) + return visc::CPU_OR_GPU_TARGET; + if (T == visc::SPIR_TARGET) + return visc::CPU_OR_SPIR_TARGET; + break; + case visc::GPU_TARGET: + assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n"); + assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) && + "Unsupported target combination\n"); + if (T == visc::CPU_TARGET) + return visc::CPU_OR_GPU_TARGET; + if (T == visc::GPU_TARGET) + return visc::GPU_TARGET; + break; + case visc::SPIR_TARGET: + assert((T != visc::GPU_TARGET) && "Unsupported target combination\n"); + assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) && + "Unsupported target combination\n"); + if (T == visc::CPU_TARGET) + return visc::CPU_OR_SPIR_TARGET; + if (T == visc::SPIR_TARGET) + return visc::SPIR_TARGET; + break; + case visc::CPU_OR_GPU_TARGET: + assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) && + "Unsupported target combination\n"); + assert((T != visc::SPIR_TARGET) && "Unsupported target combination\n"); + return visc::CPU_OR_GPU_TARGET; + break; + case visc::CPU_OR_SPIR_TARGET: + assert((T != visc::CUDNN_TARGET) && (T != visc::PROMISE_TARGET) && + "Unsupported target combination\n"); + assert((T != visc::GPU_TARGET) && "Unsupported target combination\n"); + return visc::CPU_OR_SPIR_TARGET; + break; + default: + assert(false && "Unknown 
Target\n"); + } +} + +// This functions add the hint as metadata in visc code +void addHint(Function* F, visc::Target T) { + // Get Module + Module* M = F->getParent(); + DEBUG(errs() << "Set preferred target for " << F->getName() << ": "); + + // Based on the hint, get the hint metadata + NamedMDNode* HintNode; + switch (T) { + case visc::GPU_TARGET: + DEBUG(errs() << "GPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::SPIR_TARGET: + DEBUG(errs() << "SPIR Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + break; + case visc::CUDNN_TARGET: + DEBUG(errs() << "CUDNN Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); + break; + case visc::PROMISE_TARGET: + DEBUG(errs() << "PROMISE Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); + break; + case visc::CPU_TARGET: + DEBUG(errs() << "CPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + case visc::CPU_OR_GPU_TARGET: + DEBUG(errs() << "CPU or GPU Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + break; + case visc::CPU_OR_SPIR_TARGET: + DEBUG(errs() << "CPU or SPIR Target\n"); + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir"); + break; + default: + llvm_unreachable("Unsupported Target Hint!"); + break; + } + + // Create a node for the function and add it to the hint node + MDTuple* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F))); + HintNode->addOperand(N); +} + +// This function removes the hint as metadata in visc code +void removeHint(Function* F, visc::Target T) { + // Get Module + Module* M = F->getParent(); + DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T << "\n"); + + // Based on the hint, get the hint metadata + NamedMDNode* HintNode; + switch (T) { + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::SPIR_TARGET: + 
HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + break; + case visc::CUDNN_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); + break; + case visc::PROMISE_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); + break; + case visc::CPU_OR_GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + break; + case visc::CPU_OR_SPIR_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir"); + break; + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + default: + llvm_unreachable("Unsupported Target Hint!"); + break; + } + + // Gather metadata nodes, and keep those not associated with this function + MDNode* N = MDNode::get(M->getContext(), + ArrayRef<Metadata*>(ValueAsMetadata::get(F))); + std::vector<MDNode*> MDNodes; + + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MDN = HintNode->getOperand(i); + if (MDN == N) { + continue; + } + MDNodes.push_back(MDN); + } + + HintNode->dropAllReferences(); + + for (unsigned i = 0; i < MDNodes.size(); i++) { + HintNode->addOperand(MDNodes[i]); + } + +} + +visc::Target getPreferredTarget(Function* F) { + DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); + Module* M = F->getParent(); + NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::GPU_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::SPIR_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn"); + for(unsigned i = 0; i < 
HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CUDNN_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_promise"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::PROMISE_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_OR_GPU_TARGET; + } + + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_OR_SPIR_TARGET; + } + return visc::CPU_TARGET; +} + + +} // End of namespace + +#endif //VISC_UTILS_HEADER diff --git a/lib/BuildDFG/BuildDFG.cpp b/lib/BuildDFG/BuildDFG.cpp new file mode 100644 index 0000000000..04b01e332b --- /dev/null +++ b/lib/BuildDFG/BuildDFG.cpp @@ -0,0 +1,395 @@ +//=== BuildDFG.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "buildDFG" +#include "llvm/BuildDFG/BuildDFG.h" + +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" +#include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCUtils.h" + +using namespace llvm; + +namespace builddfg { + +bool BuildDFG::runOnModule(Module &M) { + errs() << "\nBUILDDFG PASS\n"; + DEBUG(errs() << "-------- Searching for launch sites ----------\n"); + + IntrinsicInst* II; + + // Iterate over all functions in the module + for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) { + Function* f = &*mi; + DEBUG(errs() << "Function: " << f->getName() << "\n"); + + for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) { + Instruction* I = &*i; // Grab pointer to Instruction + if (isViscLaunchIntrinsic(I)) { + DEBUG(errs() << "------------ Found launch site --------------\n"); + II = cast<IntrinsicInst>(I); + + assert(II && "Launch intrinsic not recognized."); + + // Intrinsic Instruction has been initialized from this point on. 
+ Function* F = cast<Function>(II->getOperand(0)->stripPointerCasts()); + Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F)); + Roots.push_back(Root); + BuildGraph(Root, F); + + for(DFGraph::children_iterator i = Root->getChildGraph()->begin(), + e = Root->getChildGraph()->end(); i!=e; i++) { + DFNode* N = *i; + DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n"); + } + Root->getChildGraph()->sortChildren(); + for(DFGraph::children_iterator i = Root->getChildGraph()->begin(), + e = Root->getChildGraph()->end(); i!=e; i++) { + DFNode* N = *i; + DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n"); + } + viewDFGraph(Root->getChildGraph()); + + } + } + } + + // Checking that we found at least one launch site + assert((Roots.size() != 0) && "Launch site not found."); + + return false; //TODO: What does returning "false" mean? +} + +DFInternalNode *BuildDFG::getRoot() const { + return Root; +} + +std::vector<DFInternalNode*> &BuildDFG::getRoots() { + return Roots; +} + +//TODO: Maybe make this const +BuildDFG::HandleToDFNode &BuildDFG::getHandleToDFNodeMap() { + return HandleToDFNodeMap; +} + +//TODO: Maybe make this const +BuildDFG::HandleToDFEdge &BuildDFG::getHandleToDFEdgeMap() { + return HandleToDFEdgeMap; +} + +void BuildDFG::addElementToHandleToDFNodeMap(Value* V, DFNode* N) { + assert((HandleToDFNodeMap.find(V) == HandleToDFNodeMap.end()) && + "Attempted to insert duplicate key in HandleToDFNodeMap"); + HandleToDFNodeMap.insert(std::pair<Value*, DFNode*>(V,N)); +} + +//TODO: check if the removed element was not there +void BuildDFG::removeElementFromHandleToDFNodeMap(Value* V) { + HandleToDFNodeMap.erase(V); +} + +void BuildDFG::addElementToHandleToDFEdgeMap(Value* V, DFEdge* E) { + assert((HandleToDFEdgeMap.find(V) == HandleToDFEdgeMap.end()) && + "Attempted to insert duplicate key in HandleToDFEdgeMap"); + HandleToDFEdgeMap.insert(std::pair<Value*, DFEdge*>(V,E)); +} + +//TODO: check if the removed element was not 
there +void BuildDFG::removeElementFromHandleToDFEdgeMap(Value* V) { + HandleToDFEdgeMap.erase(V); +} + +// Returns true if instruction I is a visc launch intrinsic, false otherwise +bool BuildDFG::isViscLaunchIntrinsic(Instruction* I) { + if(!isa<IntrinsicInst>(I)) + return false; + IntrinsicInst* II = cast<IntrinsicInst>(I); + return (II->getCalledFunction()->getName()).equals("llvm.visc.launch"); +} + +// Returns true if instruction I is a visc graph intrinsic, false otherwise +bool BuildDFG::isViscGraphIntrinsic(Instruction* I) { + if(!isa<IntrinsicInst>(I)) + return false; + IntrinsicInst* II = cast<IntrinsicInst>(I); + return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") + || (II->getCalledFunction()->getName()).startswith("llvm.visc.bind"); +} + +// Returns true if instruction I is a visc query intrinsic, false otherwise +bool BuildDFG::isViscQueryIntrinsic(Instruction* I) { + if(!isa<IntrinsicInst>(I)) + return false; + IntrinsicInst* II = cast<IntrinsicInst>(I); + return (II->getCalledFunction()->getName()).startswith("llvm.visc.get"); +} + +// Returns true if instruction I is a visc intrinsic, false otherwise +bool BuildDFG::isViscIntrinsic(Instruction* I) { + if(!isa<IntrinsicInst>(I)) + return false; + IntrinsicInst* II = cast<IntrinsicInst>(I); + return (II->getCalledFunction()->getName()).startswith("llvm.visc"); +} + +// Two types are "congruent" if they are identical, or if they are both +// pointer types with different pointee types and the same address space. +bool BuildDFG::isTypeCongruent(Type* L, Type* R) { + if(L == R) + return true; + PointerType *PL = dyn_cast<PointerType>(L); + PointerType *PR = dyn_cast<PointerType>(R); + if (!PL || !PR) + return false; + return PL->getAddressSpace() == PR->getAddressSpace(); +} + +// Handles all the createNodeXX visc intrinsics. 
+void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) { + bool isInternalNode = false; + + Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts()); + + // Check if the function associated with this intrinsic is a leaf or + // internal node + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction* I = &*i; // Grab pointer to Instruction + if (isViscGraphIntrinsic(I)) + isInternalNode = true; + } + + // Number of Dimensions would be equal to the (number of operands - 1) as + // the first operand is the pointer to associated Function and the + // remaining operands are the limits in each dimension. + unsigned numOfDim = II->getCalledFunction()->getFunctionType()->getNumParams()-1; + assert(numOfDim <= 3 + && "Invalid number of dimensions for createNode intrinsic!"); + std::vector<Value*> dimLimits; + for (unsigned i = 1; i <= numOfDim; i++) { + // The operands of II are same as the operands of the called + // intrinsic. It has one extra operand at the end, which is the intrinsic + // being called. + dimLimits.push_back(cast<Value> (II->getOperand(i))); + } + + if(isInternalNode) { + // Create Internal DFNode, add it to the map and recursively build its + // dataflow graph + DFInternalNode* childDFNode = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + N->addChildToDFGraph(childDFNode); + HandleToDFNodeMap[II] = childDFNode; + BuildGraph(childDFNode, F); + } + else { + // Create Leaf DFnode and add it to the map. 
+ DFLeafNode* childDFNode = DFLeafNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits); + N->addChildToDFGraph(childDFNode); + HandleToDFNodeMap[II] = childDFNode; + } +} + +void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) { + // The DFNode structures must be in the map before the edge is processed + HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0)); + assert(DFI != HandleToDFNodeMap.end()); + DFI = HandleToDFNodeMap.find(II->getOperand(1)); + assert(DFI != HandleToDFNodeMap.end()); + + DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)]; + DFNode* DestDF = HandleToDFNodeMap[II->getOperand(1)]; + + bool EdgeType = !cast<ConstantInt>(II->getOperand(2))->isZero(); + + unsigned SourcePosition = cast<ConstantInt>(II->getOperand(3))->getZExtValue(); + unsigned DestPosition = cast<ConstantInt>(II->getOperand(4))->getZExtValue(); + + bool isStreaming = !cast<ConstantInt>(II->getOperand(5))->isZero(); + + Type *SrcTy, *DestTy; + + // Get destination type + FunctionType *FT = DestDF->getFuncPointer()->getFunctionType(); + assert((FT->getNumParams() > DestPosition) + && "Invalid argument number for destination dataflow node!"); + DestTy = FT->getParamType(DestPosition); + + // Get source type + StructType* OutTy = SrcDF->getOutputType(); + assert((OutTy->getNumElements() > SourcePosition) + && "Invalid argument number for source dataflow node!"); + SrcTy = OutTy->getElementType(SourcePosition); + + // check if the types are compatible + assert(isTypeCongruent(SrcTy, DestTy) + && "Source and destination type of edge do not match"); + + DFEdge* newDFEdge = DFEdge::Create(SrcDF, + DestDF, + EdgeType, + SourcePosition, + DestPosition, + DestTy, + isStreaming); + + HandleToDFEdgeMap[II] = newDFEdge; + + // Add Edge to the dataflow graph associated with the parent node + N->addEdgeToDFGraph(newDFEdge); +} + +void BuildDFG::handleBindInput(DFInternalNode* N, IntrinsicInst* II) { + // The DFNode structures must 
be in the map before the edge is processed + HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0)); + assert(DFI != HandleToDFNodeMap.end()); + + DFNode* SrcDF = N->getChildGraph()->getEntry(); + DFNode* DestDF = HandleToDFNodeMap[II->getOperand(0)]; + + unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + + bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero(); + + // Get destination type + FunctionType *FT = DestDF->getFuncPointer()->getFunctionType(); + assert((FT->getNumParams() > DestPosition) + && "Invalid argument number for destination dataflow node!"); + Type* DestTy = FT->getParamType(DestPosition); + + // Get source type + FT = SrcDF->getFuncPointer()->getFunctionType(); + assert((FT->getNumParams() > SourcePosition) + && "Invalid argument number for parent dataflow node!"); + Type* SrcTy = FT->getParamType(SourcePosition); + + // check if the types are compatible + assert(isTypeCongruent(SrcTy, DestTy) + && "Source and destination type of edge do not match"); + + // Add Binding as an edge between Entry and child Node + DFEdge* newDFEdge = DFEdge::Create(SrcDF, + DestDF, + false, + SourcePosition, + DestPosition, + DestTy, + isStreaming); + + HandleToDFEdgeMap[II] = newDFEdge; + + // Add Edge to the dataflow graph associated with the parent node + N->addEdgeToDFGraph(newDFEdge); +} + +void BuildDFG::handleBindOutput(DFInternalNode* N, IntrinsicInst* II) { + // The DFNode structures must be in the map before the edge is processed + HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0)); + assert(DFI != HandleToDFNodeMap.end()); + + DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)]; + DFNode* DestDF = N->getChildGraph()->getExit(); + + unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + + 
bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero(); + + // Get destination type + StructType* OutTy = DestDF->getOutputType(); + assert((OutTy->getNumElements() > DestPosition) + && "Invalid argument number for destination parent dataflow node!"); + Type* DestTy = OutTy->getElementType(DestPosition); + + // Get source type + OutTy = SrcDF->getOutputType(); + assert((OutTy->getNumElements() > SourcePosition) + && "Invalid argument number for source dataflow node!"); + Type* SrcTy = OutTy->getElementType(SourcePosition); + + // check if the types are compatible + assert(isTypeCongruent(SrcTy, DestTy) + && "Source and destination type of edge do not match"); + + // Add Binding as an edge between child and exit node + DFEdge* newDFEdge = DFEdge::Create(SrcDF, + DestDF, + false, + SourcePosition, + DestPosition, + DestTy, + isStreaming); + + HandleToDFEdgeMap[II] = newDFEdge; + + // Add Edge to the dataflow graph associated with the parent node + N->addEdgeToDFGraph(newDFEdge); +} + +void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) { + + // TODO: Place checks for valid visc functions. For example one of the + // check can be that any function that contains visc dataflow graph + // construction intrinsics should not have other llvm IR statements. + + // Iterate over all the instructions of a function and look for visc + // intrinsics. 
+ for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction* I = &*i; // Grab pointer to instruction reference + DEBUG(errs() << *I << "\n"); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) { + DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " << II->getCalledFunction()->getName()<<"\n"); + switch(II->getIntrinsicID()) { + + case Intrinsic::visc_createNode: + case Intrinsic::visc_createNode1D: + case Intrinsic::visc_createNode2D: + case Intrinsic::visc_createNode3D: + handleCreateNode (N, II); + break; + + case Intrinsic::visc_createEdge: + handleCreateEdge(N, II); + break; + case Intrinsic::visc_bind_input: + handleBindInput(N, II); + break; + case Intrinsic::visc_bind_output: + handleBindOutput(N, II); + break; + + //TODO: Reconsider launch within a dataflow graph (recursion?) + case Intrinsic::visc_wait: + case Intrinsic::visc_launch: + errs() << "Error: Launch/wait intrinsic used within a dataflow graph\n\t" << *II << "\n"; + break; + + default: + errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" << *II << "\n"; + break; + } + } + else if(!isa<ReturnInst>(I)) { + errs() << "Non-intrinsic instruction: " << *I << "\n"; + llvm_unreachable("Found non-intrinsic instruction inside an internal node. 
Only return instruction is allowed!"); + + } + + } +} + +char BuildDFG::ID = 0; +static RegisterPass<BuildDFG> X("buildDFG", "Hierarchical Dataflow Graph Builder Pass", false, false); + +} // End of namespace builddfg + diff --git a/lib/BuildDFG/BuildDFG.exports b/lib/BuildDFG/BuildDFG.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/BuildDFG/CMakeLists.txt b/lib/BuildDFG/CMakeLists.txt new file mode 100644 index 0000000000..0b1fa4837c --- /dev/null +++ b/lib/BuildDFG/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMBuildDFG + BuildDFG.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/BuildDFG/LLVMBuild.txt b/lib/BuildDFG/LLVMBuild.txt new file mode 100644 index 0000000000..26d8856162 --- /dev/null +++ b/lib/BuildDFG/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/BuildDFG/LLVMBuild.txt ------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = BuildDFG +parent = Transforms diff --git a/lib/ClearDFG/CMakeLists.txt b/lib/ClearDFG/CMakeLists.txt new file mode 100644 index 0000000000..f928c8acda --- /dev/null +++ b/lib/ClearDFG/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMClearDFG + ClearDFG.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/ClearDFG/ClearDFG.cpp b/lib/ClearDFG/ClearDFG.cpp new file mode 100644 index 0000000000..84f9bec04f --- /dev/null +++ b/lib/ClearDFG/ClearDFG.cpp @@ -0,0 +1,172 @@ +//===-------------------------- ClearDFG.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ClearDFG" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Support/Debug.h" +#include "llvm/BuildDFG/BuildDFG.h" + +using namespace llvm; +using namespace builddfg; + +//STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted"); + +namespace { + +// ClearDFG - The first implementation. 
+struct ClearDFG : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ClearDFG() : ModulePass(ID) {} + +private: + // Member variables + + // Functions + +public: + bool runOnModule(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + } + + +}; + +// Visitor for Code generation traversal (tree traversal for now) +class TreeTraversal : public DFNodeVisitor { + +private: + //Member variables + Module &M; + BuildDFG &DFG; + + // Map from Old function associated with DFNode to new cloned function with + // extra index and dimension arguments. This map also serves to find out if + // we already have an index and dim extended function copy or not (i.e., + // "Have we visited this function before?") + ValueMap<Function*, Function*> FMap; + DenseMap<DFNode*, CallInst*> CallMap; + + //Functions + void deleteNode(DFNode* N); + +public: + // Constructor + TreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { } + + virtual void visit(DFInternalNode* N) { + // Follows a bottom-up approach for code generation. + // First generate code for all the child nodes + for(DFGraph::children_iterator i = N->getChildGraph()->begin(), + e = N->getChildGraph()->end(); i != e; ++i) { + DFNode* child = *i; + child->applyDFNodeVisitor(*this); + } + DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n"); + // Generate code for this internal node now. This way all the cloned + // functions for children exist. 
+ deleteNode(N); + DEBUG(errs() << "\tDone - " << "\n"); + //errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n"; + } + + virtual void visit(DFLeafNode* N) { + DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n"); + deleteNode(N); + DEBUG(errs() << "DONE" << "\n"); + } + +}; + +bool ClearDFG::runOnModule(Module &M) { + + errs() << "\nCLEARDFG PASS\n"; + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n"); + for(Value::user_iterator ui = VI->user_begin(), ue = VI->user_end(); ui != ue; ui++) { + Instruction* I = dyn_cast<Instruction>(*ui); + I->eraseFromParent(); + } + VI->replaceAllUsesWith(UndefValue::get(VI->getType())); + VI->eraseFromParent(); + + Function* VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n"); + for(Value::user_iterator ui = VC->user_begin(), ue = VC->user_end(); ui != ue; ui++) { + Instruction* I = dyn_cast<Instruction>(*ui); + I->eraseFromParent(); + } + + VC->replaceAllUsesWith(UndefValue::get(VC->getType())); + VC->eraseFromParent(); + + + Function* VN = M.getFunction("llvm.visc.node.id"); + if (VN != NULL){ // Delete visc.node.id intrinsic calls if they exist + for(Value::user_iterator ui = VN->user_begin(), ue = VN->user_end(); ui != ue; ui++) { + Instruction* I = dyn_cast<Instruction>(*ui); + I->eraseFromParent(); + } + + VN->replaceAllUsesWith(UndefValue::get(VN->getType())); + VN->eraseFromParent(); + } + + + // Visitor for Code Generation 
Graph Traversal + TreeTraversal *Visitor = new TreeTraversal(M, DFG); + + // Initiate code generation for root DFNode + for (auto rootNode: Roots) { + Visitor->visit(rootNode); + } + + delete Visitor; + + return true; +} + +void TreeTraversal::deleteNode(DFNode* N) { + if(N->isDummyNode()) + return; + // Erase Function associated with this node + Function* F = N->getFuncPointer(); + F->replaceAllUsesWith(UndefValue::get(F->getType())); + F->eraseFromParent(); + // If N is not a root node, we are done. Return. + if(!N->isRoot()) + return; + // N is a root node. Delete the Launch Intrinsic associated it with as well. + IntrinsicInst* LI = N->getInstruction(); + LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + LI->eraseFromParent(); +} + +} // End of namespace + +char ClearDFG::ID = 0; +static RegisterPass<ClearDFG> X("clearDFG", + "Delete all DFG functions for which code has been generated", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); + diff --git a/lib/ClearDFG/ClearDFG.exports b/lib/ClearDFG/ClearDFG.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/ClearDFG/LLVMBuild.txt b/lib/ClearDFG/LLVMBuild.txt new file mode 100644 index 0000000000..ebca891469 --- /dev/null +++ b/lib/ClearDFG/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/ClearDFG/LLVMBuild.txt ------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = ClearDFG +parent = Transforms diff --git a/lib/DFG2LLVM_CUDNN/CMakeLists.txt b/lib/DFG2LLVM_CUDNN/CMakeLists.txt new file mode 100644 index 0000000000..dc98faafec --- /dev/null +++ b/lib/DFG2LLVM_CUDNN/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMDFG2LLVM_CUDNN + DFG2LLVM_CUDNN.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp b/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp new file mode 100644 index 0000000000..f18325588c --- /dev/null +++ b/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp @@ -0,0 +1,645 @@ +//=== DFG2LLVM_CUDNN.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#define ENABLE_ASSERTS + +#define DEBUG_TYPE "DFG2LLVM_CUDNN" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h" +#include <sstream> + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +using namespace inplacedfg; + +namespace { +// Helper class declarations + +// DFG2LLVM_CUDNN - The first implementation. 
+ +struct DFG2LLVM_CUDNN : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_CUDNN() : DFG2LLVM(ID) {} +private: + +public: + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addRequired<InPlaceDFGAnalysisWrapper>(); + AU.addPreserved<BuildDFG>(); + AU.addPreserved<InPlaceDFGAnalysisWrapper>(); + } + + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_CUDNN : public CodeGenTraversal { + +private: + //Member variables + InPlaceDFGAnalysis::InPlaceDFGParameter *IPP; + + // VISC Runtime API and Tensor runtime API + Constant* llvm_hpvm_initTensorRt; + Constant* llvm_hpvm_cleanupTensorRt; + Constant* hpvm_request_tensor; + + // Functions + bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N); + + + + // Virtual Functions + void init(); + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + + // Constructor + CGT_CUDNN(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP) + : CodeGenTraversal(_M, _DFG), IPP(&_IPP) { + initRuntimeAPI(); + } + +}; + +bool CGT_CUDNN::isValidOperandForInPlaceOperation(Value *Op, + Function *Fgen, + DFNode *N) { + + if (Argument *Arg = dyn_cast<Argument>(Op)) { + DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n"); + assert((Arg->getParent() == Fgen) && + "Extra Parameter in body of Function\n"); + // Candidae parameter is a function argument + // In this case, consult the result of in place analysis + // Find position in arg list + unsigned pos = Arg->getArgNo(); + // If this parameter cannot be used for in place operation + // code gen cannot continue + if (IPP->at(N)[pos]) { + DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n"); + return true; + } else { + DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n"); + return false; + } + } + else { + // If it is not an 
argument, then it needs to be the result of + // another intrinsic. These are new objects that are allocated, + // and consumed by next intrinsic. + DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n"); + if (dyn_cast<IntrinsicInst>(Op)) { + DEBUG(errs() << *Arg << "\t: local, suitable for in place\n"); + return true; + } else { + DEBUG(errs() << *Arg << "\t: local, not suitable for in place\n"); + return false; + } + } +} + + +void CGT_CUDNN::init() { +} + +// Initialize the VISC runtime API. This makes it easier to insert these calls +void CGT_CUDNN::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n"); + + // FIXME: set correct path + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_runtime.ll"; + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + if(runtimeModule == nullptr) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n"); + + // Get or insert Global declarations for + // - initialization + // - cleanup + // - request a tensor + DECLARE(llvm_hpvm_initTensorRt); + DECLARE(llvm_hpvm_cleanupTensorRt); + DECLARE(hpvm_request_tensor); + + // Find visc.init and visc.cleanup calls, and add placeholder methods + // for initialization and cleanup of the hpvm tensor runtime + /* + LLVMContext &C = M.getContext(); + auto *FuncType = FunctionType::get(Type::getVoidTy(C), ArrayRef<Type *>({Type::getInt32Ty(C)}), false); + llvm_hpvm_initTensorRt = M.getOrInsertFunction(StringRef("llvm_hpvm_initTensorRt"), FuncType); + FuncType = FunctionType::get(Type::getVoidTy(C), ArrayRef<Type *>({}), false); + llvm_hpvm_cleanupTensorRt = M.getOrInsertFunction(StringRef("llvm_hpvm_cleanupTensorRt"), FuncType); + FuncType = FunctionType::get(Type::getVoidTy(C), ArrayRef<Type 
*>({Type::getInt8PtrTy(C), Type::getInt32Ty(C)}), false); + hpvm_request_tensor = M.getOrInsertFunction(StringRef("hpvm_request_tensor"), FuncType); +*/ + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n"); + InitCall = cast<Instruction>(*VI->user_begin()); + CallInst::Create(llvm_hpvm_initTensorRt, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)), + "", InitCall); + + + Function* VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n"); + CleanupCall = cast<Instruction>(*VC->user_begin()); + CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall); + +} + +void CGT_CUDNN::codeGen(DFInternalNode* N) { + errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"; + errs () << "Skipping internal node\n"; +} + + +void CGT_CUDNN::codeGen(DFLeafNode* N) { + + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Abort code generation if it is an allocation node + if(N->isAllocationNode()) { + assert(false && "Allocation Node not expected in ApproxHPVM"); + return; + } + + // Generate code only if it has the right hint + if (!checkPreferredTarget(N, visc::CUDNN_TARGET)) { + errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + return; + } + + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); + errs()<<"function name = "<< F->getName()<<"\n"; + + /* Removing HPVM in/out/inout function attributes */ + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ai++){ + Argument *Arg = &*ai; + if(Arg->hasAttribute(Attribute::In)) + Arg->removeAttr(Attribute::In); + if(Arg->hasAttribute(Attribute::Out)) + Arg->removeAttr(Attribute::Out); + if(Arg->hasAttribute(Attribute::InOut)) + Arg->removeAttr(Attribute::InOut); + } + + // 
Look up if we have visited this function before. If we have, then just + // get the cloned function pointer from DFNode. Otherwise, create the cloned + // function and add it to the DFNode GenFunc. + Function *F_cudnn = N->getGenFuncForTarget(visc::CUDNN_TARGET); + + assert((F_cudnn == NULL) && + "Error: Visiting a node for which code already generated"); + + // Clone the function + ValueToValueMapTy VMap; + std::string FName(F->getName().data()); + F_cudnn = CloneFunction(F, VMap); + F_cudnn->setName(FName + "_cudnn"); + errs()<<"Cloned function name2 = "<<F_cudnn->getName()<<"\n"; + F_cudnn->removeFromParent(); + M.getFunctionList().push_back(F_cudnn); + + N->addGenFunc(F_cudnn, visc::CUDNN_TARGET, true); + + // Adding nounwind to generated function : FIXME: needed? + DEBUG(errs() << "Adding nounwind to generated function\n"); + F_cudnn->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + + // Add llvm_visc_requestTensor calls for every pointer argument of the function + // (they are all expected to be tensors), at the beginning of the function. + // This is the first instruction of the function, insert them before this + Instruction* FI = &*(F_cudnn->getEntryBlock().begin()); + + // In this backend, the target device is GPU, represented by i32 1. 
+ ConstantInt *TargetDeviceID = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 1); + + for (Function::arg_iterator ai = F_cudnn->arg_begin(), + ae = F_cudnn->arg_end(); ai != ae; ++ai) { + Argument* Arg = &*ai; + if (Arg->getType()->isPointerTy()) { + Value *Args[] = {Arg, TargetDeviceID}; + CallInst::Create(hpvm_request_tensor, + ArrayRef<Value*>(Args, 2), + "", FI); + } + } + + std::vector<IntrinsicInst *> IItoRemove; + + for (inst_iterator i = inst_begin(F_cudnn), e = inst_end(F_cudnn); i != e; ++i) { + Instruction *I = &(*i); + + if (BuildDFG::isViscIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + //assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") + // && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + + //if (!(II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")){ + //continue; // skip non-tensor ops + //} + + /********************* Handle VISC Tensor intrinsics ********************/ + switch (II->getIntrinsicID()) { + + case Intrinsic::visc_tensor_convolution: + { /* llvm.hpvm.tensor.mul */ + // Tensor mul is not in place. 
+ DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor convolution \n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + Args.push_back(II->getOperand(2)); + Args.push_back(II->getOperand(3)); + Args.push_back(II->getOperand(4)); + Args.push_back(II->getOperand(5)); + + Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1); + Constant* conv_precision = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + + Args.push_back(conv_mode); + Args.push_back(conv_precision); + + // Create cudnn runtime function call + Constant* tensorConvolution; + DECLARE(tensorConvolution); + + CallInst* CI = CallInst::Create(tensorConvolution, + Args, "", II); + // We can replace the call to hpvm.tensor.mul with the runtime call + II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_tensor_group_convolution: + { /* llvm.hpvm.tensor.mul */ + // Tensor mul is not in place. 
+ DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor convolution \n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + Args.push_back(II->getOperand(2)); + Args.push_back(II->getOperand(3)); + Args.push_back(II->getOperand(4)); + Args.push_back(II->getOperand(5)); + + Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1); + + Args.push_back(conv_mode); + Args.push_back(II->getOperand(7)); + + // Create cudnn runtime function call + Constant* tensorConvolution; + DECLARE(tensorConvolution); + + CallInst* CI = CallInst::Create(tensorConvolution, + Args, "", II); + // We can replace the call to hpvm.tensor.mul with the runtime call + II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_tensor_batchnorm: + { /* llvm.hpvm.tensor.batchnorm */ + // Tensor batchnorm is in place. + // FIXME: Add Check for InPlace Analysis + DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor batch normalization \n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + Args.push_back(II->getOperand(2)); + Args.push_back(II->getOperand(3)); + Args.push_back(II->getOperand(4)); + Args.push_back(II->getOperand(5)); + + // Create cudnn runtime function call + Constant* tensorBatchNorm; + DECLARE(tensorBatchNorm); + + CallInst* CI = CallInst::Create(tensorBatchNorm, + Args, "", II); + // We can replace the call to hpvm.tensor.batchnorm with the TensorRT call + II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + + case Intrinsic::visc_tensor_mul: + { /* llvm.hpvm.tensor.mul */ + // Tensor mul is not in place. 
+ DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor mul\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + + // Create cudnn runtime function call + Constant* tensorGemmGPU; + DECLARE(tensorGemmGPU); + + CallInst* CI = CallInst::Create(tensorGemmGPU, + Args, "", II); + // We can replace the call to hpvm.tensor.mul with the runtime call + II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_tensor_add: + { /* llvm.hpvm.tensor.add */ + DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor add\n"); + // Tensor add(a,b) is in place for argument a. + Value *Op = II->getOperand(0); + + // Test the intrinsic operand for in place operation. + bool inplace = isValidOperandForInPlaceOperation(Op, F_cudnn, N); + // Code generation cannot continue if this is false, because the target + // only provides an in place operation + + // FIXME: remove this comment - must check for in-place + //assert(inplace && + // "Operand not valid for in place operation. 
Code gen aborted.\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + + // Create cudnn runtime function call + Constant* tensorAdd; + DECLARE(tensorAdd); + CallInst::Create(tensorAdd, Args, "", II); + // We can replace the call to hpvm.tensor.add with the 1st argument + // that, due to in place operation, now contains the result + II->replaceAllUsesWith(II->getOperand(0)); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_tensor_pool_max: + case Intrinsic::visc_tensor_pool_mean: + { /* llvm.visc.tensor.relu */ + DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor_pool_max\n"); + + // Argument list - tensorPooling(input, poolFunction, window_height, + // window_width, vertical_pad, horizontal_pad, + // vertical_stride, horizontal_stride); + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + + int pool_type = 0; + if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_max){ + pool_type = 0; + } + if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean){ + pool_type = 1; + } + + Constant* constPoolType = ConstantInt::get(Type::getInt32Ty(M.getContext()), pool_type); + Args.push_back(constPoolType); // ID for max pool. 
Min/Avg have different IDs (non-zero) + Args.push_back(II->getOperand(1)); + Args.push_back(II->getOperand(2)); + Args.push_back(II->getOperand(3)); + Args.push_back(II->getOperand(4)); + Args.push_back(II->getOperand(5)); + Args.push_back(II->getOperand(6)); + + // Create cudnn runtime function call + Constant* tensorPooling; + DECLARE(tensorPooling); + CallInst* CI = CallInst::Create(tensorPooling, Args, "", II); + + // Replacing intrinsic result uses with the result of the tensor runtime operation + II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_tensor_relu: + case Intrinsic::visc_tensor_clipped_relu: + case Intrinsic::visc_tensor_tanh: + { /* llvm.visc.tensor.relu */ + DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor activation functions \n"); + // Tensor relu(a) is in place for argument a. + Value *Op = II->getOperand(0); + + // Test the intrinsic operand for in place operation. + bool inplace = isValidOperandForInPlaceOperation(Op, F_cudnn, N); + // Code generation cannot continue if this is false, because the target + // only provides an in place operation + assert(inplace && + "Operand not valid for in place operation. 
Code gen aborted.\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + + if (II->getIntrinsicID() == Intrinsic::visc_tensor_relu){ + // Create cudnn runtime function call + Constant* tensorRelu; + DECLARE(tensorRelu); + CallInst::Create(tensorRelu, Args, "", II); + } + else if (II->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu){ + // Create cudnn runtime function call + //-- Constant* tensorClippedRelu; + Constant* tensorRelu2; + DECLARE(tensorRelu2); + CallInst::Create(tensorRelu2, Args, "", II); + } + else if (II->getIntrinsicID() == Intrinsic::visc_tensor_tanh){ + // Create cudnn runtime function call + Constant* tensorTanh; + errs()<<"tensorTanh Call = \n\n"; + DECLARE(tensorTanh); + //errs()<<"tensorTanh Call = "<<*tensorTanh<<"\l"; + CallInst::Create(tensorTanh, Args, "", II); + } + + // We can replace the call to hpvm.tensor.relu with the 1st argument + // that, due to in place operation, now contains the result + II->replaceAllUsesWith(II->getOperand(0)); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_tensor_softmax: + { /* llvm.visc.tensor.softmax */ + DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor softmax\n"); + // Tensor relu(a) is in place for argument a. + Value *Op = II->getOperand(0); + + // Test the intrinsic operand for in place operation. + bool inplace = isValidOperandForInPlaceOperation(Op, F_cudnn, N); + // Code generation cannot continue if this is false, because the target + // only provides an in place operation + assert(inplace && + "Operand not valid for in place operation. 
Code gen aborted.\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + + // Create cudnn runtime function call + Constant* tensorSoftmax; + DECLARE(tensorSoftmax); + CallInst::Create(tensorSoftmax, Args, "", II); + // We can replace the call to hpvm.tensor.softmax with the 1st argument + // that, due to in place operation, now contains the result + II->replaceAllUsesWith(II->getOperand(0)); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_node_id: + { /* llvm.visc.node.id */ + DEBUG(errs() << F_cudnn->getName() << "\t: Handling Node ID Intrinsic \n"); + // Get uint32 argument + Value *Op = II->getOperand(0); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + + // Create hpvm-tensor-rt function call + Constant* tensor_set_node_id; + DECLARE(tensor_set_node_id); + CallInst::Create(tensor_set_node_id, Args, "", II); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + + default: + llvm_unreachable("Unknown VISC Intrinsic!"); + break; + } + } + } + + //--- errs()<<"IIToRemove.size() = "<<IItoRemove.size()<<"\n\n"; + + // We need to do this explicitly: DCE pass may not remove them. + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around. 
+ for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + errs() << "Erasing: " << **ri << "\n"; + (*ri)->eraseFromParent(); + } + + return; +} + +bool DFG2LLVM_CUDNN::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_CUDNN PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // Get the In Place Analysis Results + InPlaceDFGAnalysis::InPlaceDFGParameter IPP = + (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP(); + // Print results + printInPlaceDFGParameter(IPP); + + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Visitor for Code Generation Graph Traversal + CGT_CUDNN *CGTVisitor = new CGT_CUDNN(M, DFG, IPP); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } + + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; + + return true; +} + + +/****************************************************************************** + * Helper functions * + ******************************************************************************/ + + +} // End of namespace + +char DFG2LLVM_CUDNN::ID = 0; +static RegisterPass<DFG2LLVM_CUDNN> X("dfg2llvm-cudnn", + "Dataflow Graph to LLVM for CUDNN Pass", + false /* does not modify the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports b/lib/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/DFG2LLVM_CUDNN/LLVMBuild.txt b/lib/DFG2LLVM_CUDNN/LLVMBuild.txt new file mode 100644 index 0000000000..1579b2fc47 --- /dev/null +++ b/lib/DFG2LLVM_CUDNN/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The 
LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_CUDNN +parent = Transforms diff --git a/lib/DFG2LLVM_NVPTX/CMakeLists.txt b/lib/DFG2LLVM_NVPTX/CMakeLists.txt new file mode 100644 index 0000000000..430bea7693 --- /dev/null +++ b/lib/DFG2LLVM_NVPTX/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMDFG2LLVM_NVPTX + DFG2LLVM_NVPTX.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp new file mode 100644 index 0000000000..c0cbd4df14 --- /dev/null +++ b/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -0,0 +1,2075 @@ +//=== DFG2LLVM_NVPTX.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define ENABLE_ASSERTS +#define TARGET_PTX 64 +#define GENERIC_ADDRSPACE 0 +#define GLOBAL_ADDRSPACE 1 +#define CONSTANT_ADDRSPACE 4 +#define SHARED_ADDRSPACE 3 + +#define DEBUG_TYPE "DFG2LLVM_NVPTX" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/SupportVISC/VISCUtils.h" + +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/IR/UseListOrder.h" + + +#include <sstream> + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; +using namespace viscUtils; + +// VISC Command line option to use timer or not +static cl::opt<bool> +VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers")); + +namespace { +// Helper class declarations + +// Class to maintain the tuple of host pointer, device pointer and size +// in bytes. 
Would have preferred to use tuple but support not yet available +class OutputPtr { +public: + OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + + Value* h_ptr; + Value* d_ptr; + Value* bytes; +}; + +// Class to maintain important kernel info required for generating runtime +// calls +class Kernel { +public: + Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap = + std::map<unsigned, unsigned>(), + std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap = + std::map<unsigned, std::pair<Value*, unsigned> >(), + std::vector<unsigned> _outArgMap = std::vector<unsigned>(), + unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), + unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) + : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), + sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim), + globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) { + + assert(gridDim == globalWGSize.size() + && "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() + && "blockDim should be same as the size of vector localWGSize"); + } + + Function* KernelFunction; + DFLeafNode* KernelLeafNode; + std::map<unsigned, unsigned> inArgMap; + // Map for shared memory arguments + std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap; + // Fields for (potential) allocation node + DFLeafNode* AllocationNode; + Function* AllocationFunction; + std::map<unsigned, unsigned> allocInArgMap; + + std::vector<unsigned> outArgMap; + unsigned gridDim; + std::vector<Value*> globalWGSize; + unsigned blockDim; + std::vector<Value*> localWGSize; + std::vector<int> localDimMap; + + std::map<unsigned, unsigned> getInArgMap() { + return inArgMap; + } + void setInArgMap(std::map<unsigned, unsigned> map) { + inArgMap = map; + } + + 
std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap() { + return sharedInArgMap; + } + void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) { + sharedInArgMap = map; + } + + std::vector<unsigned> getOutArgMap() { + return outArgMap; + } + void setOutArgMap(std::vector<unsigned> map) { + outArgMap = map; + } + + void setLocalWGSize(std::vector<Value*> V) { + localWGSize = V; + } + + bool hasLocalWG() { + return blockDim != 0; + } +}; + +// Helper function declarations +static bool canBePromoted(Argument* arg, Function* F); +static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*, + ValueToValueMapTy&, Instruction*); +static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&, + Instruction*, const Twine& WGName = "WGSize"); +static std::string getPTXFilename(const Module&); +static std::string getFilenameFromModule(const Module& M); +static void changeDataLayout(Module &); +static void changeTargetTriple(Module &); +static void findReturnInst(Function *, std::vector<ReturnInst *> &); +static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &); +static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID); +static std::string getAtomicOpName(Intrinsic::ID); + +// DFG2LLVM_NVPTX - The first implementation. 
+struct DFG2LLVM_NVPTX : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_NVPTX() : DFG2LLVM(ID) {} + +private: + +public: + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_NVPTX : public CodeGenTraversal { + +private: + //Member variables + std::unique_ptr<Module> KernelM; + DFNode* KernelLaunchNode = NULL; + Kernel* kernel; + + // VISC Runtime API + Constant* llvm_visc_ocl_launch; + Constant* llvm_visc_ocl_wait; + Constant* llvm_visc_ocl_initContext; + Constant* llvm_visc_ocl_clearContext; + Constant* llvm_visc_ocl_argument_shared; + Constant* llvm_visc_ocl_argument_scalar; + Constant* llvm_visc_ocl_argument_ptr; + Constant* llvm_visc_ocl_output_ptr; + Constant* llvm_visc_ocl_free; + Constant* llvm_visc_ocl_getOutput; + Constant* llvm_visc_ocl_executeNode; + + //Functions + std::string getKernelsModuleName(Module &M); + void fixValueAddrspace(Value* V, unsigned addrspace); + std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*); + Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i); + void addCLMetadata(Function* F); + Function* transformFunctionToVoid(Function* F); + void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); + + // Virtual Functions + void init() { + VISCTimer = VISCTimer_NVPTX; + TargetName = "NVPTX"; + } + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + + // Constructor + CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(&_M)) { + init(); + initRuntimeAPI(); + errs() << "Old module pointer: " << &_M << "\n"; + errs() << "New module pointer: " << KernelM.get() << "\n"; + + // Copying instead of creating new, in order to preserve required info (metadata) + // Remove functions, global variables and aliases + std::vector<GlobalVariable*> gvv = 
std::vector<GlobalVariable*>(); + for (Module::global_iterator mi = KernelM->global_begin(), + me = KernelM->global_end(); (mi != me); ++mi) { + GlobalVariable* gv = &*mi; + gvv.push_back(gv); + } + for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } + + std::vector<Function*> fv = std::vector<Function*>(); + for (Module::iterator mi = KernelM->begin(), + me = KernelM->end(); (mi != me); ++mi) { + Function* f = &*mi; + fv.push_back(f); + } + for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } + + std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>(); + for (Module::alias_iterator mi = KernelM->alias_begin(), + me = KernelM->alias_end(); (mi != me); ++mi) { + GlobalAlias* a = &*mi; + av.push_back(a); + } + for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } + + changeDataLayout(*KernelM); + changeTargetTriple(*KernelM); + + + DEBUG(errs() << *KernelM); + + } + + void writeKernelsModule(); +}; + +// Initialize the VISC runtime API. 
This makes it easier to insert these calls +void CGT_NVPTX::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); + + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/../build/projects/visc-rt/visc-rt.ll"; + + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + if(runtimeModule == nullptr) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded visc-rt API module\n"); + + // Get or insert the global declarations for launch/wait functions + DECLARE(llvm_visc_ocl_launch); + DECLARE(llvm_visc_ocl_wait); + DECLARE(llvm_visc_ocl_initContext); + DECLARE(llvm_visc_ocl_clearContext); + DECLARE(llvm_visc_ocl_argument_shared); + DECLARE(llvm_visc_ocl_argument_scalar); + DECLARE(llvm_visc_ocl_argument_ptr); + DECLARE(llvm_visc_ocl_output_ptr); + DECLARE(llvm_visc_ocl_free); + DECLARE(llvm_visc_ocl_getOutput); + DECLARE(llvm_visc_ocl_executeNode); + + // Get or insert timerAPI functions as well if you plan to use timers + initTimerAPI(); + + // Insert init context in main + DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n"); + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); + + InitCall = cast<Instruction>(*VI->user_begin()); + initializeTimerSet(InitCall); + switchToTimer(visc_TimerID_INIT_CTX, InitCall); + CallInst::Create(llvm_visc_ocl_initContext, + ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)), + "", InitCall); + switchToTimer(visc_TimerID_NONE, InitCall); + + // Insert print instruction at visc exit + DEBUG(errs() << "Gen Code to print NVPTX Timer\n"); + Function* VC = M.getFunction("llvm.visc.cleanup"); + DEBUG(errs() << *VC << "\n"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once"); + + CleanupCall = cast<Instruction>(*VC->user_begin()); + 
printTimerSet(CleanupCall); + + +} + +// Generate Code to call the kernel +// The plan is to replace the internal node with a leaf node. This method is +// used to generate a function to associate with this leaf node. The function +// is responsible for all the memory allocation/transfer and invoking the +// kernel call on the device +void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { + // Check if clone already exists. If it does, it means we have visited this + // function before. +// assert(N->getGenFunc() == NULL && "Code already generated for this node"); + + assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL && + "Code already generated for this node"); + + // Useful values + Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + + // If kernel struct has not been initialized with kernel function, then fail + assert(K != NULL && "No kernel found!!"); + + DEBUG(errs() << "Generating kernel call code\n"); + + Function* F = N->getFuncPointer(); + + + // Create of clone of F with no instructions. Only the type is the same as F + // without the extra arguments. + Function* F_X86; + + // Clone the function, if we are seeing this function for the first time. We + // only need a clone in terms of type. + ValueToValueMapTy VMap; + + // Create new function with the same type + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + + // Loop over the arguments, copying the names of arguments over. + Function::arg_iterator dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); // Copy the name over... 
+ // Increment dest iterator + ++dest_iterator; + } + + // Add a basic block to this empty function + BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86); + ReturnInst* RI = ReturnInst::Create(M.getContext(), + UndefValue::get(F_X86->getReturnType()), BB); + + // FIXME: Adding Index and Dim arguments are probably not required except + // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do + // have those arguments) + + // Add Index and Dim arguments except for the root node + if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + F_X86 = addIdxDimArgs(F_X86); + + BB = &*F_X86->begin(); + RI = cast<ReturnInst>(BB->getTerminator()); + + //Add the generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::GPU_TARGET, true); + errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " + << N->getFuncPointer()->getName() << "\n"; + + + // Loop over the arguments, to create the VMap + dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + // Add mapping to VMap and increment dest iterator + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + + /* TODO: Use this code to verufy if this is a good pattern for PTX kernel + + // Sort children in topological order before code generation for kernel call + N->getChildGraph()->sortChildren(); + + // The DFNode N has the property that it has only one child (leaving Entry + // and Exit dummy nodes). This child is the PTX kernel. This simplifies code + // generation for kernel calls significantly. All the inputs to this child + // node would either be constants or from the parent node N. 
+ + assert(N->getChildGraph()->size() == 3 + && "Node expected to have just one non-dummy node!"); + + DFNode* C; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + C = *ci; + // Skip dummy node call + if (!C->isDummyNode()) + break; + } + + assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); + + Function* CF = C->getFuncPointer(); + */ + Function* KF = K->KernelLeafNode->getFuncPointer(); + // Initialize context + //DEBUG(errs() << "Initializing context" << "\n"); + //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); + + DEBUG(errs() << "Initializing commandQ" << "\n"); + // Initialize command queue + switchToTimer(visc_TimerID_SETUP, InitCall); + Value* fileStr = getStringPointer(FileName, InitCall, "Filename"); + DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); + DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n"); + Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName"); + + Value* LaunchInstArgs[] = {fileStr, kernelStr}; + + DEBUG(errs() << "Inserting launch call" << "\n"); + CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, + ArrayRef<Value*>(LaunchInstArgs, 2), + "graph"+KF->getName(), + InitCall); + DEBUG(errs() << *NVPTX_Ctx << "\n"); + GraphIDAddr = new GlobalVariable(M, + NVPTX_Ctx->getType(), + false, + GlobalValue::CommonLinkage, + Constant::getNullValue(NVPTX_Ctx->getType()), + "graph"+KF->getName()+".addr"); + DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n"); + StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); + DEBUG(errs() << *SI << "\n"); + switchToTimer(visc_TimerID_NONE, InitCall); + switchToTimer(visc_TimerID_SETUP, RI); + Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI); + + // Iterate over the required input edges of the node and use the visc-rt API + // to set inputs + DEBUG(errs() << "Iterate 
over input edges of node and insert visc api\n"); + std::vector<OutputPtr> OutputPointers; + // Vector to hold the device memory object that need to be cleared before we release + // context + std::vector<Value*> DevicePointers; + + std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap(); + /* + for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) { + + // The kernel object gives us the mapping of arguments from kernel launch + // node function (F_X86) to kernel (kernel->KF) + Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]); + + */ + + for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(), + ie = kernelInArgMap.end(); ib != ie; ++ib) { + unsigned i = ib->first; + Value* inputVal = getArgumentAt(F_X86, ib->second); + DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); + + // input value has been obtained. + // Check if input is a scalar value or a pointer operand + // For scalar values such as int, float, etc. the size is simply the size of + // type on target machine, but for pointers, the size of data would be the + // next integer argument + if(inputVal->getType()->isPointerTy()) { + + switchToTimer(visc_TimerID_COPY_PTR, RI); + // Pointer Input + // CheckAttribute + Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False; + Value* isInput = ((hasAttribute(KF, i, Attribute::Out)) + && !(hasAttribute(KF, i, Attribute::In)))? 
False : True; + + Argument* A = getArgumentAt(KF, i); + if(isOutput == True) { + DEBUG(errs() << *A << " is an OUTPUT argument\n"); + } + if(isInput == True) { + DEBUG(errs() << *A << " is an INPUT argument\n"); + } + + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + + // Assert that the pointer argument size (next argument) is in the map + assert(kernelInArgMap.find(i+1) != kernelInArgMap.end()); + + Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]); + assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) + && "Pointer type input must always be followed by size (integer type)"); + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + inputSize, + isInput, + isOutput + }; + Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, + ArrayRef<Value*>(setInputArgs, 6), "", RI); + DevicePointers.push_back(d_ptr); + // If this has out attribute, store the returned device pointer in + // memory to read device memory later + if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + } + else { + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + // Scalar Input + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI); + StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + ConstantExpr::getSizeOf(inputVal->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + } + } + + DEBUG(errs() << "Setup shared memory arguments of node and insert 
visc api\n"); + + // Check to see if all the allocation sizes are constant (determined + // statically) + bool constSizes = true; + for (auto& e: K->getSharedInArgMap()) { + constSizes &= isa<Constant>(e.second.first); + } + + // If the sizes are all constant + if (constSizes) { + for (auto& e: K->getSharedInArgMap()) { + unsigned argNum = e.first; + Value* allocSize = e.second.first; + + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + + if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { + // Shared memory ptr argument - scalar at size position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + + assert(isa<Constant>(allocSize) && "Constant shared memory size is expected"); + + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; + CallInst::Create(llvm_visc_ocl_argument_shared, + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { + // Sharem memory size argument - scalar at address position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + } + } + } else { + + Function *F_alloc = K->AllocationFunction; + StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType()); + assert(FAllocRetTy && "Allocation node with no struct return type"); + + std::vector<Value *> 
AllocInputArgs; + for (unsigned i = 0; i < K->allocInArgMap.size(); i++) { + AllocInputArgs.push_back(getArgumentAt(F_X86, K->allocInArgMap.at(i))); + } + + CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI); + std::vector<ExtractValueInst *> ExtractValueInstVec; + for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) { + ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI); + ExtractValueInstVec.push_back(EI); + } + + for (auto& e: K->getSharedInArgMap()) { + unsigned argNum = e.first; + Value* allocSize = ExtractValueInstVec[e.second.second/2]; + + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + + if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { + // Shared memory ptr argument - scalar at size position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; + CallInst::Create(llvm_visc_ocl_argument_shared, + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { + // Sharem memory size argument - scalar at address position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + } + } + } + + + DEBUG(errs() << "Setup output edges of node and insert visc api\n"); + // Set output if struct is not an empty 
struct + StructType* OutputTy = K->KernelLeafNode->getOutputType(); + std::vector<Value*> d_Outputs; + if(!OutputTy->isEmptyTy()) { + switchToTimer(visc_TimerID_COPY_PTR, RI); + // Not an empty struct + // Iterate over all elements of the struct and put them in + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + unsigned outputIndex = KF->getFunctionType()->getNumParams()+i; + Value* setOutputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), + ConstantExpr::getSizeOf(OutputTy->getElementType(i)) + }; + + CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, + ArrayRef<Value*>(setOutputArgs, 3), + "d_output."+KF->getName(), + RI); + d_Outputs.push_back(d_Output); + } + } + + // Enqueue kernel + // Need work dim, localworksize, globalworksize + // Allocate size_t[numDims] space on stack. Store the work group sizes and + // pass it as an argument to ExecNode + + switchToTimer(visc_TimerID_MISC, RI); + Value *workDim, *LocalWGPtr, *GlobalWGPtr; + getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI); + switchToTimer(visc_TimerID_KERNEL, RI); + Value* ExecNodeArgs[] = {GraphID, + workDim, + LocalWGPtr, + GlobalWGPtr + }; + CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, + ArrayRef<Value*>(ExecNodeArgs, 4), + "event."+KF->getName(), + RI); + DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); + + // Wait for Kernel to Finish + CallInst::Create(llvm_visc_ocl_wait, + ArrayRef<Value*>(GraphID), + "", + RI); + + switchToTimer(visc_TimerID_READ_OUTPUT, RI); + // Read Output Struct if not empty + if(!OutputTy->isEmptyTy()) { + std::vector<Value*>h_Outputs; + Value* KernelOutput = UndefValue::get(OutputTy); + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + Value* GetOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + d_Outputs[i], + ConstantExpr::getSizeOf(OutputTy->getElementType(i)) + }; + CallInst* h_Output = 
CallInst::Create(llvm_visc_ocl_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "h_output."+KF->getName()+".addr", + RI); + // Read each device pointer listed in output struct + // Load the output struct + CastInst* BI = BitCastInst::CreatePointerCast(h_Output, + OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI); + + Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI); + KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i), + KF->getName()+"output", RI); + } + OutputMap[K->KernelLeafNode] = KernelOutput; + } + + // Read all the pointer arguments which had side effects i.e., had out + // attribute + DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n"); + // FIXME: Not reading output pointers anymore as we read them when data is + // actually requested + /*for(auto output: OutputPointers) { + DEBUG(errs() << "Read: " << *output.d_ptr << "\n"); + DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n"); + DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); + + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; + CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "", RI); + }*/ + switchToTimer(visc_TimerID_MEM_FREE, RI); + // Clear Context and free device memory + DEBUG(errs() << "Clearing context" << "\n"); + // Free Device Memory + for(auto d_ptr: DevicePointers) { + CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); + } + switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); + // Clear Context + LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); + CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); + switchToTimer(visc_TimerID_NONE, CleanupCall); + + switchToTimer(visc_TimerID_MISC, RI); + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); + // Generate code for output bindings + // Get Exit node + DFNode* C = 
N->getChildGraph()->getExit(); + // Get OutputType of this node + StructType* OutTy = N->getOutputType(); + Value *retVal = UndefValue::get(F_X86->getReturnType()); + // Find the kernel's output arg map, to use instead of the bindings + std::vector<unsigned> outArgMap = kernel->getOutArgMap(); + // Find all the input edges to exit node + for (unsigned i=0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); + // Find the incoming edge at the requested input port + DFEdge* E = C->getInDFEdgeAt(i); + + assert(E && "No Binding for output element!"); + // Find the Source DFNode associated with the incoming edge + DFNode* SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(F_X86, i); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a internal node + // Check - code should already be generated for this source dfnode + // FIXME: Since the 2-level kernel code gen has aspecific structure, we + // can assume the SrcDF is same as Kernel Leaf node. + // Use outArgMap to get correct mapping + SrcDF = K->KernelLeafNode; + assert(OutputMap.count(SrcDF) + && "Source node call not found. 
Dependency violation!"); + + // Find Output Value associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + // i is the destination of DFEdge E + // Use the mapping instead of the bindings +// IndexList.push_back(E->getSourcePosition()); + IndexList.push_back(outArgMap[i]); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); + inputVal = EI; + } + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + } + + DEBUG(errs() << "Extracted all\n"); + switchToTimer(visc_TimerID_NONE, RI); + retVal->setName("output"); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); +} + + +// Right now, only targeting the one level case. In general, device functions +// can return values so we don't need to change them +void CGT_NVPTX::codeGen(DFInternalNode* N) { + errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n"; + if(KernelLaunchNode == NULL) + errs () << "No kernel launch node\n"; + else { + errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; + } + + if (!KernelLaunchNode) { + DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); + return; + } + + if (N == KernelLaunchNode) { + DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); + //TODO + + // Now the remaining nodes to be visited should be ignored + KernelLaunchNode = NULL; + DEBUG(errs() << "Insert Runtime calls\n"); + insertRuntimeCalls(N, kernel, getPTXFilename(M)); + + } else { + DEBUG(errs() << "Found intermediate node. Getting size parameters.\n"); + // Keep track of the arguments order. 
+ std::map<unsigned, unsigned> inmap1 = N->getInArgMap(); + std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap(); + // TODO: Structure assumed: one thread node, one allocation node (at most), + // TB node + std::map<unsigned, unsigned> inmapFinal; + for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end(); + ib != ie; ++ib) { + inmapFinal[ib->first] = inmap1[ib->second]; + } + + kernel->setInArgMap(inmapFinal); + + // Keep track of the output arguments order. + std::vector<unsigned> outmap1 = N->getOutArgMap(); + std::vector<unsigned> outmap2 = kernel->getOutArgMap(); + + // TODO: Change when we have incoming edges to the dummy exit node from more + // than one nodes. In this case, the number of bindings is the same, but + // their destination position, thus the index in outmap1, is not + // 0 ... outmap2.size()-1 + // The limit is the size of outmap2, because this is the number of kernel + // output arguments for which the mapping matters + // For now, it reasonable to assume that all the kernel arguments are returned, + // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size() + for (unsigned i = 0; i < outmap2.size(); i++) { + outmap1[i] = outmap2[outmap1[i]]; + } + kernel->setOutArgMap(outmap1); + + // Track the source of local dimlimits for the kernel + // Dimension limit can either be a constant or an argument of parent + // function. Since Internal node would no longer exist, we need to insert the + // localWGSize with values from the parent of N. + std::vector<Value*> localWGSizeMapped; + for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if (isa<Constant>(kernel->localWGSize[i])) { + // if constant, use as it is + localWGSizeMapped.push_back(kernel->localWGSize[i]); + } + else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { + // if argument, find the argument location in N. Use InArgMap of N to + // find the source location in Parent of N. 
Retrieve the argument from + // parent to insert in the vector. + unsigned argNum = Arg->getArgNo(); + // This argument will be coming from the parent node, not the allocation + // Node + assert(N->getInArgMap().find(argNum) != N->getInArgMap().end()); + + unsigned parentArgNum = N->getInArgMap()[argNum]; + Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); + localWGSizeMapped.push_back(A); + } + else { + assert(false && "LocalWGsize using value which is neither argument nor constant!"); + } + } + // Update localWGSize vector of kernel + kernel->setLocalWGSize(localWGSizeMapped); + } + +} + +void CGT_NVPTX::codeGen(DFLeafNode* N) { + errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n"; + + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Skip code generation if it is an allocation node + if(N->isAllocationNode()) { + DEBUG(errs() << "Skipping allocation node\n"); + return; + } + + // Generate code only if it has the right hint +// if(!checkPreferredTarget(N, visc::GPU_TARGET)) { +// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; +// return; +// } + if(!preferredTargetIncludes(N, visc::GPU_TARGET)) { + errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + return; + } + + // Checking which node is the kernel launch + DFNode* PNode = N->getParent(); + int pLevel = PNode->getLevel(); + int pReplFactor = PNode->getNumOfDim(); + + // Choose parent node as kernel launch if: + // (1) Parent is the top level node i.e., Root of DFG + // OR + // (2) Parent does not have multiple instances + errs() << "pLevel = " << pLevel << "\n"; + errs() << "pReplFactor = " << pReplFactor << "\n"; + if (!pLevel || !pReplFactor) { + errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n"; + KernelLaunchNode = PNode; + kernel = new Kernel(NULL, + N, + N->getInArgMap(), + N->getSharedInArgMap(), + 
N->getOutArgMap(), + N->getNumOfDim(), + N->getDimLimits()); + } + else { + // Converting a 2-level DFG to opencl kernel + errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n"; + KernelLaunchNode = PNode->getParent(); + assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); + // Contains the instructions generating the kernel configuration parameters + kernel = new Kernel(NULL, // kernel function + N, // kernel leaf node + N->getInArgMap(), // kenel argument mapping + N->getSharedInArgMap(), + N->getOutArgMap(), // kernel output mapping from the leaf to the interemediate node + PNode->getNumOfDim(), // gridDim + PNode->getDimLimits(),// grid size + N->getNumOfDim(), // blockDim + N->getDimLimits()); // block size + + } + + std::vector<IntrinsicInst *> IItoRemove; + BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; + + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); + + // Look up if we have visited this function before. If we have, then just + // get the cloned function pointer from DFNode. Otherwise, create the cloned + // function and add it to the DFNode GenFunc. +// Function *F_nvptx = N->getGenFunc(); + Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET); + + assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated"); + // Clone the function + ValueToValueMapTy VMap; + + Twine FName = F->getName(); + F_nvptx = CloneFunction(F, VMap); + F_nvptx->setName(FName+"_nvptx"); +// errs() << "Old Function Name: " << F->getName() << "\n"; +// errs() << "New Function Name: " << F_nvptx->getName() << "\n"; + + F_nvptx->removeFromParent(); + + + // Insert the cloned function into the kernels module + KernelM->getFunctionList().push_back(F_nvptx); + + + //TODO: Iterate over all the instructions of F_nvptx and identify the + //callees and clone them into this module. 
+ DEBUG(errs() << *F_nvptx->getType()); + DEBUG(errs() << *F_nvptx); + + // Transform the function to void and remove all target dependent attributes + // from the function + F_nvptx = transformFunctionToVoid(F_nvptx); + + //Add generated function info to DFNode +// N->setGenFunc(F_nvptx, visc::GPU_TARGET); + N->addGenFunc(F_nvptx, visc::GPU_TARGET, false); + + DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n"); + F_nvptx->removeAttributes(AttributeSet::FunctionIndex, F_nvptx->getAttributes().getFnAttributes()); + F_nvptx->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + + //FIXME: For now, assume only one allocation node + kernel->AllocationNode = NULL; + + for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end(); + ieb != iee; ++ieb) { + DFNode *SrcDFNode = (*ieb)->getSourceDF(); + DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n"); + if (!SrcDFNode->isDummyNode()) { + assert(SrcDFNode->isAllocationNode()); + kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode); + kernel->allocInArgMap = SrcDFNode->getInArgMap(); + break; + } + } + + // Vector for shared memory arguments + std::vector<unsigned> SharedMemArgs; + + // If no allocation node was found, SharedMemArgs is empty + if (kernel->AllocationNode) { + + ValueToValueMapTy VMap; + Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); + //F_alloc->removeFromParent(); + // Insert the cloned function into the kernels module + //M.getFunctionList().push_back(F_alloc); + + std::vector<IntrinsicInst *> ViscMallocInstVec; + findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec); + + for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) { + IntrinsicInst *II = ViscMallocInstVec[i]; + 
assert(II->hasOneUse() && "visc_malloc result is used more than once"); + II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); + II->eraseFromParent(); + } + kernel->AllocationFunction = F_alloc; + + // This could be used to check that the allocation node has the appropriate + // number of fields in its return struct + /* + ReturnInst *RI = ReturnInstVec[0]; + Value *RetVal = RI->getReturnValue(); + Type *RetTy = RetVal->getType(); + StructType *RetStructTy = dyn_cast<StructType>(RetTy); + assert(RetStructTy && "Allocation node does not return a struct type"); + unsigned numFields = RetStructTy->getNumElements(); + */ + std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap(); + AllocationNodeProperty* APN = + (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation); + for (auto& AllocPair: APN->getAllocationList()) { + unsigned destPos = AllocPair.first->getDestPosition(); + unsigned srcPos = AllocPair.first->getSourcePosition(); + SharedMemArgs.push_back(destPos); + sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); + sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); + } + kernel->setSharedInArgMap(sharedInMap); + } + std::sort(SharedMemArgs.begin(), SharedMemArgs.end()); + + // All pointer args which are not shared memory pointers have to be moved to + // global address space + unsigned argIndex = 0; + std::vector<unsigned> GlobalMemArgs; + for(auto& Arg: F_nvptx->getArgumentList()) { + if (Arg.getType()->isPointerTy()) { + // If the arguement is already chosen for shared memory arguemnt list, skip. 
+ // Else put it in Global memory arguement list + if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) { + GlobalMemArgs.push_back(argIndex); + } + } + argIndex++; + } + std::sort(GlobalMemArgs.begin(), GlobalMemArgs.end()); + + /* At this point, we assume that chescks for the fact that SharedMemArgs only + contains pointer arguments to GLOBAL_ADDRSPACE have been performed by the + analysis pass */ + // Optimization: Gloabl memory arguments, which are not modified and whose + // loads are not dependent on node id of current node, should be moved to + // constant memory, subject to size of course + std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); + + F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, CONSTANT_ADDRSPACE); + F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE); + F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE); + + + // Go through all the instructions + std::vector<CallInst *> CItoRemove; + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + Instruction *I = &(*i); + // Leaf nodes should not contain VISC graph intrinsics or launch + assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + + if (BuildDFG::isViscIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; + + /************************ Handle VISC Query intrinsics ************************/ + + switch (II->getIntrinsicID()) { + /**************************** llvm.visc.getNode() *****************************/ + case Intrinsic::visc_getNode: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); + // add mapping <intrinsic, this node> to the node-specific map + Leaf_HandleToDFNodeMap[II] = N; + IItoRemove.push_back(II); + } + break; + 
/************************* llvm.visc.getParentNode() **************************/ + case Intrinsic::visc_getParentNode: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); + // get the parent node of the arg node + // get argument node + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // get the parent node of the arg node + // Add mapping <intrinsic, parent node> to the node-specific map + // the argument node must have been added to the map, orelse the + // code could not refer to it + Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); + + IItoRemove.push_back(II); + } + break; + /*************************** llvm.visc.getNumDims() ***************************/ + case Intrinsic::visc_getNumDims: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); + // get node from map + // get the appropriate field + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + int numOfDim = ArgDFNode->getNumOfDim(); + DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); + IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); + + // Replace the result of the intrinsic with the computed value + II->replaceAllUsesWith(numOfDimConstant); + + IItoRemove.push_back(II); + } + break; + /*********************** llvm.visc.getNodeInstanceID() ************************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + case Intrinsic::visc_getNodeInstanceID_z: { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n"); + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + assert(ArgDFNode && "Arg node is NULL"); + // A leaf node always has a parent + DFNode* ParentDFNode = 
ArgDFNode->getParent(); + assert(ParentDFNode && "Parent node of a leaf is NULL"); + + // Get the number associated with the required dimension + // FIXME: The order is important! + // These three intrinsics need to be consecutive x,y,z + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x; + assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); + DEBUG(errs() << "\t dimension = " << dim << "\n"); + + // Argument of the function to be called + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); + + // The following is to find which function to call + Function * OpenCLFunction; + int parentLevel = N->getParent()->getLevel(); + int parentReplFactor = N->getParent()->getNumOfDim(); + DEBUG(errs() << "Parent Level = " << parentLevel << "\n"); + DEBUG(errs() << "Parent Repl factor = " << parentReplFactor << "\n"); + + FunctionType* FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + Type::getInt32Ty(KernelM->getContext()), + false); + if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) { + // We only have one level in the hierarchy or the parent node is not + // replicated. This indicates that the parent node is the kernel + // launch, so we need to specify a global id. 
+ // We can translate this only if the argument is the current node + // itself + DEBUG(errs() << "Substitute with get_global_id()\n"); + DEBUG(errs() << *II << "\n"); + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { + //DEBUG(errs() << "Here inside cond 2\n"); + // We are asking for this node's id with respect to its parent + // this is a local id call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)); + //DEBUG(errs() << "exiting condition 2\n"); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { + // We are asking for this node's parent's id with respect to its + // parent: this is a group id call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)); + } else { + errs() << N->getFuncPointer()->getName() << "\n"; + errs() << N->getParent()->getFuncPointer()->getName() << "\n"; + errs() << *II << "\n"; + + assert(false && "Unable to translate getNodeInstanceID intrinsic"); + } + + //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n"); + //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n"); + //DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); + //DEBUG(errs() << "Argument: " << Args[0] << "\n"); + //DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); + // Create call instruction, insert it before the intrinsic and + // replace the uses of the previous instruction with the new one + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + //DEBUG(errs() << "Replace uses\n"); + II->replaceAllUsesWith(CI); + + IItoRemove.push_back(II); + } + break; + /********************** llvm.visc.getNumNodeInstances() ***********************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { + // TODO: think about whether this 
is the best way to go there are hw + // specific registers. therefore it is good to have the intrinsic but + // then, why do we need to keep that info in the graph? (only for the + // kernel configuration during the call) + + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // A leaf node always has a parent + DFNode* ParentDFNode = ArgDFNode->getParent(); + assert(ParentDFNode && "Parent node of a leaf is NULL"); + + // Get the number associated with the required dimension + // FIXME: The order is important! + // These three intrinsics need to be consecutive x,y,z + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x; + assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); + DEBUG(errs() << "\t dimension = " << dim << "\n"); + + // Argument of the function to be called + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); + + // The following is to find which function to call + Function * OpenCLFunction; + int parentLevel = ParentDFNode->getLevel(); + int parentReplFactor = ParentDFNode->getNumOfDim(); + FunctionType* FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + Type::getInt32Ty(KernelM->getContext()), + false); + + if ((N == ArgDFNode) && (!parentLevel || !parentReplFactor)) { + // We only have one level in the hierarchy or the parent node is not + // replicated. 
This indicates that the parent node is the kernel + // launch, so the instances are global_size (gridDim x blockDim) + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { + // We are asking for this node's instances + // this is a local size (block dim) call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { + // We are asking for this node's parent's instances + // this is a (global_size/local_size) (grid dim) call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)); + } else { + assert(false && "Unable to translate getNumNodeInstances intrinsic"); + } + + // Create call instruction, insert it before the intrinsic and + // replace the uses of the previous instruction with the new one + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + II->replaceAllUsesWith(CI); + + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_barrier: + { + DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n"); + DEBUG(errs() << "Substitute with barrier()\n"); + DEBUG(errs() << *II << "\n"); + FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()), + std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())), + false); + Function* OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(StringRef("barrier"), FT)); + CallInst* CI = CallInst::Create(OpenCLFunction, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)), + "", II); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_atomic_cmpxchg: + break; + case Intrinsic::visc_atomic_add: + case Intrinsic::visc_atomic_sub: + case Intrinsic::visc_atomic_xchg: + case Intrinsic::visc_atomic_min: + case Intrinsic::visc_atomic_umin: + case 
Intrinsic::visc_atomic_max: + case Intrinsic::visc_atomic_umax: + case Intrinsic::visc_atomic_and: + case Intrinsic::visc_atomic_or: + case Intrinsic::visc_atomic_xor: + //case Intrinsic::visc_atomic_inc: + //case Intrinsic::visc_atomic_dec: + { + DEBUG(errs() << *II << "\n"); + // Only have support for i32 atomic intrinsics + assert(II->getType() == Type::getInt32Ty(II->getContext()) + && "Only support i32 atomic intrinsics for now"); + // Substitute with atomicrmw instruction + assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics"); + Value* Ptr = II->getArgOperand(0); + Value* Val = II->getArgOperand(1); + assert(Ptr->getType()->isPointerTy() + && "First argument of supported atomics is expected to be a pointer"); + PointerType* PtrTy = cast<PointerType>(Ptr->getType()); + if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) { + Ptr = CastInst::CreatePointerCast(Ptr, Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()), "", II); + } + AtomicRMWInst* AtomicInst = new AtomicRMWInst(getAtomicOp(II->getIntrinsicID()), + Ptr, Val, AtomicOrdering::SequentiallyConsistent, llvm::CrossThread, II); + AtomicInst->setVolatile(true); + DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n"); + II->replaceAllUsesWith(AtomicInst); + IItoRemove.push_back(II); + } + break; + default: + llvm_unreachable("Unknown VISC Intrinsic!"); + break; + } + + } + else if(CallInst* CI = dyn_cast<CallInst>(I)) { + DEBUG(errs() << "Found a call: " << *CI << "\n"); + Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); + if(calleeF->isDeclaration()) { + // Add the declaration to kernel module + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); + KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) { + // Now handle a few specific intrinsics + // For now, sin and cos are translated to their libclc 
equivalent + switch(II->getIntrinsicID()) { + case Intrinsic::sin: + case Intrinsic::cos: + { + DEBUG(errs() << "Found sincos: " << *II << "\n"); + // Get the libclc function + // libclc uses mangled name for sin cos + assert(II->getType()->isFloatTy() + && "Only handling sin(float) and cos(float)!"); + std::string name; + if(II->getIntrinsicID() == Intrinsic::sin) + name = "_Z3sinf"; + else + name = "_Z3cosf"; + + FunctionType* SinCosFT = FunctionType::get(II->getType(), + Type::getFloatTy(KernelM->getContext()), + false); + Function* LibclcFunction = cast<Function> + (KernelM->getOrInsertFunction(name, SinCosFT)); + CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II); + + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + break; + } + case Intrinsic::floor: + { + DEBUG(errs() << "Found floor intrinsic\n"); + F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f); + FunctionType* FTy = F->getFunctionType(); + DEBUG(errs() << *F << "\n"); + + // Create argument list + std::vector<Value*> args; + assert(CI->getNumArgOperands() == FTy->getNumParams() + && "Number of arguments of call do not match with Intrinsic"); + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + Value* V = CI->getArgOperand(i); + // Either the type should match or both should be of pointer type + assert(V->getType() == FTy->getParamType(i) || + (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()) + && "Dummy function call argument does not match with Intrinsic argument!"); + // If the types do not match, then both must be pointer type and pointer + // cast needs to be performed + if(V->getType() != FTy->getParamType(i)) { + V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); + } + args.push_back(V); + } + // Insert call instruction + CallInst* Inst = CallInst::Create(F, args, + F->getReturnType()->isVoidTy()? 
"" : CI->getName(), CI); + DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); + CI->replaceAllUsesWith(Inst); + IItoRemove.push_back(II); + break; + } + default: + errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ; + } + } + + } + else { + // Clone the function + ValueToValueMapTy VMap; + Function* newCalleeF = CloneFunction(calleeF, VMap); + newCalleeF->removeFromParent(); //TODO: MARIA check + KernelM->getFunctionList().push_back(newCalleeF); + CallInst *CInew = CallInst::Create(newCalleeF, CI->getArgOperand(0), CI->getName(), CI); + CI->replaceAllUsesWith(CInew); + CItoRemove.push_back(CI); + + } + //TODO: how to handle address space qualifiers in load/store + } + + } + + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + (*ri)->eraseFromParent(); + } + for(auto *CI : reverse(CItoRemove)) { + DEBUG(errs() << "Erasing: " << *CI << "\n"); + CI->eraseFromParent(); + + } + + + addCLMetadata(F_nvptx); + kernel->KernelFunction = F_nvptx; + errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; + DEBUG(errs() << *KernelM); + + return; +} + +bool DFG2LLVM_NVPTX::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_NVPTX PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + + // Visitor for Code 
Generation Graph Traversal + CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } + + CGTVisitor->writeKernelsModule(); + + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; + + return true; +} + +std::string CGT_NVPTX::getKernelsModuleName(Module &M) { + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); +} + +void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) { + assert(isa<PointerType>(V->getType()) + && "Value should be of Pointer Type!"); + PointerType* OldTy = cast<PointerType>(V->getType()); + PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. 
+ if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { + if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); + } + } + } +} + + +std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) { + std::vector<unsigned> ConstantMemArgs; + for(auto& arg: F->getArgumentList()) { + std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(), + GlobalMemArgs->end(), arg.getArgNo()); + // It has to be a global memory argument to be promotable + if(pos == GlobalMemArgs->end()) + continue; + + // Check if it can/should be promoted + if(canBePromoted(&arg, F)) { + errs() << "Promoting << " << arg.getName() << " to constant memory."<< "\n"; + ConstantMemArgs.push_back(arg.getArgNo()); + GlobalMemArgs->erase(pos); + } + } + return ConstantMemArgs; +} + +Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { + unsigned idx = 0; + std::vector<Type*> ArgTypes; + for(auto& arg: F->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + unsigned argno = arg.getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(&arg, addrspace); + idx++; + } + ArgTypes.push_back(arg.getType()); + } + FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); + + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + + DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); + return newF; +} + +/* Add metadata to module KernelM, for OpenCL kernels */ +void CGT_NVPTX::addCLMetadata(Function *F) { + + IRBuilder<> Builder(&*F->begin()); + + SmallVector<Metadata*,8> KernelMD; + KernelMD.push_back(ValueAsMetadata::get(F)); + + // TODO: There is additional metadata used by kernel files but we skip them as + // they are not mandatory. 
In future they might be useful to enable + // optimizations + + MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); + + KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); + // TODO: Replace 1 with the number of the kernel. + // Add when support for multiple launces is added + KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1))); + MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); + MDN_annotations->addOperand(MDNvvmAnnotationsNode); + +} + +void CGT_NVPTX::writeKernelsModule() { + + // In addition to deleting all other functions, we also want to spiff it + // up a little bit. Do this now. + legacy::PassManager Passes; + + errs() << "Writing to File --- "; + errs() << getKernelsModuleName(M).c_str() << "\n"; + std::error_code EC; + tool_output_file Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); + if (EC) { + errs() << EC.message() << '\n'; + } + + Passes.add( + createPrintModulePass(Out.os())); + + Passes.run(*KernelM); + + // Declare success. + Out.keep(); +} + +Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { + + DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); + // FIXME: Maybe do that using the Node? 
+ StructType* FRetTy = dyn_cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); + + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); + + + // Check for { } return struct, which means that the function returns void + if (FRetTy->isEmptyTy()) { + + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); + + // Replacing return statements with others returning void + for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(), + e = RItoRemove.end(); i != e; ++i) { + ReturnInst::Create((F->getContext()), 0, (*i)); + (*i)->eraseFromParent(); + } + DEBUG(errs() << "\tChanged return statements to return void\n"); + } + else { + // The struct has return values, thus needs to be converted to parameter + + // Iterate over all element types of return struct and add arguments to the + // function + std::vector<Argument*> Args; + for (unsigned i=0; i<FRetTy->getNumElements(); i++) { + Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); + Args.push_back(RetArg); + DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); + } + + Function::arg_iterator ai, ae; + + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(), + rie = RItoRemove.end(); rii != rie; ++rii) { + ReturnInst* RI = (*rii); + Value* RetVal = RI->getReturnValue(); + for(unsigned i = 0; i < Args.size(); i++) { + ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), + Args[i]->getName()+".val", RI); + new StoreInst(EI, Args[i], RI); + } + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + 
ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + + } + } + DEBUG(errs() << "\tReplaced return statements\n"); + + // Create the argument type list with the added argument's type + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + Type* VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + + // Change the function type + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + //F->eraseFromParent(); + return newF; +} + +/****************************************************************************** + * Helper functions * + ******************************************************************************/ +// Check if argument arg can be promoted to constant memory in Function F +// Condition: +// 1. No stores +// 2. 
Loads not dependent on getNodeInstanceID itrinsic + +static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) { + if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + VisitedList->push_back(V); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); + ui != ue; ++ui) { + Instruction* I = dyn_cast<Instruction>(*ui); + if(!I) { + // if use is not an instruction, then skip it + continue; + } + DEBUG(errs() << "\t" << *I << "\n"); + if(isa<LoadInst>(I)) { + DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); + DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); + UseList->push_back(V); + } + else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { + // found a store in use chain + DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); + return true; + } + else if(BuildDFG::isViscIntrinsic(I)) { + // If it is an atomic intrinsic, we found a store + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic") + && "Only visc atomic intrinsics can have an argument as input"); + return true; + } + else { + DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); + if(findLoadStoreUses(I, UseList, VisitedList)) + return true; + } + } + return false; +} + +static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) { + if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + DependenceList->push_back(V); + // If not an instruction, then not dependent on node instance id + if(!isa<Instruction>(V) || isa<Constant>(V)) { + DEBUG(errs() << "\tStop\n"); + return false; + } + + Instruction* I = cast<Instruction>(V); + for(unsigned i = 0; i < I->getNumOperands(); i++) { + Value* 
operand = I->getOperand(i); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) { + if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { + Value* Node = II->getArgOperand(0); + IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node); + assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n"); + if(GN->getIntrinsicID() == Intrinsic::visc_getNode) { + DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n"); + return true; + } + } + } + if(CmpInst* CI = dyn_cast<CmpInst>(operand)) { + DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n"); + continue; + } + DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n"); + if(isDependentOnNodeInstanceID(operand, DependenceList)) { + return true; + } + } + return false; +} + +// Function to check if argument arg can be changed to a constant memory pointer +static bool canBePromoted(Argument* arg, Function* F) { + DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); + std::vector<Value*> UseList; + std::vector<Value*> VisitedList; + // recursively traverse use chain + // if find a store instruction return false, everything fails, cannot be + // promoted + // if find a load instruction as use, add the GEP instruction to list + bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); + if(foundStore == true) + return false; + // See that the GEP instructions are not dependent on getNodeInstanceID + // intrinsic + DEBUG(errs() << foundStore << "\tNo Store Instruction found. 
Check dependence on node instance ID\n"); + std::vector<Value*>DependenceList; + for(auto U: UseList) { + if(isDependentOnNodeInstanceID(U, &DependenceList)) + return false; + } + DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); + return true; +} + + +// Calculate execute node parameters which include, number of diemnsions for +// dynamic instances of the kernel, local and global work group sizes. +static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value* + &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { + + // Assign number of dimenstions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); + + // If local work group size if null + if(!kernel->hasLocalWG()) { + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); + } + else { + for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if(isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } + LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); + } + + for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if(isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } + + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. 
+ std::vector<Value*> globalWGSizeInsts; + if(kernel->hasLocalWG()) { + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + } + else { + globalWGSizeInsts = kernel->globalWGSize; + } + GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); +} + +// CodeGen for allocating space for Work Group on stack and returning a pointer +// to its address +static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { + Value* WGPtr; + // Get int64_t and or ease of use + Type* Int64Ty = Type::getInt64Ty(M.getContext()); + + // Work Group type is [#dim x i64] + Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); + // Allocate space of Global work group data on stack and get pointer to + // first element. + AllocaInst* WG = new AllocaInst(WGTy, WGName, IB); + WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); + Value* nextDim = WGPtr; + DEBUG(errs() << *WGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for(unsigned i=0; i < WGSize.size(); i++) { + DEBUG(errs() << *WGSize[i] << "\n"); + assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + + if(WGSize[i]->getType() != Int64Ty) { + // If number of dimensions are mentioned in any other integer format, + // generate code to extend it to i64. We need to use the mapped value in + // the new generated function, hence the use of VMap + // FIXME: Why are we changing the kernel WGSize vector here? + DEBUG(errs() << "Not i64. 
Zero extend required.\n"); + DEBUG(errs() << *WGSize[i] << "\n"); + CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); + DEBUG(errs() << "Bitcast done.\n"); + StoreInst* SI = new StoreInst(CI, nextDim, IB); + DEBUG(errs() << "Zero extend done.\n"); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); + + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if(i+1 < WGSize.size()) { + // Move to next dimension + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, + ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), + WG->getName()+"."+Twine(i+1), + IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } + } + return WGPtr; + +} + +// Get generated PTX binary name +static std::string getPTXFilename(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".nvptx.s"); + return moduleID; +} + +// Get the name of the input file from module ID +static std::string getFilenameFromModule(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/")+1); +} + +// Changes the data layout of the Module to be compiled with NVPTX backend +// TODO: Figure out when to call it, probably after duplicating the modules +static void changeDataLayout(Module &M) { + std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; + + if (TARGET_PTX == 32) + M.setDataLayout(StringRef(nvptx32_layoutStr)); + else if (TARGET_PTX == 64) + M.setDataLayout(StringRef(nvptx64_layoutStr)); + else assert(false && "Invalid PTX target"); + + return; +} + +static void changeTargetTriple(Module &M) { + std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; + std::string 
nvptx64_TargetTriple = "nvptx64--nvidiacl"; + + if (TARGET_PTX == 32) + M.setTargetTriple(StringRef(nvptx32_TargetTriple)); + else if (TARGET_PTX == 64) + M.setTargetTriple(StringRef(nvptx64_TargetTriple)); + else assert(false && "Invalid PTX target"); + + return; +} + +// Helper function, populate a vector with all return statements in a function +static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + ReturnInst* RI = dyn_cast<ReturnInst>(I); + if (RI) { + ReturnInstVec.push_back(RI); + } + } +} + +// Helper function, populate a vector with all IntrinsicID intrinsics in a function +static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == IntrinsicID) { + IntrinsicInstVec.push_back(II); + } + } +} + +// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op +static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) { + switch(ID) { + case Intrinsic::visc_atomic_add: + return AtomicRMWInst::Add; + case Intrinsic::visc_atomic_sub: + return AtomicRMWInst::Sub; + case Intrinsic::visc_atomic_min: + return AtomicRMWInst::Min; + case Intrinsic::visc_atomic_umin: + return AtomicRMWInst::UMin; + case Intrinsic::visc_atomic_max: + return AtomicRMWInst::Max; + case Intrinsic::visc_atomic_umax: + return AtomicRMWInst::UMax; + //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc; + //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec; + case Intrinsic::visc_atomic_xchg: + return AtomicRMWInst::Xchg; + case Intrinsic::visc_atomic_and: + return AtomicRMWInst::And; + case Intrinsic::visc_atomic_or: + return AtomicRMWInst::Or; + case Intrinsic::visc_atomic_xor: + return 
AtomicRMWInst::Xor; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; +} + + +// Helper funtion, returns the OpenCL function name, corresponding to atomic op +static std::string getAtomicOpName(Intrinsic::ID ID) { + switch(ID) { + case Intrinsic::visc_atomic_cmpxchg: + return "atom_cmpxchg"; + case Intrinsic::visc_atomic_add: + return "atom_add"; + case Intrinsic::visc_atomic_sub: + return "atom_sub"; + case Intrinsic::visc_atomic_min: + return "atom_min"; + case Intrinsic::visc_atomic_max: + return "atom_max"; + case Intrinsic::visc_atomic_inc: + return "atom_inc"; + case Intrinsic::visc_atomic_dec: + return "atom_dec"; + case Intrinsic::visc_atomic_xchg: + return "atom_xchg"; + case Intrinsic::visc_atomic_and: + return "atom_and"; + case Intrinsic::visc_atomic_or: + return "atom_or"; + case Intrinsic::visc_atomic_xor: + return "atom_xor"; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; +} + +} // End of namespace + +char DFG2LLVM_NVPTX::ID = 0; +static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx", + "Dataflow Graph to LLVM for NVPTX Pass", + false /* does not modify the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports b/lib/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/DFG2LLVM_NVPTX/LLVMBuild.txt b/lib/DFG2LLVM_NVPTX/LLVMBuild.txt new file mode 100644 index 0000000000..fb7cae49f8 --- /dev/null +++ b/lib/DFG2LLVM_NVPTX/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_NVPTX +parent = Transforms diff --git a/lib/DFG2LLVM_PROMISE/CMakeLists.txt b/lib/DFG2LLVM_PROMISE/CMakeLists.txt new file mode 100644 index 0000000000..5b5d2677d0 --- /dev/null +++ b/lib/DFG2LLVM_PROMISE/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMDFG2LLVM_PROMISE + DFG2LLVM_PROMISE.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp b/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp new file mode 100644 index 0000000000..184f92910a --- /dev/null +++ b/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.cpp @@ -0,0 +1,1283 @@ +//=== DFG2LLVM_PROMISE.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#define ENABLE_ASSERTS + +#define DEBUG_TYPE "DFG2LLVM_PROMISE" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h" +#include <sstream> +#include <fstream> + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +namespace { + +cl::opt<std::string> QuantizationInputsFilename( + "quantization-levels-filename", + cl::desc("<PROMISE quantization levels input file (path)>"), + cl::value_desc("filename"), + cl::Required); + +// Helper class declarations + +// State machine definition for pattern identification + +/* An assumption is made for the PROMISE simulator: * + * a leaf node will contain consequtive operations that will map to a * + * single PROMISE simulator call * + + * To alleviate that, the states that correspond to valid patterns * + * - (FullyConnectedLayer_(2,3,x, ConvilutionLayer_(2,3,4,x)) * + * can invoke codeGen when detecting the beginning of a new pattern, then * + * clear the collected IIs and Args, then go to initial and invoke its * + * transition. 
*/ + +class AbstractState; + +class CodeGenStateMachine { +private: + Module *M; + Module *RtM; + + std::ifstream &qin; // Quantization levels input stream reference + std::vector<Value*> Args; + std::vector<IntrinsicInst*> IIs; + AbstractState *current; + +public: + CodeGenStateMachine(Module *, Module *, std::ifstream &); + + void setCurrent(AbstractState *s) { + current = s; + } + + void transition(IntrinsicInst *II); + + Module *getModule() { + return M; + } + + void getNextQuantizationLevel(float &ql) { + qin >> ql; + } + + void addArgument(Value *Arg) { + Args.push_back(Arg); + } + + void addIntrinsicInst(IntrinsicInst *II) { + IIs.push_back(II); + } + + IntrinsicInst *getIntrinsicInstAt(unsigned idx) { + return IIs[idx]; + } + + void codeGen(); + +}; + +class AbstractState { +public: + enum ID + { + INITIAL_STATE, + FULLY_CONNECTED_LAYER_1, + FULLY_CONNECTED_LAYER_2, + FULLY_CONNECTED_LAYER_3, + FULLY_CONNECTED_LAYER, + CONVOLUTION_LAYER_1, + CONVOLUTION_LAYER_2, + CONVOLUTION_LAYER_3, + CONVOLUTION_LAYER_4, + CONVOLUTION_LAYER, + NO_PATTERN, + }; + +protected: + enum ID StateID; + +public: + enum ID getStateID() { + return StateID; + } + + virtual void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) = 0; + virtual ~AbstractState() {} +}; + +class InitialState : public AbstractState { +public: + InitialState() { + StateID = ID::INITIAL_STATE; + DEBUG(errs() << "new InitialState\n"); + } + ~InitialState() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer_1 : public AbstractState { +public: + FullyConnectedLayer_1() { + StateID = ID::FULLY_CONNECTED_LAYER_1; + DEBUG(errs() << "new FullyConnectedLayer_1\n"); + } + ~FullyConnectedLayer_1() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer_2 : public AbstractState { +public: + FullyConnectedLayer_2() { + StateID = ID::FULLY_CONNECTED_LAYER_2; + DEBUG(errs() << "new 
FullyConnectedLayer_2\n"); + } + ~FullyConnectedLayer_2() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer_3 : public AbstractState { +public: + FullyConnectedLayer_3() { + StateID = ID::FULLY_CONNECTED_LAYER_3; + DEBUG(errs() << "new FullyConnectedLayer_3\n"); + } + ~FullyConnectedLayer_3() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer : public AbstractState { +public: + FullyConnectedLayer() { + StateID = ID::FULLY_CONNECTED_LAYER; + DEBUG(errs() << "new FullyConnectedLayer\n"); + } + ~FullyConnectedLayer() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_1 : public AbstractState { +public: + ConvolutionLayer_1() { + StateID = ID::CONVOLUTION_LAYER_1; + DEBUG(errs() << "new ConvolutionLayer_1\n"); + } + ~ConvolutionLayer_1() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_2 : public AbstractState { +public: + ConvolutionLayer_2() { + StateID = ID::CONVOLUTION_LAYER_2; + DEBUG(errs() << "new ConvolutionLayer_2\n"); + } + ~ConvolutionLayer_2() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_3 : public AbstractState { +public: + ConvolutionLayer_3() { + StateID = ID::CONVOLUTION_LAYER_3; + DEBUG(errs() << "new ConvolutionLayer_3\n"); + } + ~ConvolutionLayer_3() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_4 : public AbstractState { +public: + ConvolutionLayer_4() { + StateID = ID::CONVOLUTION_LAYER_4; + DEBUG(errs() << "new ConvolutionLayer_4\n"); + } + ~ConvolutionLayer_4() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer : public AbstractState { +public: + ConvolutionLayer() { + StateID = ID::CONVOLUTION_LAYER; + DEBUG(errs() << "new 
ConvolutionLayer\n"); + } + ~ConvolutionLayer() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class NoPattern : public AbstractState { +public: + NoPattern() { + StateID = ID::NO_PATTERN; + DEBUG(errs() << "new NoPattern\n"); + } + ~NoPattern() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +void InitialState::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_convolution: + { + Mch->addIntrinsicInst(II); + Mch->addArgument(II->getOperand(0)); // conv input + + // Read quantization levels for input + float i_min, i_max; + Mch->getNextQuantizationLevel(i_min); + Mch->getNextQuantizationLevel(i_max); + errs() << "i_min: " << i_min << "\n"; + errs() << "i_max: " << i_max << "\n"; + + // Create associated arguments for the quantization levels + Constant *IminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) i_min); +// errs() << "IminC : " +// << dyn_cast<ConstantFP>(IminC)->getValueAPF().convertToFloat() +// << "\n"; + Mch->addArgument(IminC); + Constant *ImaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) i_max); + Mch->addArgument(ImaxC); + + Mch->addArgument(II->getOperand(1)); // conv kernel + + // Read quantization levels for filter + float w_min, w_max; + Mch->getNextQuantizationLevel(w_min); + Mch->getNextQuantizationLevel(w_max); + errs() << "w_min: " << w_min << "\n"; + errs() << "w_max: " << w_max << "\n"; + Constant *WminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) w_min); + Mch->addArgument(WminC); + Constant *WmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) w_max); + Mch->addArgument(WmaxC); + + Mch->setCurrent(new ConvolutionLayer_1()); + } + break; + case Intrinsic::visc_tensor_mul: + { + Mch->addIntrinsicInst(II); + 
Mch->addArgument(II->getOperand(0)); // 1st gemm input + + // Read quantization levels for input + float i_min, i_max; + Mch->getNextQuantizationLevel(i_min); + Mch->getNextQuantizationLevel(i_max); + errs() << "i_min: " << i_min << "\n"; + errs() << "i_max: " << i_max << "\n"; + + Constant *IminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) i_min); + Mch->addArgument(IminC); + Constant *ImaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) i_max); + Mch->addArgument(ImaxC); + + Mch->addArgument(II->getOperand(1)); // 2nd gemm input + + // Read quantization levels for weight + float w_min, w_max; + Mch->getNextQuantizationLevel(w_min); + Mch->getNextQuantizationLevel(w_max); + errs() << "w_min: " << w_min << "\n"; + errs() << "w_max: " << w_max << "\n"; + + Constant *WminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) w_min); + Mch->addArgument(WminC); + Constant *WmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) w_max); + Mch->addArgument(WmaxC); + + Mch->setCurrent(new FullyConnectedLayer_1()); + } + break; + default: // Other HPVM intrinsic + Mch->setCurrent(new NoPattern()); + break; + } + delete this; + } // else {} // No HPVM intrinsic received. 
Remain at initial +} + +void FullyConnectedLayer_1::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_add: + { + IntrinsicInst *MulII = Mch->getIntrinsicInstAt(0); + assert((MulII == II->getOperand(0)) && + "Output of mul must be used as 1st operand of add"); + Mch->addIntrinsicInst(II); + + Mch->addArgument(II->getOperand(1)); // bias + + // Read quantization levels for input + float b_min, b_max; + Mch->getNextQuantizationLevel(b_min); + Mch->getNextQuantizationLevel(b_max); + errs() << "b_min: " << b_min << "\n"; + errs() << "b_max: " << b_max << "\n"; + + Constant *BminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) b_min); + Mch->addArgument(BminC); + Constant *BmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) b_max); + Mch->addArgument(BmaxC); + + Mch->setCurrent(new FullyConnectedLayer_2()); + } + break; + default: + Mch->setCurrent(new NoPattern()); + break; + } + } else { + Mch->setCurrent(new NoPattern()); + } + delete this; +} + +void FullyConnectedLayer_2::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_tanh: + { + // Type of activation : TanH + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + 
Mch->addIntrinsicInst(II); + + Mch->setCurrent(new FullyConnectedLayer_3()); + } + break; + case Intrinsic::visc_tensor_relu: + { + // Type of activation : ReLU + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new FullyConnectedLayer_3()); + } + break; + case Intrinsic::visc_tensor_clipped_relu: + { + // Type of activation : Clipped ReLU + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new FullyConnectedLayer_3()); + } + break; + default: // No activation, but HPVM intrinsic + Mch->setCurrent(new NoPattern()); + break; + } + } else { // End of instruction stream + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + + // Read quantization levels for output + float out_min, out_max; + 
Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new FullyConnectedLayer()); + } + delete this; +} + +void FullyConnectedLayer_3::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (!II) { // End of instruction stream + Mch->setCurrent(new FullyConnectedLayer()); + } else { + Mch->setCurrent(new NoPattern()); + } + delete this; +} + +void FullyConnectedLayer::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + Mch->setCurrent(new NoPattern()); + delete this; + } +} + +void ConvolutionLayer_1::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_add: + { + IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0); + assert((ConvII == II->getOperand(0)) && + "Output of conv must be used as 1st operand of add"); + Mch->addIntrinsicInst(II); + + Mch->addArgument(II->getOperand(1)); // bias + // Read quantization levels for bias + float b_min, b_max; + Mch->getNextQuantizationLevel(b_min); + Mch->getNextQuantizationLevel(b_max); + errs() << "b_min: " << b_min << "\n"; + errs() << "b_max: " << b_max << "\n"; + + Constant *BminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) b_min); + Mch->addArgument(BminC); + Constant *BmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) b_max); + Mch->addArgument(BmaxC); + + Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv + Mch->addArgument(ConvII->getOperand(3)); // 
2nd numeric arg of conv + Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv + Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv + + Mch->setCurrent(new ConvolutionLayer_2()); + } + break; + default: + Mch->setCurrent(new NoPattern()); + break; + } + } else { + // No addition + Mch->addArgument(ConstantPointerNull::get( + Type::getInt8PtrTy(Mch->getModule()->getContext()))); + // Still need to add the quantization constants - and remove them from file + float b_min, b_max; + Mch->getNextQuantizationLevel(b_min); + Mch->getNextQuantizationLevel(b_max); + errs() << "b_min: " << b_min << "\n"; + errs() << "b_max: " << b_max << "\n"; + Constant *BminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) b_min); + Mch->addArgument(BminC); + Constant *BmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) b_max); + Mch->addArgument(BmaxC); + + // Zero for all convolution numeric arguments FIXME??? + IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0); + Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv + Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv + Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv + Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv +// Mch->addArgument(ConstantInt::get( +// Type::getInt32Ty(Mch->getModule()->getContext()), 0)); +// Mch->addArgument(ConstantInt::get( +// Type::getInt32Ty(Mch->getModule()->getContext()), 0)); +// Mch->addArgument(ConstantInt::get( +// Type::getInt32Ty(Mch->getModule()->getContext()), 0)); +// Mch->addArgument(ConstantInt::get( +// Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + + // No pooling + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // 0 for unused pool argument + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // No activation + 
Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer()); + } + delete this; +} + +void ConvolutionLayer_2::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_tanh: + { + // Type of activation : TanH +// Mch->addArgument(ConstantInt::get( +// Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_3()); + } + break; + case Intrinsic::visc_tensor_relu: + { + // Type of activation : ReLU +// Mch->addArgument(ConstantInt::get( +// Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_3()); + } + break; + case Intrinsic::visc_tensor_clipped_relu: + { + // Type of activation : Clipped ReLU +// Mch->addArgument(ConstantInt::get( +// Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_3()); + } + break; + case Intrinsic::visc_tensor_pool_max: + { + // pool max + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // poolSize + Mch->addArgument(II->getOperand(1)); + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + Mch->addIntrinsicInst(II); + + // Read 
quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_min: + { + // pool min FIXME: 2: supported? + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + // poolSize + Mch->addArgument(II->getOperand(1)); + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + Mch->addIntrinsicInst(II); + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_mean: + { + // pool mean + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + // poolSize + Mch->addArgument(II->getOperand(1)); + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + Mch->addIntrinsicInst(II); + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + 
Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + default: // No activation, No pooling, but HPVM intrinsic + Mch->setCurrent(new NoPattern()); + break; + } + } else { // End of instruction stream + // No pooling + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // 0 for unused pool argument + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer()); + } + delete this; +} + +void ConvolutionLayer_3::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_pool_max: + { + // pool max + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // poolSize + Mch->addArgument(II->getOperand(1)); + Mch->addIntrinsicInst(II); + + // Revisit last intrinsic, to add argument 
for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + } + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_min: + { + // pool min FIXME: 2: supported? 
+ Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + // poolSize + Mch->addArgument(II->getOperand(1)); + Mch->addIntrinsicInst(II); + + // Revisit last intrinsic, to add argument for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + } + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_mean: + { + // pool max + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + // poolSize + Mch->addArgument(II->getOperand(1)); + Mch->addIntrinsicInst(II); + + // Revisit last intrinsic, to add argument for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + 
Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + } + + // Read quantization levels for output + float out_min, out_max; + Mch->getNextQuantizationLevel(out_min); + Mch->getNextQuantizationLevel(out_max); + errs() << "out_min: " << out_min << "\n"; + errs() << "out_max: " << out_max << "\n"; + + Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_min); + Mch->addArgument(OutminC); + Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()), + (double) out_max); + Mch->addArgument(OutmaxC); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + default: // No pooling, but HPVM intrinsic + Mch->setCurrent(new NoPattern()); + break; + } + } else { // End of instruction stream + // No pooling + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // 0 for unused pool argument + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + + // Revisit last intrinsic, to add argument for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + Mch->addArgument(ConstantInt::get( + 
Type::getInt32Ty(Mch->getModule()->getContext()), 2));
+ }
+
+ // Read quantization levels for output
+ float out_min, out_max;
+ Mch->getNextQuantizationLevel(out_min);
+ Mch->getNextQuantizationLevel(out_max);
+ errs() << "out_min: " << out_min << "\n";
+ errs() << "out_max: " << out_max << "\n";
+
+ Constant *OutminC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+ (double) out_min);
+ Mch->addArgument(OutminC);
+ Constant *OutmaxC = ConstantFP::get(Type::getFloatTy(Mch->getModule()->getContext()),
+ (double) out_max);
+ Mch->addArgument(OutmaxC);
+
+ Mch->setCurrent(new ConvolutionLayer());
+ }
+ delete this;
+}
+
+void ConvolutionLayer_4::transition(CodeGenStateMachine *Mch,
+ IntrinsicInst *II) {
+ if (!II) { // End of instruction stream
+ Mch->setCurrent(new ConvolutionLayer());
+ } else {
+ Mch->setCurrent(new NoPattern());
+ }
+ delete this;
+}
+
+void ConvolutionLayer::transition(CodeGenStateMachine *Mch,
+ IntrinsicInst *II) {
+ if (II) { // Not end of instruction stream
+ Mch->setCurrent(new NoPattern());
+ delete this;
+ }
+}
+
+void NoPattern::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) {}
+
+CodeGenStateMachine::CodeGenStateMachine(Module *_M, Module *_RtM, std::ifstream &_qin) :
+ M(_M), RtM(_RtM), qin(_qin) {
+ current = new InitialState();
+}
+
+void CodeGenStateMachine::transition(IntrinsicInst *II) {
+ current->transition(this, II);
+}
+
+void CodeGenStateMachine::codeGen() {
+
+ if ((current->getStateID() != AbstractState::ID::FULLY_CONNECTED_LAYER) &&
+ (current->getStateID() != AbstractState::ID::CONVOLUTION_LAYER)) {
+ // Not a valid instruction sequence.
+ assert(false && "Unsupported instruction sequence by PROMISE simulator\n");
+ }
+
+ // We have a valid instruction sequence.
+ // Make sure that the instruction sequence can be translated:
+ // each instruction's result must be used only by the next one in sequence.
+ for (unsigned p = 0; p < IIs.size()-1; p++) { + IntrinsicInst *II = IIs[p]; + assert((II->hasOneUse()) && + "Instruction sequence does not fit expected pattern: not single use\n"); + + Value::user_iterator ui = II->user_begin(); // The only use + assert((*ui == IIs[p+1]) && + "Instruction sequence does not fit expected pattern: not used by next instruction\n"); + } + + // Create corresponding PROMISE simulator call + CallInst *CI; + switch (current->getStateID()) { + case AbstractState::ID::CONVOLUTION_LAYER: + { + Constant* ConvLayer_PROMISE = + M->getOrInsertFunction(StringRef("ConvLayer_PROMISE"), + RtM->getFunction(StringRef("ConvLayer_PROMISE"))->getFunctionType()); + DEBUG(errs() << *ConvLayer_PROMISE); + + // FIXME: get last argument from some intrinsic. For now, 7 + Args.push_back(ConstantInt::get(Type::getInt32Ty(M->getContext()), 7)); + // Create PROMISE simulator function call + CI = CallInst::Create(ConvLayer_PROMISE, Args, ""); + } + break; + case AbstractState::ID::FULLY_CONNECTED_LAYER: + { + Constant* FCLayer_PROMISE = + M->getOrInsertFunction(StringRef("FCLayer_PROMISE"), + RtM->getFunction(StringRef("FCLayer_PROMISE"))->getFunctionType()); + DEBUG(errs() << *FCLayer_PROMISE); + + // FIXME: get last argument from some intrinsic. For now, 7 + Args.push_back(ConstantInt::get(Type::getInt32Ty(M->getContext()), 7)); + // Create PROMISE simulator function call + CI = CallInst::Create(FCLayer_PROMISE, Args, ""); + } + break; + default: + llvm_unreachable("Unexpected CodeGenStateMachine State\n"); + break; + } + + // Insert new call and replace all uses of pattern result with + // the PROMISE simulator call + IntrinsicInst *IIlast = *(IIs.rbegin()); + CI->insertBefore(IIlast); + IIlast->replaceAllUsesWith(CI); + + // Remove the instructions we translated to the simulator call. + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around. 
+ for (std::vector<IntrinsicInst *>::reverse_iterator ri = IIs.rbegin(), + re = IIs.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + (*ri)->eraseFromParent(); + } +errs() << "****** GenF:\n" << *(CI->getParent()->getParent()); + +} + +// DFG2LLVM_PROMISE - The first implementation. + +struct DFG2LLVM_PROMISE : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_PROMISE() : DFG2LLVM(ID) {} +private: + +public: + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); + } + + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_PROMISE : public CodeGenTraversal { + +private: + //Member variables + std::ifstream qin; + + // VISC Runtime API and Tensor runtime API + Constant* llvm_hpvm_initTensorRt; + Constant* llvm_hpvm_cleanupTensorRt; + Constant* hpvm_request_tensor; + + // Functions + + // Virtual Functions + void init(); + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + + // Constructor + CGT_PROMISE(Module &_M, BuildDFG &_DFG, std::string &_str) : CodeGenTraversal(_M, _DFG) { + qin.open(_str.c_str()); + assert(qin && "Failed to open quantization levels input file\n"); + initRuntimeAPI(); + } + + ~CGT_PROMISE() { + qin.close(); + } + +}; + +void CGT_PROMISE::init() { + // FIXME: what to do here? If anything? +} + +// Initialize the VISC runtime API. 
This makes it easier to insert these calls +void CGT_PROMISE::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n"); + + // FIXME: set correct path + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_runtime.ll"; + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + if(runtimeModule == nullptr) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n"); + + // Get or insert Global declarations for + // - initialization + // - cleanup + // - request a tensor + DECLARE(llvm_hpvm_initTensorRt); + DECLARE(llvm_hpvm_cleanupTensorRt); + DECLARE(hpvm_request_tensor); + + // Find visc.init and visc.cleanup calls, and add placeholder methods + // for initialization and cleanup of the hpvm tensor runtime + + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n"); + InitCall = cast<Instruction>(*VI->user_begin()); + CallInst::Create(llvm_hpvm_initTensorRt, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)), + "", InitCall); + + Function* VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n"); + CleanupCall = cast<Instruction>(*VC->user_begin()); + CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall); + +} + +void CGT_PROMISE::codeGen(DFInternalNode* N) { + errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"; + errs () << "Skipping internal node\n"; +} + +void CGT_PROMISE::codeGen(DFLeafNode* N) { + + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Abort code generation if it is an allocation node + 
if(N->isAllocationNode()) { + assert(false && "Allocation Node not expected in ApproxHPVM"); + return; + } + + // Generate code only if it has the right hint + if (!checkPreferredTarget(N, visc::PROMISE_TARGET)) { + errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + return; + } + + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); +errs() << "Node Function: " << *F << "\n"; + // Look up if we have visited this function before. If we have, then just + // get the cloned function pointer from DFNode. Otherwise, create the cloned + // function and add it to the DFNode GenFunc. + Function *F_promise = N->getGenFuncForTarget(visc::PROMISE_TARGET); + + assert((F_promise == NULL) && + "Error: Visiting a node for which code already generated"); + + // Clone the function + ValueToValueMapTy VMap; + std::string FName(F->getName().data());//Twine FName = F->getName(); + F_promise = CloneFunction(F, VMap); + F_promise->setName(FName+"_promise"); + F_promise->removeFromParent(); + M.getFunctionList().push_back(F_promise); + + N->addGenFunc(F_promise, visc::PROMISE_TARGET, true); + + /* Removing HPVM in/out/inout function attributes */ + for(Function::arg_iterator ai = F_promise->arg_begin(), ae = F_promise->arg_end(); + ai != ae; ai++){ + Argument *Arg = &*ai; + if(Arg->hasAttribute(Attribute::In)) + Arg->removeAttr(Attribute::In); + if(Arg->hasAttribute(Attribute::Out)) + Arg->removeAttr(Attribute::Out); + if(Arg->hasAttribute(Attribute::InOut)) + Arg->removeAttr(Attribute::InOut); + } + + // Adding nounwind to generated function : FIXME: needed? + DEBUG(errs() << "Adding nounwind to generated function\n"); + F_promise->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + + // Add llvm_visc_requestTensor calls for every pointer argument of the function + // (they are all expected to be tensors), at the beginning of the function. 
+ // This is the first instruction of the function, insert them before this
+ Instruction* FI = &*(F_promise->getEntryBlock().begin());
+
+ // FIXME: verify that we want 0 as a target device
+ // In this backend, the target device is CPU, represented by i32 0.
+ ConstantInt *TargetDeviceID =
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+
+ for (Function::arg_iterator ai = F_promise->arg_begin(),
+ ae = F_promise->arg_end(); ai != ae; ++ai) {
+ Argument* Arg = &*ai;
+ if (Arg->getType()->isPointerTy()) {
+ Value *Args[] = {Arg, TargetDeviceID};
+ CallInst::Create(hpvm_request_tensor,
+ ArrayRef<Value*>(Args, 2),
+ "", FI);
+ }
+ }
+
+ CodeGenStateMachine CGM(&M, runtimeModule.get(), qin);
+
+ /* An assumption is made for the PROMISE simulator: *
+ * a leaf node will contain consecutive operations that will map to a *
+ * single PROMISE simulator call */
+
+ for (inst_iterator i = inst_begin(F_promise), e = inst_end(F_promise);
+ i != e; ++i) {
+ Instruction *I = &(*i);
+ CGM.transition(dyn_cast<IntrinsicInst>(I));
+ }
+
+ CGM.codeGen();
+
+//errs() << "-----------------------------------\n";
+//errs() << *F_promise << "\n";
+
+ return;
+}
+
+bool DFG2LLVM_PROMISE::runOnModule(Module &M) {
+ errs() << "\nDFG2LLVM_PROMISE PASS\n";
+
+ errs() << QuantizationInputsFilename << "\n";
+
+// std::ifstream qin(quantizationInputsFilename_cstr);
+// std::ifstream qin;
+// qin.open(QuantizationInputsFilename.c_str());
+// qin.open(QuantizationInputsFilename.c_str(), std::ifstream::in);
+
+ // Get the BuildDFG Analysis Results:
+ // - Dataflow graph
+ BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+ std::vector<DFInternalNode*> Roots = DFG.getRoots();
+
+ // Visitor for Code Generation Graph Traversal
+ CGT_PROMISE *CGTVisitor = new CGT_PROMISE(M, DFG, QuantizationInputsFilename);
+
+ // Iterate over all the DFGs and produce code for each one of them
+ for (auto rootNode: Roots) {
+ // Initiate code generation for root DFNode
+ CGTVisitor->visit(rootNode);
+ }
+
+ 
//TODO: Edit module epilogue to remove the VISC intrinsic declarations
+ delete CGTVisitor;
+
+ return true;
+}
+
+
+/******************************************************************************
+ * Helper functions *
+ ******************************************************************************/
+
+} // End of namespace
+
+char DFG2LLVM_PROMISE::ID = 0;
+static RegisterPass<DFG2LLVM_PROMISE> X("dfg2llvm-promise",
+ "Dataflow Graph to LLVM for PROMISE Pass",
+ false /* does not modify the CFG */,
+ true /* transformation, *
+ * not just analysis */);
+
diff --git a/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports b/lib/DFG2LLVM_PROMISE/DFG2LLVM_PROMISE.exports
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/lib/DFG2LLVM_PROMISE/LLVMBuild.txt b/lib/DFG2LLVM_PROMISE/LLVMBuild.txt
new file mode 100644
index 0000000000..714ad14f18
--- /dev/null
+++ b/lib/DFG2LLVM_PROMISE/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./lib/Transforms/DFG2LLVM_PROMISE/LLVMBuild.txt ----------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_PROMISE +parent = Transforms diff --git a/lib/DFG2LLVM_SPIR/CMakeLists.txt b/lib/DFG2LLVM_SPIR/CMakeLists.txt new file mode 100644 index 0000000000..43e2254c79 --- /dev/null +++ b/lib/DFG2LLVM_SPIR/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMDFG2LLVM_SPIR + DFG2LLVM_SPIR.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp new file mode 100644 index 0000000000..48b1492047 --- /dev/null +++ b/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp @@ -0,0 +1,2010 @@ +//=== DFG2LLVM_SPIR.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define ENABLE_ASSERTS +#define TARGET_PTX 32 +#define GENERIC_ADDRSPACE 0 +#define GLOBAL_ADDRSPACE 1 +#define SHARED_ADDRSPACE 3 + +#define DEBUG_TYPE "DFG2LLVM_SPIR" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Attributes.h" +#include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm-c/Core.h" + +#include "llvm/SupportVISC/VISCUtils.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/IR/UseListOrder.h" + +#include <sstream> + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; +using namespace viscUtils; + +// VISC Command line option to use timer or not +static cl::opt<bool> +VISCTimer_SPIR("visc-timers-spir", cl::desc("Enable visc timers")); + +namespace { +// Helper class declarations + +// Class to maintain the tuple of host pointer, device pointer and size +// in bytes. 
Would have preferred to use tuple but support not yet available +class OutputPtr { +public: + OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + + Value* h_ptr; + Value* d_ptr; + Value* bytes; +}; + +// Class to maintain important kernel info required for generating runtime +// calls +class Kernel { +public: + Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap = + std::map<unsigned, unsigned>(), + std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap = + std::map<unsigned, std::pair<Value*, unsigned> >(), + std::vector<unsigned> _outArgMap = std::vector<unsigned>(), + unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), + unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) + : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), + sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim), + globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) { + + assert(gridDim == globalWGSize.size() + && "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() + && "blockDim should be same as the size of vector localWGSize"); + } + + Function* KernelFunction; + DFLeafNode* KernelLeafNode; + std::map<unsigned, unsigned> inArgMap; + // Map for shared memory arguments + std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap; + // Fields for (potential) allocation node + DFLeafNode* AllocationNode; + Function* AllocationFunction; + std::map<unsigned, unsigned> allocInArgMap; + + std::vector<unsigned> outArgMap; + unsigned gridDim; + std::vector<Value*> globalWGSize; + unsigned blockDim; + std::vector<Value*> localWGSize; + std::vector<int> localDimMap; + + std::map<unsigned, unsigned> getInArgMap() { + return inArgMap; + } + void setInArgMap(std::map<unsigned, unsigned> map) { + inArgMap = map; + } + + 
std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap() { + return sharedInArgMap; + } + void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) { + sharedInArgMap = map; + } + + std::vector<unsigned> getOutArgMap() { + return outArgMap; + } + void setOutArgMap(std::vector<unsigned> map) { + outArgMap = map; + } + + void setLocalWGSize(std::vector<Value*> V) { + localWGSize = V; + } + + bool hasLocalWG() { + return blockDim != 0; + } +}; + +// Helper function declarations +static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*, + ValueToValueMapTy&, Instruction*); +static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&, + Instruction*, const Twine& WGName = "WGSize"); +static std::string getSPIRFilename(const Module&); +static std::string getFilenameFromModule(const Module& M); +static void changeDataLayout(Module &); +static void changeTargetTriple(Module &); +static std::string printType(Type*); +static StringRef getMangledName(std::string); +static StringRef getAtomicMangledName(std::string, unsigned, bool); +static void findReturnInst(Function *, std::vector<ReturnInst *> &); +static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &); +static StringRef getAtomicOpName(Intrinsic::ID, unsigned); +static std::string getMathFunctionName(Intrinsic::ID); + +// DFG2LLVM_SPIR - The first implementation. 
+struct DFG2LLVM_SPIR : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_SPIR() : DFG2LLVM(ID) {} + +private: + +public: + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_SPIR : public CodeGenTraversal { + +private: + //Member variables + std::unique_ptr<Module> KernelM; + DFNode* KernelLaunchNode = nullptr; + Kernel* kernel; + + // VISC Runtime API + Constant* llvm_visc_ocl_launch; + Constant* llvm_visc_ocl_wait; + Constant* llvm_visc_ocl_initContext; + Constant* llvm_visc_ocl_clearContext; + Constant* llvm_visc_ocl_argument_shared; + Constant* llvm_visc_ocl_argument_scalar; + Constant* llvm_visc_ocl_argument_ptr; + Constant* llvm_visc_ocl_output_ptr; + Constant* llvm_visc_ocl_free; + Constant* llvm_visc_ocl_getOutput; + Constant* llvm_visc_ocl_executeNode; + + //Functions + std::string getKernelsModuleName(Module &M); + void fixValueAddrspace(Value* V, unsigned addrspace); + Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned i); + void removeAttributeAtArguments(Function* F, std::vector<unsigned> &Ags, Attribute::AttrKind attrKind); + void addCLMetadata(Function* F); + Function* transformFunctionToVoid(Function* F); + void removeInOutAttributes(Function* F); + void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); + + // Virtual Functions + void init() { + VISCTimer = VISCTimer_SPIR; + TargetName = "SPIR"; + } + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + + // Constructor + CGT_SPIR(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(&_M)) { + KernelLaunchNode = NULL; + init(); + initRuntimeAPI(); + errs() << "Old module pointer: " << &_M << "\n"; + errs() << "New module pointer: " << KernelM.get() << "\n"; + // Copying instead of creating new, in order to preserve required info (metadata) + // Remove functions, global 
variables and aliases + std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>(); + for (Module::global_iterator mi = KernelM->global_begin(), + me = KernelM->global_end(); (mi != me); ++mi) { + GlobalVariable* gv = &*mi; + gvv.push_back(gv); + } + for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } + + std::vector<Function*> fv = std::vector<Function*>(); + for (Module::iterator mi = KernelM->begin(), + me = KernelM->end(); (mi != me); ++mi) { + Function* f = &*mi; + fv.push_back(f); + } + for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } + + std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>(); + for (Module::alias_iterator mi = KernelM->alias_begin(), + me = KernelM->alias_end(); (mi != me); ++mi) { + GlobalAlias* a = &*mi; + av.push_back(a); + } + for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) { + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } + + changeDataLayout(*KernelM); + changeTargetTriple(*KernelM); + + DEBUG(errs() << *KernelM); + + } + + void removeLLVMIntrinsics(); + void writeKernelsModule(); +}; + +// Initialize the VISC runtime API. 
This makes it easier to insert these calls +void CGT_SPIR::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); + + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/../build/projects/visc-rt/visc-rt.ll"; + errs() << "Open file: " << runtimeAPI.str() << "\n"; + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + if(runtimeModule == NULL) + DEBUG(errs() << Err.getMessage()); + else + errs() << "Successfully loaded visc-rt API module\n"; + + // Get or insert the global declarations for launch/wait functions + DECLARE(llvm_visc_ocl_launch); + DECLARE(llvm_visc_ocl_wait); + DECLARE(llvm_visc_ocl_initContext); + DECLARE(llvm_visc_ocl_clearContext); + DECLARE(llvm_visc_ocl_argument_shared); + DECLARE(llvm_visc_ocl_argument_scalar); + DECLARE(llvm_visc_ocl_argument_ptr); + DECLARE(llvm_visc_ocl_output_ptr); + DECLARE(llvm_visc_ocl_free); + DECLARE(llvm_visc_ocl_getOutput); + DECLARE(llvm_visc_ocl_executeNode); + + // Get or insert timerAPI functions as well if you plan to use timers + initTimerAPI(); + + // Insert init context in main + DEBUG(errs() << "Gen Code to initialize SPIR Timer\n"); + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); + + InitCall = cast<Instruction>(*VI->user_begin()); + initializeTimerSet(InitCall); + switchToTimer(visc_TimerID_INIT_CTX, InitCall); + CallInst::Create(llvm_visc_ocl_initContext, + ArrayRef<Value*>(getTargetID(M, visc::SPIR_TARGET)), + "", InitCall); + switchToTimer(visc_TimerID_NONE, InitCall); + + // Insert print instruction at visc exit + DEBUG(errs() << "Gen Code to print SPIR Timer\n"); + Function* VC = M.getFunction("llvm.visc.cleanup"); + DEBUG(errs() << *VC << "\n"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once"); + + CleanupCall = 
cast<Instruction>(*VC->user_begin()); + printTimerSet(CleanupCall); + + +} + +// Generate Code to call the kernel +// The plan is to replace the internal node with a leaf node. This method is +// used to generate a function to associate with this leaf node. The function +// is responsible for all the memory allocation/transfer and invoking the +// kernel call on the device +void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { + // Check if clone already exists. If it does, it means we have visited this + // function before. +// assert(N->getGenFunc() == NULL && "Code already generated for this node"); + + assert(N->getGenFuncForTarget(visc::SPIR_TARGET) == NULL && + "Code already generated for this node"); + + // Useful values + Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + + // If kernel struct has not been initialized with kernel function, then fail + assert(K != NULL && "No kernel found!!"); + + DEBUG(errs() << "Generating kernel call code\n"); + + Function* F = N->getFuncPointer(); + + + // Create of clone of F with no instructions. Only the type is the same as F + // without the extra arguments. + Function* F_X86; + + // Clone the function, if we are seeing this function for the first time. We + // only need a clone in terms of type. + ValueToValueMapTy VMap; + + // Create new function with the same type + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + + // Loop over the arguments, copying the names of arguments over. + Function::arg_iterator dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); // Copy the name over... 
+ // Increment dest iterator
+ ++dest_iterator;
+ }
+
+ // Add a basic block to this empty function
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
+ ReturnInst* RI = ReturnInst::Create(M.getContext(),
+ UndefValue::get(F_X86->getReturnType()), BB);
+
+ // FIXME: Adding Index and Dim arguments are probably not required except
+ // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
+ // have those arguments)
+
+ // Add Index and Dim arguments except for the root node
+ if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+ F_X86 = addIdxDimArgs(F_X86);
+
+ BB = &*F_X86->begin();
+ RI = cast<ReturnInst>(BB->getTerminator());
+
+ //Add the generated function info to DFNode
+// N->setGenFunc(F_X86, visc::CPU_TARGET);
+ N->addGenFunc(F_X86, visc::SPIR_TARGET, true);
+
+ // Loop over the arguments, to create the VMap
+ dest_iterator = F_X86->arg_begin();
+ for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+ i != e; ++i) {
+ // Add mapping to VMap and increment dest iterator
+ VMap[&*i] = &*dest_iterator;
+ ++dest_iterator;
+ }
+
+ /* TODO: Use this code to verify if this is a good pattern for OCL kernel
+
+ // Sort children in topological order before code generation for kernel call
+ N->getChildGraph()->sortChildren();
+
+ // The DFNode N has the property that it has only one child (leaving Entry
+ // and Exit dummy nodes). This child is the OCL kernel. This simplifies code
+ // generation for kernel calls significantly. All the inputs to this child
+ // node would either be constants or from the parent node N.
+ + assert(N->getChildGraph()->size() == 3 + && "Node expected to have just one non-dummy node!"); + + DFNode* C; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + C = *ci; + // Skip dummy node call + if (!C->isDummyNode()) + break; + } + + assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); + + Function* CF = C->getFuncPointer(); + */ + Function* KF = K->KernelLeafNode->getFuncPointer(); + // Initialize context + //DEBUG(errs() << "Initializing context" << "\n"); + //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); + + DEBUG(errs() << "Initializing commandQ" << "\n"); + // Initialize command queue + switchToTimer(visc_TimerID_SETUP, InitCall); + Value* fileStr = getStringPointer(FileName, InitCall, "Filename"); + DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); + DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n"); + Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName"); + + Value* LaunchInstArgs[] = {fileStr, kernelStr}; + + DEBUG(errs() << "Inserting launch call" << "\n"); + CallInst* SPIR_Ctx = CallInst::Create(llvm_visc_ocl_launch, + ArrayRef<Value*>(LaunchInstArgs, 2), + "graph"+KF->getName(), + InitCall); + DEBUG(errs() << *SPIR_Ctx << "\n"); + GraphIDAddr = new GlobalVariable(M, + SPIR_Ctx->getType(), + false, + GlobalValue::CommonLinkage, + Constant::getNullValue(SPIR_Ctx->getType()), + "graph"+KF->getName()+".addr"); + DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n"); + StoreInst* SI = new StoreInst(SPIR_Ctx, GraphIDAddr, InitCall); + DEBUG(errs() << *SI << "\n"); + switchToTimer(visc_TimerID_NONE, InitCall); + switchToTimer(visc_TimerID_SETUP, RI); + Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI); + + // Iterate over the required input edges of the node and use the visc-rt API + // to set inputs + DEBUG(errs() << "Iterate over 
input edges of node and insert visc api\n"); + std::vector<OutputPtr> OutputPointers; + // Vector to hold the device memory object that need to be cleared before we release + // context + std::vector<Value*> DevicePointers; + + std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap(); +/* + for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) { + + // The kernel object gives us the mapping of arguments from kernel launch + // node function (F_X86) to kernel (kernel->KF) + Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]); + +*/ + for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(), + ie = kernelInArgMap.end(); ib != ie; ++ib) { + unsigned i = ib->first; + Value* inputVal = getArgumentAt(F_X86, ib->second); + DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); + + // input value has been obtained. + // Check if input is a scalar value or a pointer operand + // For scalar values such as int, float, etc. the size is simply the size of + // type on target machine, but for pointers, the size of data would be the + // next integer argument + if(inputVal->getType()->isPointerTy()) { + + switchToTimer(visc_TimerID_COPY_PTR, RI); + // Pointer Input + // CheckAttribute + Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False; + Value* isInput = ((hasAttribute(KF, i, Attribute::Out)) + && !(hasAttribute(KF, i, Attribute::In)))? 
False : True; + + Argument* A = getArgumentAt(KF, i); + if(isOutput == True) { + DEBUG(errs() << *A << " is an OUTPUT argument\n"); + } + if(isInput == True) { + DEBUG(errs() << *A << " is an INPUT argument\n"); + } + + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + + // Assert that the pointer argument size (next argument) is in the map + assert(kernelInArgMap.find(i+1) != kernelInArgMap.end()); + + Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]); + + assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) + && "Pointer type input must always be followed by size (integer type)"); + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + inputSize, + isInput, + isOutput + }; + Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, + ArrayRef<Value*>(setInputArgs, 6), "", RI); + DevicePointers.push_back(d_ptr); + // If this has out attribute, store the returned device pointer in + // memory to read device memory later + if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + } + else { + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + // Scalar Input + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI); + StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + ConstantExpr::getSizeOf(inputVal->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + } + } + + DEBUG(errs() << "Setup shared memory arguments of node and insert 
visc api\n"); + + // Check to see if all the allocation sizes are constant (determined + // statically) + bool constSizes = true; + for (auto& e: K->getSharedInArgMap()) { + constSizes &= isa<Constant>(e.second.first); + } + + // If the sizes are all constant + if (constSizes) { + for (auto& e: K->getSharedInArgMap()) { + unsigned argNum = e.first; + Value* allocSize = e.second.first; + + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + + if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { + // Shared memory ptr argument - scalar at size position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + + assert(isa<Constant>(allocSize) && "Constant shared memory size is expected"); + + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; + CallInst::Create(llvm_visc_ocl_argument_shared, + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { + // Sharem memory size argument - scalar at address position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + } + } + } else { + + Function *F_alloc = K->AllocationFunction; + StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType()); + assert(FAllocRetTy && "Allocation node with no struct return type"); + + std::vector<Value *> 
AllocInputArgs; + for (unsigned i = 0; i < K->allocInArgMap.size(); i++) { + AllocInputArgs.push_back(getArgumentAt(F_X86, K->allocInArgMap.at(i))); + } + + CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI); + std::vector<ExtractValueInst *> ExtractValueInstVec; + for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) { + ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI); + ExtractValueInstVec.push_back(EI); + } + + for (auto& e: K->getSharedInArgMap()) { + unsigned argNum = e.first; + Value* allocSize = ExtractValueInstVec[e.second.second/2]; + + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); + + if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { + // Shared memory ptr argument - scalar at size position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; + CallInst::Create(llvm_visc_ocl_argument_shared, + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { + // Sharem memory size argument - scalar at address position + switchToTimer(visc_TimerID_COPY_SCALAR, RI); + // Store the scalar value on stack and then pass the pointer to its + // location + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; + CallInst::Create(llvm_visc_ocl_argument_scalar, + ArrayRef<Value*>(setInputArgs, 4), "", RI); + } + } + } + + + DEBUG(errs() << "Setup output edges of node and insert visc api\n"); + + // Set output if struct is not an empty 
struct + StructType* OutputTy = K->KernelLeafNode->getOutputType(); + std::vector<Value*> d_Outputs; + if(!OutputTy->isEmptyTy()) { + switchToTimer(visc_TimerID_COPY_PTR, RI); + // Not an empty struct + // Iterate over all elements of the struct and put them in + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + unsigned outputIndex = KF->getFunctionType()->getNumParams()+i; + Value* setOutputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), + ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; + + CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, + ArrayRef<Value*>(setOutputArgs, 3), + "d_output."+KF->getName(), + RI); + d_Outputs.push_back(d_Output); + } + } + + // Enqueue kernel + // Need work dim, localworksize, globalworksize + // Allocate size_t[numDims] space on stack. Store the work group sizes and + // pass it as an argument to ExecNode + + switchToTimer(visc_TimerID_MISC, RI); + Value *workDim, *LocalWGPtr, *GlobalWGPtr; + getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI); + switchToTimer(visc_TimerID_KERNEL, RI); + Value* ExecNodeArgs[] = {GraphID, + workDim, + LocalWGPtr, + GlobalWGPtr + }; + CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, + ArrayRef<Value*>(ExecNodeArgs, 4), + "event."+KF->getName(), + RI); + DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); + + // Wait for Kernel to Finish + CallInst::Create(llvm_visc_ocl_wait, + ArrayRef<Value*>(GraphID), + "", + RI); + + switchToTimer(visc_TimerID_READ_OUTPUT, RI); + // Read Output Struct if not empty + if(!OutputTy->isEmptyTy()) { + std::vector<Value*>h_Outputs; + Value* KernelOutput = UndefValue::get(OutputTy); + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + Value* GetOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + d_Outputs[i], + ConstantExpr::getSizeOf(OutputTy->getElementType(i)) + }; + CallInst* h_Output = 
CallInst::Create(llvm_visc_ocl_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "h_output."+KF->getName()+".addr", + RI); + // Read each device pointer listed in output struct + // Load the output struct + CastInst* BI = BitCastInst::CreatePointerCast(h_Output, + OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI); + + Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI); + KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i), + KF->getName()+"output", RI); + } + OutputMap[K->KernelLeafNode] = KernelOutput; + } + + // Read all the pointer arguments which had side effects i.e., had out + // attribute + DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n"); + // FIXME: Not reading output pointers anymore as we read them when data is + // actually requested + /*for(auto output: OutputPointers) { + DEBUG(errs() << "Read: " << *output.d_ptr << "\n"); + DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n"); + DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); + + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; + CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "", RI); + }*/ + switchToTimer(visc_TimerID_MEM_FREE, RI); + // Clear Context and free device memory + DEBUG(errs() << "Clearing context" << "\n"); + // Free Device Memory + for(auto d_ptr: DevicePointers) { + CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); + } + switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); + // Clear Context + LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); + CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); + switchToTimer(visc_TimerID_NONE, CleanupCall); + + switchToTimer(visc_TimerID_MISC, RI); + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); + // Generate code for output bindings + // Get Exit node + DFNode* C = 
N->getChildGraph()->getExit(); + // Get OutputType of this node + StructType* OutTy = N->getOutputType(); + Value *retVal = UndefValue::get(F_X86->getReturnType()); + // Find the kernel's output arg map, to use instead of the bindings + std::vector<unsigned> outArgMap = kernel->getOutArgMap(); + // Find all the input edges to exit node + for (unsigned i=0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); + // Find the incoming edge at the requested input port + DFEdge* E = C->getInDFEdgeAt(i); + + assert(E && "No Binding for output element!"); + // Find the Source DFNode associated with the incoming edge + DFNode* SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(F_X86, i); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a internal node + // Check - code should already be generated for this source dfnode + // FIXME: Since the 2-level kernel code gen has aspecific structure, we + // can assume the SrcDF is same as Kernel Leaf node. + // Use outArgMap to get correct mapping + SrcDF = K->KernelLeafNode; + assert(OutputMap.count(SrcDF) + && "Source node call not found. 
Dependency violation!"); + + // Find Output Value associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + // i is the destination of DFEdge E + // Use the mapping instead of the bindings +// IndexList.push_back(E->getSourcePosition()); + IndexList.push_back(outArgMap[i]); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); + inputVal = EI; + } + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + } + + DEBUG(errs() << "Extracted all\n"); + switchToTimer(visc_TimerID_NONE, RI); + retVal->setName("output"); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); +} + + +// Right now, only targeting the one level case. In general, device functions +// can return values so we don't need to change them +void CGT_SPIR::codeGen(DFInternalNode* N) { + errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"; + if(KernelLaunchNode == NULL) + errs () << "No kernel launch node\n"; + else { + errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; + } + + + if (!KernelLaunchNode) { + DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); + return; + } + + if (N == KernelLaunchNode) { + DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); + //TODO + + // Now the remaining nodes to be visited should be ignored + KernelLaunchNode = NULL; + DEBUG(errs() << "Insert Runtime calls\n"); + insertRuntimeCalls(N, kernel, getSPIRFilename(M)); + + } else { + DEBUG(errs() << "Found intermediate node. Getting size parameters.\n"); + // Keep track of the arguments order. 
+ std::map<unsigned, unsigned> inmap1 = N->getInArgMap(); + std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap(); + // TODO: Structure assumed: one thread node, one allocation node (at most), + // TB node + std::map<unsigned, unsigned> inmapFinal; + for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end(); + ib != ie; ++ib) { + inmapFinal[ib->first] = inmap1[ib->second]; + } + + kernel->setInArgMap(inmapFinal); + + // Keep track of the output arguments order. + std::vector<unsigned> outmap1 = N->getOutArgMap(); + std::vector<unsigned> outmap2 = kernel->getOutArgMap(); + + // TODO: Change when we have incoming edges to the dummy exit node from more + // than one nodes. In this case, the number of bindings is the same, but + // their destination position, thus the index in outmap1, is not + // 0 ... outmap2.size()-1 + // The limit is the size of outmap2, because this is the number of kernel + // output arguments for which the mapping matters + // For now, it reasonable to assume that all the kernel arguments are returned, + // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size() + for (unsigned i = 0; i < outmap2.size(); i++) { + outmap1[i] = outmap2[outmap1[i]]; + } + kernel->setOutArgMap(outmap1); + + // Track the source of local dimlimits for the kernel + // Dimension limit can either be a constant or an argument of parent + // function. Since Internal node would no longer exist, we need to insert the + // localWGSize with values from the parent of N. + std::vector<Value*> localWGSizeMapped; + for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if (isa<Constant>(kernel->localWGSize[i])) { + // if constant, use as it is + localWGSizeMapped.push_back(kernel->localWGSize[i]); + } + else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { + // if argument, find the argument location in N. Use InArgMap of N to + // find the source location in Parent of N. 
Retrieve the argument from + // parent to insert in the vector. + unsigned argNum = Arg->getArgNo(); + // This argument will be coming from the parent node, not the allocation + // Node + assert(N->getInArgMap().find(argNum) != N->getInArgMap().end()); + + unsigned parentArgNum = N->getInArgMap()[argNum]; + Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); + localWGSizeMapped.push_back(A); + } + else { + assert(false && "LocalWGsize using value which is neither argument nor constant!"); + } + } + // Update localWGSize vector of kernel + kernel->setLocalWGSize(localWGSizeMapped); + } + +} + +//static bool checkPreferredTarget(DFNode* N, visc::Target T) { + //Function* F = N->getFuncPointer(); + //Module* M = F->getParent(); + //NamedMDNode* HintNode; + //switch (T) { + //case visc::GPU_TARGET: + //HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + //break; + //case visc::SPIR_TARGET: + //HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + //break; + //case visc::CPU_TARGET: + //HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + //break; + //default: + //llvm_unreachable("Target Not supported yet!"); + //} + //for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + //MDNode* MetaNode = HintNode->getOperand(i); + //if(F == MetaNode->getOperand(0)) + //return true; + //} + //return false; +//} + +void CGT_SPIR::codeGen(DFLeafNode* N) { + + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Skip code generation if it is an allocation node + if(N->isAllocationNode()) { + DEBUG(errs() << "Skipping allocation node\n"); + return; + } + + // Generate code only if it has the right hint +// if(!checkPreferredTarget(N, visc::SPIR_TARGET)) { +// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; +// return; +// } + if(!preferredTargetIncludes(N, visc::SPIR_TARGET)) { + errs() << "Skipping node: "<< 
N->getFuncPointer()->getName() << "\n"; + return; + } + + // Checking which node is the kernel launch + DFNode* PNode = N->getParent(); + int pLevel = PNode->getLevel(); + int pReplFactor = PNode->getNumOfDim(); + + // Choose parent node as kernel launch if: + // (1) Parent is the top level node i.e., Root of DFG + // OR + // (2) Parent does not have multiple instances + errs() << "pLevel = " << pLevel << "\n"; + errs() << "pReplFactor = " << pReplFactor << "\n"; + + if (!pLevel || !pReplFactor) { + errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n"; + KernelLaunchNode = PNode; + errs() << "Setting Kernel Launch Node\n"; + kernel = new Kernel(NULL, + N, + N->getInArgMap(), + N->getSharedInArgMap(), + N->getOutArgMap(), + N->getNumOfDim(), + N->getDimLimits()); + } + else { + // Converting a 2-level DFG to opencl kernel + errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n"; + KernelLaunchNode = PNode->getParent(); + assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); + // Contains the instructions generating the kernel configuration parameters + kernel = new Kernel(NULL, // kernel function + N, // kernel leaf node + N->getInArgMap(), // kenel argument mapping + N->getSharedInArgMap(), + N->getOutArgMap(), // kernel output mapping from the leaf to the interemediate node + PNode->getNumOfDim(), // gridDim + PNode->getDimLimits(),// grid size + N->getNumOfDim(), // blockDim + N->getDimLimits()); // block size + + } + + std::vector<IntrinsicInst *> IItoRemove; + BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; + + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); + + // Look up if we have visited this function before. If we have, then just + // get the cloned function pointer from DFNode. Otherwise, create the cloned + // function and add it to the DFNode GenFunc. 
+ Function *F_spir = N->getGenFuncForTarget(visc::SPIR_TARGET); + assert(F_spir == NULL && "Error: Visiting a node for which code already generated"); + + // Clone the function + ValueToValueMapTy VMap; + + Twine FName = F->getName(); + F_spir = CloneFunction(F, VMap); + F_spir->setName(FName+"_spir"); + errs() << "Old Function Name: " << F->getName() << "\n"; + errs() << "New Function Name: " << F_spir->getName() << "\n"; + + F_spir->removeFromParent(); + + // Insert the cloned function into the kernels module + KernelM->getFunctionList().push_back(F_spir); + + //TODO: Iterate over all the instructions of F_spir and identify the + //callees and clone them into this module. + DEBUG(errs() << *F_spir->getType()); + DEBUG(errs() << *F_spir); + + //Add generated function info to DFNode + //N->setGenFunc(F_spir, visc::SPIR_TARGET); + + F_spir = transformFunctionToVoid(F_spir); + + // Add generated function info to DFNode + //N->setGenFunc(F_spir, visc::SPIR_TARGET); + + removeInOutAttributes(F_spir); + + //Add generated function info to DFNode + N->addGenFunc(F_spir, visc::SPIR_TARGET, false); + + DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n"); + F_spir->removeAttributes(AttributeSet::FunctionIndex, F_spir->getAttributes().getFnAttributes()); + F_spir->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + + + //FIXME: For now, assume only one allocation node + kernel->AllocationNode = NULL; + + for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end(); + ieb != iee; ++ieb) { + DFNode *SrcDFNode = (*ieb)->getSourceDF(); + DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n"); + if (!SrcDFNode->isDummyNode()) { + assert(SrcDFNode->isAllocationNode()); + kernel->AllocationNode = 
dyn_cast<DFLeafNode>(SrcDFNode); + kernel->allocInArgMap = SrcDFNode->getInArgMap(); + break; + } + } + + // Vector for shared memory arguments + std::vector<unsigned> SharedMemArgs; + + // If no allocation node was found, SharedMemArgs is empty + if (kernel->AllocationNode) { + + ValueToValueMapTy VMap; + Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); + //F_alloc->removeFromParent(); + // Insert the cloned function into the kernels module + //M.getFunctionList().push_back(F_alloc); + + std::vector<IntrinsicInst *> ViscMallocInstVec; + findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec); + + for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) { + IntrinsicInst *II = ViscMallocInstVec[i]; + assert(II->hasOneUse() && "visc_malloc result is used more than once"); + II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); + II->eraseFromParent(); + } + kernel->AllocationFunction = F_alloc; + + // This could be used to check that the allocation node has the appropriate + // number of fields in its return struct +/* + ReturnInst *RI = ReturnInstVec[0]; + Value *RetVal = RI->getReturnValue(); + Type *RetTy = RetVal->getType(); + StructType *RetStructTy = dyn_cast<StructType>(RetTy); + assert(RetStructTy && "Allocation node does not return a struct type"); + unsigned numFields = RetStructTy->getNumElements(); +*/ + std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap(); + AllocationNodeProperty* APN = + (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation); + for (auto& AllocPair: APN->getAllocationList()) { + unsigned destPos = AllocPair.first->getDestPosition(); + unsigned srcPos = AllocPair.first->getSourcePosition(); + SharedMemArgs.push_back(destPos); + sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); + sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); + } 
+ kernel->setSharedInArgMap(sharedInMap); + } + std::sort(SharedMemArgs.begin(), SharedMemArgs.end()); + + // All pointer args which are not shared memory pointers have to be moved to + // global address space + unsigned argIndex = 0; + std::vector<unsigned> GlobalMemArgs; + for(auto& Arg: F_spir->getArgumentList()) { + if (Arg.getType()->isPointerTy()) { + // If the arguement is already chosen for shared memory arguemnt list, skip. + // Else put it in Global memory arguement list + if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) { + GlobalMemArgs.push_back(argIndex); + } + } + argIndex++; + } + std::sort(GlobalMemArgs.begin(), GlobalMemArgs.end()); + + /* At this point, we assume that chescks for the fact that SharedMemArgs only + contains pointer arguments to GLOBAL_ADDRSPACE have been performed by the + analysis pass */ + + F_spir = changeArgAddrspace(F_spir, SharedMemArgs, SHARED_ADDRSPACE); + removeAttributeAtArguments(F_spir, SharedMemArgs, Attribute::NoCapture); + F_spir = changeArgAddrspace(F_spir, GlobalMemArgs, GLOBAL_ADDRSPACE); + + + // Go through all the instructions + for (inst_iterator i = inst_begin(F_spir), e = inst_end(F_spir); i != e; ++i) { + Instruction *I = &(*i); + // Leaf nodes should not contain VISC graph intrinsics or launch + assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + + if (BuildDFG::isViscIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; + + /************************ Handle VISC Query intrinsics ************************/ + + switch (II->getIntrinsicID()) { + /**************************** llvm.visc.getNode() *****************************/ + case Intrinsic::visc_getNode: { + DEBUG(errs() << F_spir->getName() << "\t: Handling getNode\n"); + // add mapping <intrinsic, this node> to the node-specific map + 
Leaf_HandleToDFNodeMap[II] = N; + IItoRemove.push_back(II); + } + break; + /************************* llvm.visc.getParentNode() **************************/ + case Intrinsic::visc_getParentNode: { + DEBUG(errs() << F_spir->getName() << "\t: Handling getParentNode\n"); + // get the parent node of the arg node + // get argument node + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // get the parent node of the arg node + // Add mapping <intrinsic, parent node> to the node-specific map + // the argument node must have been added to the map, orelse the + // code could not refer to it + Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); + + IItoRemove.push_back(II); + } + break; + /*************************** llvm.visc.getNumDims() ***************************/ + case Intrinsic::visc_getNumDims: { + DEBUG(errs() << F_spir->getName() << "\t: Handling getNumDims\n"); + // get node from map + // get the appropriate field + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + int numOfDim = ArgDFNode->getNumOfDim(); + DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); + IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); + + // Replace the result of the intrinsic with the computed value + II->replaceAllUsesWith(numOfDimConstant); + + IItoRemove.push_back(II); + } + break; + /*********************** llvm.visc.getNodeInstanceID() ************************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + case Intrinsic::visc_getNodeInstanceID_z: { + DEBUG(errs() << F_spir->getName() << "\t: Handling getNodeInstanceID\n"); + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + assert(ArgDFNode && "Arg node is NULL"); + // A leaf node 
always has a parent + DFNode* ParentDFNode = ArgDFNode->getParent(); + assert(ParentDFNode && "Parent node of a leaf is NULL"); + + // Get the number associated with the required dimension + // FIXME: The order is important! + // These three intrinsics need to be consecutive x,y,z + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x; + assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); + DEBUG(errs() << "\t dimension = " << dim << "\n"); + + // Argument of the function to be called + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); + + // The following is to find which function to call + Function * OpenCLFunction; + int parentLevel = N->getParent()->getLevel(); + int parentReplFactor = N->getParent()->getNumOfDim(); + DEBUG(errs() << "Parent Level = " << parentLevel << "\n"); + DEBUG(errs() << "Parent Repl factor = " << parentReplFactor << "\n"); + + FunctionType* FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + ArrayRef<Type*>(Type::getInt32Ty(KernelM->getContext())), + false); + + if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) { + // We only have one level in the hierarchy or the parent node is not + // replicated. This indicates that the parent node is the kernel + // launch, so we need to specify a global id. 
+ // We can translate this only if the argument is the current node + // itself + DEBUG(errs() << "Substitute with get_global_id()\n"); + DEBUG(errs() << *II << "\n"); + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(getMangledName("get_global_id"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { + // We are asking for this node's id with respect to its parent + // this is a local id call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(getMangledName("get_local_id"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { + // We are asking for this node's parent's id with respect to its + // parent: this is a group id call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(getMangledName("get_group_id"), FT)); + } else { + errs() << N->getFuncPointer()->getName() << "\n"; + errs() << N->getParent()->getFuncPointer()->getName() << "\n"; + errs() << *II << "\n"; + + assert(false && "Unable to translate getNodeInstanceID intrinsic"); + } + + // Create call instruction, insert it before the intrinsic and truncate + // the output to 32 bits and replace all the uses of the previous + // instruction with the new one + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + II->replaceAllUsesWith(CI); + + IItoRemove.push_back(II); + } + break; + /********************** llvm.visc.getNumNodeInstances() ***********************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { +//TODO: think about whether this is the best way to go +// there are hw specific registers. therefore it is good to have the intrinsic +// but then, why do we need to keep that info in the graph? 
+// (only for the kernel configuration during the call) + + DEBUG(errs() << F_spir->getName() << "\t: Handling getNumNodeInstances\n"); + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // A leaf node always has a parent + DFNode* ParentDFNode = ArgDFNode->getParent(); + assert(ParentDFNode && "Parent node of a leaf is NULL"); + + // Get the number associated with the required dimension + // FIXME: The order is important! + // These three intrinsics need to be consecutive x,y,z + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x; + assert((dim >= 0) && (dim < 3) && "Invalid dimension argument"); + DEBUG(errs() << "\t dimension = " << dim << "\n"); + + // Argument of the function to be called + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); + + // The following is to find which function to call + Function * OpenCLFunction; + int parentLevel = ParentDFNode->getLevel(); + int parentReplFactor = ParentDFNode->getNumOfDim(); + + FunctionType* FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + Type::getInt32Ty(KernelM->getContext()), + false); + if ((N == ArgDFNode) && (!parentLevel || !parentReplFactor)) { + // We only have one level in the hierarchy or the parent node is not + // replicated. 
This indicates that the parent node is the kernel + // launch, so the instances are global_size (gridDim x blockDim) + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(getMangledName("get_global_size"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { + // We are asking for this node's instances + // this is a local size (block dim) call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(getMangledName("get_local_size"), FT)); + } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { + // We are asking for this node's parent's instances + // this is a (global_size/local_size) (grid dim) call + OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(getMangledName("get_num_groups"), FT)); + } else { + assert(false && "Unable to translate getNumNodeInstances intrinsic"); + } + + // Create call instruction, insert it before the intrinsic and truncate + // the output to 32 bits and replace all the uses of the previous + // instruction with the new one + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + II->replaceAllUsesWith(CI); + + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_barrier: + { + DEBUG(errs() << F_spir->getName() << "\t: Handling barrier\n"); + DEBUG(errs() << "Substitute with barrier()\n"); + DEBUG(errs() << *II << "\n"); + FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()), + std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())), + false); + Function* OpenCLFunction = cast<Function> + (KernelM->getOrInsertFunction(getMangledName("barrier"), FT)); + CallInst* CI = CallInst::Create(OpenCLFunction, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)), + "", II); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_atomic_cmpxchg: + case Intrinsic::visc_atomic_add: + case Intrinsic::visc_atomic_sub: + case Intrinsic::visc_atomic_xchg: + case 
Intrinsic::visc_atomic_min: + case Intrinsic::visc_atomic_umin: + case Intrinsic::visc_atomic_max: + case Intrinsic::visc_atomic_umax: + case Intrinsic::visc_atomic_and: + case Intrinsic::visc_atomic_or: + case Intrinsic::visc_atomic_xor: + case Intrinsic::visc_atomic_inc: + case Intrinsic::visc_atomic_dec: + { + DEBUG(errs() << *II << "\n"); + // Only have support for i32 atomic intrinsics + assert(II->getType() == Type::getInt32Ty(II->getContext()) + && "Only support i32 atomic intrinsics for now"); + // Substitute with appropriate atomic builtin + assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics"); + + Value* Ptr = II->getArgOperand(0); + Value* Val = II->getArgOperand(1); + assert(Ptr->getType()->isPointerTy() + && "First argument of supported atomics is expected to be a pointer"); + PointerType* PtrTy = cast<PointerType>(Ptr->getType()); + if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) { + Ptr = CastInst::CreatePointerCast(Ptr, + Type::getInt32PtrTy(II->getContext(), + PtrTy->getAddressSpace()), "", II); + } + + StringRef name = getAtomicOpName(II->getIntrinsicID(), PtrTy->getAddressSpace()); + + Type* paramTypes[] = { Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()), + Type::getInt32Ty(KernelM->getContext()) + }; + FunctionType* AtomicFT = FunctionType::get(II->getType(), + ArrayRef<Type*>(paramTypes, 2), + false); + Function* AtomicFunction = cast<Function> + (KernelM->getOrInsertFunction(name, AtomicFT)); + Value* atomicArgs[] = { Ptr, Val }; + CallInst* AtomicInst = CallInst::Create(AtomicFunction, + ArrayRef<Value*>(atomicArgs, 2), + "", II); + + DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n"); + II->replaceAllUsesWith(AtomicInst); + IItoRemove.push_back(II); + } + break; + default: + assert(false && "Unknown VISC Intrinsic!"); + break; + } + + } + else if(CallInst* CI = dyn_cast<CallInst>(I)) { + DEBUG(errs() << "Found a call: " << *CI << "\n"); + Function* calleeF = 
cast<Function>(CI->getCalledValue()->stripPointerCasts()); + if(calleeF->isDeclaration()) { + // Add the declaration to kernel module + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); + KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) { + // Now handle a few specific intrinsics + // For now, sin and cos are translated to their libclc equivalent + switch(II->getIntrinsicID()) { + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::sqrt: + case Intrinsic::floor: + case Intrinsic::nvvm_rsqrt_approx_f: + { + DEBUG(errs() << "Found math function: " << *II << "\n"); + // Get the builtin function + // SPIR uses mangled name for builtin math functions + assert(II->getType()->isFloatTy() + && "Only handling sin(float) and cos(float)!"); + std::string name = getMathFunctionName(II->getIntrinsicID()); + + FunctionType* MathFT = FunctionType::get(II->getType(), + Type::getFloatTy(KernelM->getContext()), + false); + Function* MathFunction = cast<Function> + (KernelM->getOrInsertFunction(name, MathFT)); + CallInst* CI = CallInst::Create(MathFunction, II->getArgOperand(0), II->getName(), II); + + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + break; + } + default: + DEBUG(errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ); + } + } + + } + else { + // Clone the function + ValueToValueMapTy VMap; + Function* newCalleeF = CloneFunction(calleeF, VMap); + newCalleeF->removeFromParent(); //TODO: MARIA check + KernelM->getFunctionList().push_back(newCalleeF); + } + //TODO: how to handle address space qualifiers in load/store + } + + } + + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (std::vector<IntrinsicInst 
*>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) + (*ri)->eraseFromParent(); + + addCLMetadata(F_spir); + kernel->KernelFunction = F_spir; + errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; + DEBUG(errs() << *KernelM); + + return; +} + +bool DFG2LLVM_SPIR::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_SPIR PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + + // Visitor for Code Generation Graph Traversal + CGT_SPIR *CGTVisitor = new CGT_SPIR(M, DFG); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } + + // This is not required. 
Itrinsics that do not have a use are not a problem + //CGTVisitor->removeLLVMIntrinsics(); + CGTVisitor->writeKernelsModule(); + + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; + + return true; +} + +std::string CGT_SPIR::getKernelsModuleName(Module &M) { + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); +} + +void CGT_SPIR::fixValueAddrspace(Value* V, unsigned addrspace) { + assert(isa<PointerType>(V->getType()) + && "Value should be of Pointer Type!"); + PointerType* OldTy = cast<PointerType>(V->getType()); + PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. 
+ if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { + if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); + } + } + } +} + +Function* CGT_SPIR::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { + unsigned idx = 0; + std::vector<Type*> ArgTypes; + for(auto& arg: F->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + unsigned argno = arg.getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(&arg, addrspace); + idx++; + } + ArgTypes.push_back(arg.getType()); + } + FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); + + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + + DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); + return newF; +} + +/* Remove the specified argument from arguments at positions denoted in Args */ +void CGT_SPIR::removeAttributeAtArguments(Function* F, std::vector<unsigned> &Args, Attribute::AttrKind attrKind) { + DEBUG(errs() << "Removing nocapture attribute from shared memory arguments of function " << F->getName() << "\n"); + + unsigned cnt = 0, arg_no = 0; + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae && arg_no < Args.size(); ++ai, ++cnt) { + + if (Args[arg_no] == cnt) { + AttributeSet AS = F->getAttributes(); + AttrBuilder AB(AS, ai->getArgNo()+1); + AB.removeAttribute(attrKind); + AttributeSet argAS = AttributeSet::get(F->getContext(), ai->getArgNo()+1, AB); + F->removeAttributes(1+ai->getArgNo(), AS.getParamAttributes(ai->getArgNo() + 1)); + F->addAttributes(1+ai->getArgNo(), argAS); + + arg_no++; + } + } +} + +/* Add metadata to module KernelM, for OpenCL kernels */ +void CGT_SPIR::addCLMetadata(Function *F) { + // TODO: There is additional metadata used by kernel files but we skip them as + // they are not mandatory. 
In future they might be useful to enable + // optimizations + + IRBuilder<> Builder(&*F->begin()); + + // Create node for "kernel_arg_type" + SmallVector<Metadata*,8> argTypeNames; + argTypeNames.push_back(MDString::get(KernelM->getContext(), "kernel_arg_type")); + + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ai++) { + argTypeNames.push_back(MDString::get(KernelM->getContext(), printType(ai->getType()))); + } + // All argument type names are in the vector. Create a metadata node + // "kernel_arg_type" + MDTuple* KernelArgTypes = MDNode::get(KernelM->getContext(), argTypeNames); + + // Create kernel metadata node containg the kernel function and the + // "kernel_arg_type" metadata node created above + SmallVector<Metadata*,8> KernelMD; + KernelMD.push_back(ValueAsMetadata::get(F)); + KernelMD.push_back(KernelArgTypes); + MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); + + // Create metadata node opencl.kernels. It points to the kernel metadata node + NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); + + //KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); + // TODO: Replace 1 with the number of the kernel. + // Add when support for multiple launces is added + //KernelMD.push_back(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)); + //MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); + //NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); + //MDN_annotations->addOperand(MDNvvmAnnotationsNode); + +} + +/* Function to remove all remaining declarations of llvm intrinsics, + * as they are not supported in SPIR. 
+ */ +void CGT_SPIR::removeLLVMIntrinsics() { + + std::vector<Function*> fv = std::vector<Function*>(); + + for (Module::iterator mi = KernelM->begin(), me = KernelM->end(); (mi != me); ++mi) { + Function* F = &*mi; + if (F->isDeclaration() && F->getName().startswith("llvm.")) { + DEBUG(errs() << "Declaration: " << F->getName() << " with " << F->getNumUses() <<"uses.\n"); + assert(F->hasNUses(0) && "LLVM intrinsic function still in use"); + fv.push_back(F); + } + } + + for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) { + DEBUG(errs() << "Erasing declaration: " << (*vi)->getName() <<"\n"); + (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType())); + (*vi)->eraseFromParent(); + } + +} + +void CGT_SPIR::writeKernelsModule() { + + // In addition to deleteing all otjer functions, we also want to spice it up a + // little bit. Do this now. + legacy::PassManager Passes; + + std::error_code EC; + tool_output_file Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); + if (EC) { + errs() << EC.message() << "\n"; + } + + Passes.add( + createPrintModulePass(Out.os())); + + Passes.run(*KernelM); + + // Declare success. + Out.keep(); +} + +Function* CGT_SPIR::transformFunctionToVoid(Function* F) { + + // FIXME: Maybe do that using the Node? 
+ StructType* FRetTy = cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); + + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); + + + // Check for { } return struct, which means that the function returns void + if (FRetTy->isEmptyTy()) { + + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); + + // Replacing return statements with others returning void + for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(), + e = RItoRemove.end(); i != e; ++i) { + ReturnInst::Create((F->getContext()), 0, (*i)); + (*i)->eraseFromParent(); + } + DEBUG(errs() << "\tChanged return statements to return void\n"); + } + else { + // The struct has return values, thus needs to be converted to parameter + + // Iterate over all element types of return struct and add arguments to the + // function + std::vector<Argument*> Args; + for (unsigned i=0; i<FRetTy->getNumElements(); i++) { + Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); + Args.push_back(RetArg); + DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); + } + + Function::arg_iterator ai, ae; + + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(), + rie = RItoRemove.end(); rii != rie; ++rii) { + ReturnInst* RI = (*rii); + Value* RetVal = RI->getReturnValue(); + for(unsigned i = 0; i < Args.size(); i++) { + ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), + Args[i]->getName()+".val", RI); + new StoreInst(EI, Args[i], RI); + } + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + 
ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + + } + } + DEBUG(errs() << "\tReplaced return statements\n"); + + // Create the argument type list with the added argument's type + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + Type* VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + + // Change the function type + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + + return newF; +} + +// Remove the visc in/out attributes from kernel function +void CGT_SPIR::removeInOutAttributes(Function* F) { + DEBUG(errs() << "Removing visc attributes from argument list of function " << F->getName() << "\n"); + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ai++) { + + AttributeSet AS = F->getAttributes(); + AttrBuilder AB(AS, ai->getArgNo()+1); + AB.removeAttribute(Attribute::In); + AB.removeAttribute(Attribute::Out); + AB.removeAttribute(Attribute::InOut); + AttributeSet argAS = AttributeSet::get(F->getContext(), ai->getArgNo()+1, AB); + F->removeAttributes(1+ai->getArgNo(), AS.getParamAttributes(ai->getArgNo() + 1)); + F->addAttributes(1+ai->getArgNo(), argAS); + + } +} + +/****************************************************************************** + * Helper functions * + ******************************************************************************/ + +// Calculate execute node parameters which include, number of diemnsions for +// dynamic instances of the kernel, local and global work group sizes. 
+static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value* + &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { + + // Assign number of dimenstions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); + + // If local work group size if null + if(!kernel->hasLocalWG()) { + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); + } + else { + for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if(isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } + LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); + } + + for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if(isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } + + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. 
+ std::vector<Value*> globalWGSizeInsts; + if(kernel->hasLocalWG()) { + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + } + else { + globalWGSizeInsts = kernel->globalWGSize; + } + GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); +} + +// CodeGen for allocating space for Work Group on stack and returning a pointer +// to its address +static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { + Value* WGPtr; + // Get int64_t and or ease of use + Type* Int64Ty = Type::getInt64Ty(M.getContext()); + + // Work Group type is [#dim x i64] + Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); + // Allocate space of Global work group data on stack and get pointer to + // first element. + AllocaInst* WG = new AllocaInst(WGTy, WGName, IB); + WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); + Value* nextDim = WGPtr; + DEBUG(errs() << *WGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for(unsigned i=0; i < WGSize.size(); i++) { + assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + + if(WGSize[i]->getType() != Int64Ty) { + // If number of dimensions are mentioned in any other integer format, + // generate code to extend it to i64. We need to use the mapped value in + // the new generated function, hence the use of VMap + // FIXME: Why are we changing the kernel WGSize vector here? + DEBUG(errs() << "Not i64. 
Zero extend required.\n"); + DEBUG(errs() << *WGSize[i] << "\n"); + CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); + DEBUG(errs() << "Bitcast done.\n"); + StoreInst* SI = new StoreInst(CI, nextDim, IB); + DEBUG(errs() << "Zero extend done.\n"); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); + + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if(i+1 < WGSize.size()) { + // Move to next dimension + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, + ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), + WG->getName()+"."+Twine(i+1), + IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } + } + return WGPtr; + +} + +//Get generated SPIR binary name +static std::string getSPIRFilename(const Module& M) { + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.bc"); + +} + +// Get the name of the input file from module ID +static std::string getFilenameFromModule(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/")+1); +} + +// Changes the data layout of the Module to be compiled with SPIR backend +// TODO: Figure out when to call it, probably after duplicating the modules +static void changeDataLayout(Module &M) { + std::string spir64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"; + + M.setDataLayout(StringRef(spir64_layoutStr)); + return; +} + +static void changeTargetTriple(Module &M) { + std::string spir64_TargetTriple = "spir64-unknown-unknown"; + M.setTargetTriple(StringRef(spir64_TargetTriple)); +} + +// Helper function, 
generate a string representation of a type +static std::string printType(Type* ty) { + std::string type_str; + raw_string_ostream rso(type_str); + ty->print(rso); + return rso.str(); +} + +// Helper function to get mangled names of OpenCL built ins +static StringRef getMangledName(std::string name) { + Twine mangledName = "_Z"+Twine(name.size())+name+"j"; + return StringRef(mangledName.str()); +} + + +// Helper function, populate a vector with all return statements in a function +static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + ReturnInst* RI = dyn_cast<ReturnInst>(I); + if (RI) { + ReturnInstVec.push_back(RI); + } + } +} + +// Helper function, populate a vector with all IntrinsicID intrinsics in a function +static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == IntrinsicID) { + IntrinsicInstVec.push_back(II); + } + } +} + +// Helper function to get mangled names of OpenCL built ins for atomics +static StringRef getAtomicMangledName(std::string name, unsigned addrspace, bool sign) { + Twine mangledName = "_Z" + + Twine(name.size())+name + + "PU3AS"+Twine(addrspace) + "jj"; +// ((sign) ? 
"ii" : "jj"); + return StringRef(mangledName.str()); +} + +// Helper funtion, returns the OpenCL function name corresponding to atomic op +static StringRef getAtomicOpName(Intrinsic::ID ID, unsigned addrspace) { + switch(ID) { + case Intrinsic::visc_atomic_cmpxchg: + return getAtomicMangledName("atom_cmpxchg", addrspace, true); + case Intrinsic::visc_atomic_add: + return getAtomicMangledName("atom_add", addrspace, true); + case Intrinsic::visc_atomic_sub: + return getAtomicMangledName("atom_sub", addrspace, true); + case Intrinsic::visc_atomic_min: + return getAtomicMangledName("atom_min", addrspace, true); + case Intrinsic::visc_atomic_umin: + return getAtomicMangledName("atom_min", addrspace, false); + case Intrinsic::visc_atomic_max: + return getAtomicMangledName("atom_max", addrspace, true); + case Intrinsic::visc_atomic_umax: + return getAtomicMangledName("atom_max", addrspace, false); + case Intrinsic::visc_atomic_inc: + return getAtomicMangledName("atom_inc", addrspace, true); + case Intrinsic::visc_atomic_dec: + return getAtomicMangledName("atom_dec", addrspace, true); + case Intrinsic::visc_atomic_xchg: + return getAtomicMangledName("atom_xchg", addrspace, true); + case Intrinsic::visc_atomic_and: + return getAtomicMangledName("atom_and", addrspace, true); + case Intrinsic::visc_atomic_or: + return getAtomicMangledName("atom_or", addrspace, true); + case Intrinsic::visc_atomic_xor: + return getAtomicMangledName("atom_xor", addrspace, true); + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; +} + +static std::string getMathFunctionName(Intrinsic::ID ID) { + switch(ID) { + case Intrinsic::sin: return "_Z3sinf"; + case Intrinsic::cos: return "_Z3cosf"; + case Intrinsic::sqrt: return "_Z4sqrtf"; + case Intrinsic::floor: return "_Z5floorf"; + case Intrinsic::nvvm_rsqrt_approx_f: return "_Z5rsqrtf"; + default: + llvm_unreachable("Unsupported math function!"); + }; +} + +} // End of namespace + +char DFG2LLVM_SPIR::ID = 0; +static 
RegisterPass<DFG2LLVM_SPIR> X("dfg2llvm-spir", + "Dataflow Graph to LLVM for SPIR Pass", + false /* does not modify the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports b/lib/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/DFG2LLVM_SPIR/LLVMBuild.txt b/lib/DFG2LLVM_SPIR/LLVMBuild.txt new file mode 100644 index 0000000000..72c4de9efd --- /dev/null +++ b/lib/DFG2LLVM_SPIR/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/DFG2LLVM_SPIR/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_SPIR +parent = Transforms diff --git a/lib/DFG2LLVM_WrapperAPI/CMakeLists.txt b/lib/DFG2LLVM_WrapperAPI/CMakeLists.txt new file mode 100644 index 0000000000..22c219d0a1 --- /dev/null +++ b/lib/DFG2LLVM_WrapperAPI/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMDFG2LLVM_WrapperAPI + DFG2LLVM_WrapperAPI.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp new file mode 100644 index 0000000000..ecec258dfe --- /dev/null +++ b/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp @@ -0,0 +1,1532 @@ +//=== DFG2LLVM_WrapperAPI.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#define ENABLE_ASSERTS + +#define DEBUG_TYPE "DFG2LLVM_WrapperAPI" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h" +#include <sstream> +#include <fstream> + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +using namespace inplacedfg; + +namespace { + +cl::opt<std::string> QuantizationInputsFilename( + "quantization-levels-filename", + cl::desc("<PROMISE quantization levels input file (path)>"), + cl::value_desc("filename"), + cl::Required); + +cl::opt<std::string> ConfigurationInputsFilename( + "configuration-inputs-filename", + cl::desc("<Autotuner configurations input file (path)>"), + cl::value_desc("filename"), + cl::Required); + +// Helper function declarations +bool isValidOperandForInPlaceOperation(Value *, Function *, DFNode *, + InPlaceDFGAnalysis::InPlaceDFGParameter &); + +// Helper class declarations + +// State machine definition for pattern identification + +/* An assumption is made for the Wrapper API input: * + * a leaf node will contain consequtive operations that will map to a * + * single convolution or fully connected layer, or a single tensor operation. 
* + + * FullyConnectedLayer: Multiply, Add, [Activation] * + * ConvolutionLayer: Convolution, [Add], [Activation], [Pooling] */ + +class AbstractState; + +class CodeGenStateMachine { +private: + Module *M; + Module *RtM; + + std::vector<Value*> Args; + std::vector<IntrinsicInst*> IIs; + std::vector<IntrinsicInst*> IIs_remove; // Intrinsics to remove + AbstractState *current; + +public: + CodeGenStateMachine(Module *, Module *); + + void setCurrent(AbstractState *s) { + current = s; + } + + void transition(IntrinsicInst *II); + + Module *getModule() { + return M; + } + + Module *getRtModule() { + return RtM; + } + + void addArgument(Value *Arg) { + Args.push_back(Arg); + } + + void addIntrinsicInst(IntrinsicInst *II) { + IIs.push_back(II); + } + + void addIntrinsicToRemove(IntrinsicInst *II) { + IIs_remove.push_back(II); + } + + IntrinsicInst *getIntrinsicInstAt(unsigned idx) { + return IIs[idx]; + } + + void codeGen(DFNode *, Function * , const StringRef &, + InPlaceDFGAnalysis::InPlaceDFGParameter &); + +}; + +class AbstractState { +public: + enum ID + { + INITIAL_STATE, + FULLY_CONNECTED_LAYER_1, + FULLY_CONNECTED_LAYER_2, + FULLY_CONNECTED_LAYER_3, + FULLY_CONNECTED_LAYER, + CONVOLUTION_LAYER_1, + CONVOLUTION_LAYER_2, + CONVOLUTION_LAYER_3, + CONVOLUTION_LAYER_4, + CONVOLUTION_LAYER, + SINGLE_TENSOR_OPERATION, + NO_PATTERN, + }; + +protected: + enum ID StateID; + +public: + enum ID getStateID() { + return StateID; + } + + virtual void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) = 0; + virtual ~AbstractState() {} +}; + +class InitialState : public AbstractState { +public: + InitialState() { + StateID = ID::INITIAL_STATE; + DEBUG(errs() << "new InitialState\n"); + } + ~InitialState() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer_1 : public AbstractState { +public: + FullyConnectedLayer_1() { + StateID = ID::FULLY_CONNECTED_LAYER_1; + DEBUG(errs() << "new FullyConnectedLayer_1\n"); + } + 
~FullyConnectedLayer_1() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer_2 : public AbstractState { +public: + FullyConnectedLayer_2() { + StateID = ID::FULLY_CONNECTED_LAYER_2; + DEBUG(errs() << "new FullyConnectedLayer_2\n"); + } + ~FullyConnectedLayer_2() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer_3 : public AbstractState { +public: + FullyConnectedLayer_3() { + StateID = ID::FULLY_CONNECTED_LAYER_3; + DEBUG(errs() << "new FullyConnectedLayer_3\n"); + } + ~FullyConnectedLayer_3() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class FullyConnectedLayer : public AbstractState { +public: + FullyConnectedLayer() { + StateID = ID::FULLY_CONNECTED_LAYER; + DEBUG(errs() << "new FullyConnectedLayer\n"); + } + ~FullyConnectedLayer() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_1 : public AbstractState { +public: + ConvolutionLayer_1() { + StateID = ID::CONVOLUTION_LAYER_1; + DEBUG(errs() << "new ConvolutionLayer_1\n"); + } + ~ConvolutionLayer_1() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_2 : public AbstractState { +public: + ConvolutionLayer_2() { + StateID = ID::CONVOLUTION_LAYER_2; + DEBUG(errs() << "new ConvolutionLayer_2\n"); + } + ~ConvolutionLayer_2() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_3 : public AbstractState { +public: + ConvolutionLayer_3() { + StateID = ID::CONVOLUTION_LAYER_3; + DEBUG(errs() << "new ConvolutionLayer_3\n"); + } + ~ConvolutionLayer_3() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer_4 : public AbstractState { +public: + ConvolutionLayer_4() { + StateID = ID::CONVOLUTION_LAYER_4; + DEBUG(errs() << "new 
ConvolutionLayer_4\n"); + } + ~ConvolutionLayer_4() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class ConvolutionLayer : public AbstractState { +public: + ConvolutionLayer() { + StateID = ID::CONVOLUTION_LAYER; + DEBUG(errs() << "new ConvolutionLayer\n"); + } + ~ConvolutionLayer() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class SingleTensorOperation : public AbstractState { +public: + SingleTensorOperation() { + StateID = ID::SINGLE_TENSOR_OPERATION; + DEBUG(errs() << "new SingleTensorOperation\n"); + } + ~SingleTensorOperation() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + +class NoPattern : public AbstractState { +public: + NoPattern() { + StateID = ID::NO_PATTERN; + DEBUG(errs() << "new NoPattern\n"); + } + ~NoPattern() {} + + void transition(CodeGenStateMachine *Mch, IntrinsicInst *II) override; +}; + + +void InitialState::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_convolution: + { + Mch->addIntrinsicInst(II); + Mch->addArgument(II->getOperand(0)); // conv input + Mch->addArgument(II->getOperand(1)); // conv kernel + + Mch->setCurrent(new ConvolutionLayer_1()); + } + break; + case Intrinsic::visc_tensor_mul: + { + Mch->addIntrinsicInst(II); + Mch->addArgument(II->getOperand(0)); // 1st gemm input + Mch->addArgument(II->getOperand(1)); // 2nd gemm input + + Mch->setCurrent(new FullyConnectedLayer_1()); + } + break; + + case Intrinsic::visc_node_id: + { + + DEBUG(errs() << "\t: Handling __visc_node_id \n"); + // Get uint32 node ID + Value *Op = II->getOperand(0); + + std::vector<Value*> Args; + Args.push_back(Op); + + Module *M = Mch->getModule(); + Module *RtM = Mch->getRtModule(); + + Constant* visc_node_id_call = + M->getOrInsertFunction(StringRef("tensor_set_node_id"), + 
RtM->getFunction(StringRef("tensor_set_node_id"))->getFunctionType()); + + CallInst::Create(visc_node_id_call, Args, "", II); + + Mch->addIntrinsicToRemove(II); + Mch->setCurrent(new InitialState()); + } + break; + + default: // Other HPVM intrinsic + { + Mch->addIntrinsicInst(II); + Mch->setCurrent(new SingleTensorOperation()); + } + break; + } + delete this; + } // else {} // No HPVM intrinsic received. Remain at initial +} + +void SingleTensorOperation::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + Mch->setCurrent(new NoPattern()); + delete this; + } +} + +void FullyConnectedLayer_1::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_add: + { + IntrinsicInst *MulII = Mch->getIntrinsicInstAt(0); + assert((MulII == II->getOperand(0)) && + "Output of mul must be used as 1st operand of add"); + Mch->addIntrinsicInst(II); + + Mch->addArgument(II->getOperand(1)); // bias + + Mch->setCurrent(new FullyConnectedLayer_2()); + } + break; + default: + Mch->setCurrent(new NoPattern()); + break; + } + } else { + Mch->setCurrent(new NoPattern()); + } + delete this; +} + +void FullyConnectedLayer_2::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_tanh: + { + // Type of activation : TanH + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new FullyConnectedLayer_3()); + } + break; + case Intrinsic::visc_tensor_relu: + { + // Type of activation : ReLU + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new FullyConnectedLayer_3()); + } + break; + case Intrinsic::visc_tensor_clipped_relu: + { + // Type 
of activation : Clipped ReLU + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new FullyConnectedLayer_3()); + } + break; + default: // No activation, but HPVM intrinsic + Mch->setCurrent(new NoPattern()); + break; + } + } else { // End of instruction stream + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + + Mch->setCurrent(new FullyConnectedLayer()); + } + delete this; +} + +void FullyConnectedLayer_3::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (!II) { // End of instruction stream + Mch->setCurrent(new FullyConnectedLayer()); + } else { + Mch->setCurrent(new NoPattern()); + } + delete this; +} + +void FullyConnectedLayer::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + Mch->setCurrent(new NoPattern()); + delete this; + } +} + +void ConvolutionLayer_1::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_add: + { + IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0); + assert((ConvII == II->getOperand(0)) && + "Output of conv must be used as 1st operand of add"); + Mch->addIntrinsicInst(II); + + Mch->addArgument(II->getOperand(1)); // bias + + Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv + Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv + Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv + Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv + + Mch->setCurrent(new ConvolutionLayer_2()); + } + break; + default: + Mch->setCurrent(new NoPattern()); + break; + } + } else { + // No addition + Mch->addArgument(ConstantPointerNull::get( + Type::getInt8PtrTy(Mch->getModule()->getContext()))); + + // Zero for all convolution numeric arguments 
FIXME??? + IntrinsicInst *ConvII = Mch->getIntrinsicInstAt(0); + Mch->addArgument(ConvII->getOperand(2)); // 1st numeric arg of conv + Mch->addArgument(ConvII->getOperand(3)); // 2nd numeric arg of conv + Mch->addArgument(ConvII->getOperand(4)); // 3rd numeric arg of conv + Mch->addArgument(ConvII->getOperand(5)); // 4th numeric arg of conv + + // Mch->addArgument(ConstantInt::get( + // Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // Mch->addArgument(ConstantInt::get( + // Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // Mch->addArgument(ConstantInt::get( + // Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // Mch->addArgument(ConstantInt::get( + // Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + + // No pooling + // 0 for unused pool arguments: + // pool_id, pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 0; i < 7; i++) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + + Mch->setCurrent(new ConvolutionLayer()); + } + delete this; +} + +void ConvolutionLayer_2::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_tanh: + { + // Type of activation : TanH + // Mch->addArgument(ConstantInt::get( + // Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_3()); + } + break; + case Intrinsic::visc_tensor_relu: + { + // Type of activation : ReLU + // Mch->addArgument(ConstantInt::get( + // Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_3()); + } + break; + case Intrinsic::visc_tensor_clipped_relu: + { + // Type of activation : Clipped ReLU + // 
Mch->addArgument(ConstantInt::get( + // Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_3()); + } + break; + case Intrinsic::visc_tensor_pool_max: + { + // pool max + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 1; i < 7; i++) { + Mch->addArgument(II->getOperand(i)); + } + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_min: + { + // pool min FIXME: 2: supported? + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + // pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 1; i < 7; i++) { + Mch->addArgument(II->getOperand(i)); + } + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_mean: + { + // pool mean + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + // pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 1; i < 7; i++) { + Mch->addArgument(II->getOperand(i)); + } + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + Mch->addIntrinsicInst(II); + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + default: // No activation, No pooling, but HPVM intrinsic + Mch->setCurrent(new NoPattern()); + break; + } + } else { // End of instruction stream + // No pooling + // 0 for unused pool arguments: + // pool_id, pool_size_v, 
pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 0; i < 7; i++) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } + // No activation + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), -1)); + + Mch->setCurrent(new ConvolutionLayer()); + } + delete this; +} + +void ConvolutionLayer_3::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + switch (II->getIntrinsicID()) { + case Intrinsic::visc_tensor_pool_max: + { + // pool max + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + // pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 1; i < 7; i++) { + Mch->addArgument(II->getOperand(i)); + } + Mch->addIntrinsicInst(II); + + // Revisit last intrinsic, to add argument for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + } + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_min: + { + // pool min FIXME: 2: supported? 
+ Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + + // pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 1; i < 7; i++) { + Mch->addArgument(II->getOperand(i)); + } + Mch->addIntrinsicInst(II); + + // Revisit last intrinsic, to add argument for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + } + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + case Intrinsic::visc_tensor_pool_mean: + { + // pool mean + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + // pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 1; i < 7; i++) { + Mch->addArgument(II->getOperand(i)); + } + Mch->addIntrinsicInst(II); + + // Revisit last intrinsic, to add argument for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + 
Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + } + + Mch->setCurrent(new ConvolutionLayer_4()); + } + break; + default: // No pooling, but HPVM intrinsic + Mch->setCurrent(new NoPattern()); + break; + } + } else { // End of instruction stream + // No pooling + // 0 for unused pool arguments: + // pool_id, pool_size_v, pool_size_h, pool pad_v, + // pool_pad_h, pool_stride_v, pool_stride_h + for (int i = 0; i < 7; i++) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } + + // Revisit last intrinsic, to add argument for activation operation + IntrinsicInst *ActII = Mch->getIntrinsicInstAt(2); + // Due to previous switch, we know it is a TanH, ReLU, or Clipped ReLU + Intrinsic::ID ActIID = ActII->getIntrinsicID(); + if (ActIID == Intrinsic::visc_tensor_tanh) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 0)); + } else if (ActIID == Intrinsic::visc_tensor_relu) { + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 1)); + } else { //ActIID == Intrinsic::visc_tensor_clipped_relu + Mch->addArgument(ConstantInt::get( + Type::getInt32Ty(Mch->getModule()->getContext()), 2)); + } + + Mch->setCurrent(new ConvolutionLayer()); + } + delete this; +} + +void ConvolutionLayer_4::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (!II) { // End of instruction stream + Mch->setCurrent(new ConvolutionLayer()); + } else { + Mch->setCurrent(new NoPattern()); + } + delete this; +} + +void ConvolutionLayer::transition(CodeGenStateMachine *Mch, + IntrinsicInst *II) { + if (II) { // Not end of instruction stream + Mch->setCurrent(new NoPattern()); + delete this; + } +} + +void NoPattern::transition(CodeGenStateMachine *Mch, IntrinsicInst *II) {} + +CodeGenStateMachine::CodeGenStateMachine(Module *_M, Module *_RtM) : + M(_M), RtM(_RtM) { + current = new InitialState(); +} + +void 
CodeGenStateMachine::transition(IntrinsicInst *II) { + current->transition(this, II); +} + +void CodeGenStateMachine::codeGen(DFNode *N, Function *F, const StringRef &strRef, + InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) { + + assert( ( (current->getStateID() == AbstractState::ID::FULLY_CONNECTED_LAYER) || + (current->getStateID() == AbstractState::ID::CONVOLUTION_LAYER) || + (current->getStateID() == AbstractState::ID::SINGLE_TENSOR_OPERATION) ) && + "Unsupported instruction sequence for the Wrapper API.\n" ); + + if ((current->getStateID() == AbstractState::ID::FULLY_CONNECTED_LAYER) || + (current->getStateID() == AbstractState::ID::CONVOLUTION_LAYER)) { + + // Layer Operation. + DEBUG(errs() << "Layer Instruction Sequence. Validating ...\n"); + // We have a valid instruction sequence. + // Make sure that the instruction sequence can be translated: + // each instruction's result must be used only by the next one in sequence. + + for (unsigned p = 0; p < IIs.size()-1; p++) { + IntrinsicInst *II = IIs[p]; + assert((II->hasOneUse()) && + "Instruction sequence does not fit pattern: not single use\n"); + + Value::user_iterator ui = II->user_begin(); // The only use + assert((*ui == IIs[p+1]) && + "Instruction sequence does not fit pattern: not used by next instruction\n"); + } + + // Create corresponding wrapper API call + CallInst *CI; + switch (current->getStateID()) { + case AbstractState::ID::CONVOLUTION_LAYER: + { + Constant* wrapper_ConvLayer2 = + M->getOrInsertFunction(StringRef("wrapper_ConvLayer2"), + RtM->getFunction(StringRef("wrapper_ConvLayer2"))->getFunctionType()); + + DEBUG(errs() << *wrapper_ConvLayer2); + + // FIXME: get last (float) arguments from clipped relu intrinsic. 
For now, 0 + Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0)); + Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0)); + + + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + std::vector<Value*> UpdatedArgs; + UpdatedArgs.push_back(GEPConst); + for (unsigned i = 0; i < Args.size(); i++) { + UpdatedArgs.push_back(Args[i]); + } + // Create wrapper API function call + CI = CallInst::Create(wrapper_ConvLayer2, UpdatedArgs, ""); + } + break; + case AbstractState::ID::FULLY_CONNECTED_LAYER: + { + Constant* wrapper_FCLayer = + M->getOrInsertFunction(StringRef("wrapper_FCLayer"), + RtM->getFunction(StringRef("wrapper_FCLayer"))->getFunctionType()); + DEBUG(errs() << *wrapper_FCLayer); + + // FIXME: get last (float) arguments from clipped relu intrinsic. 
For now, 0 + Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0)); + Args.push_back(ConstantFP::get(Type::getFloatTy(M->getContext()), (double) 0)); + + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + std::vector<Value*> UpdatedArgs; + UpdatedArgs.push_back(GEPConst); + for (unsigned i = 0; i < Args.size(); i++) { + UpdatedArgs.push_back(Args[i]); + } + + // Create wrapper API function call + CI = CallInst::Create(wrapper_FCLayer, UpdatedArgs, ""); + } + break; + default: + llvm_unreachable("Unexpected CodeGenStateMachine State\n"); + break; + } + + // Insert new call and replace all uses of pattern result with + // the wrapper API call + IntrinsicInst *IIlast = *(IIs.rbegin()); + CI->insertBefore(IIlast); + IIlast->replaceAllUsesWith(CI); + + } + else { // SINGLE_TENSOR_OPERATION + assert((IIs.size() == 1) && + "Unexpected size of intrinsics vector in code gen state machine.\n"); + assert(Args.empty() && "Unexpected arguments found in coge gen state machine.\n"); + IntrinsicInst *TensorII = IIs[0]; + + errs() << "TensorII: " << *TensorII << "\n"; + + switch (TensorII->getIntrinsicID()) { + case Intrinsic::visc_tensor_group_convolution: + { /* llvm.hpvm.tensor.group.conv */ + // Tensor group conv is not in place. 
+ DEBUG(errs() << F->getName() << "\t: Handling tensor group convolution \n"); + + // Argument list for the runtime call + + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + Args.push_back(GEPConst); + + Args.push_back(TensorII->getOperand(0)); + Args.push_back(TensorII->getOperand(1)); + Args.push_back(TensorII->getOperand(2)); + Args.push_back(TensorII->getOperand(3)); + Args.push_back(TensorII->getOperand(4)); + Args.push_back(TensorII->getOperand(5)); + + Constant *conv_mode = ConstantInt::get(Type::getInt32Ty(M->getContext()), 1); + Args.push_back(conv_mode); + + Args.push_back(TensorII->getOperand(7)); + + // Create wrapper API runtime function call + Constant* wrapper_tensorGroupConvolution = + M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"), + RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType()); + CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution, + Args, "", TensorII); + // We can replace the call to hpvm.tensor.mul with the runtime call + TensorII->replaceAllUsesWith(CI); + } + break; + + case Intrinsic::visc_tensor_batchnorm: + { /* llvm.hpvm.tensor.batchnorm */ + + // Tensor batchnorm is not in place. 
+ // FIXME: Add Check for InPlace Analysis + DEBUG(errs() << F->getName() << "\t: Handling tensor batch normalization \n"); + + // Argument list for the runtime call + + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + Args.push_back(GEPConst); + + Args.push_back(TensorII->getOperand(0)); + Args.push_back(TensorII->getOperand(1)); + Args.push_back(TensorII->getOperand(2)); + Args.push_back(TensorII->getOperand(3)); + Args.push_back(TensorII->getOperand(4)); + Args.push_back(TensorII->getOperand(5)); + + // Create wrapper API runtime function call + Constant* wrapper_tensorBatchNorm = + M->getOrInsertFunction(StringRef("wrapper_tensorBatchNorm"), + RtM->getFunction(StringRef("wrapper_tensorBatchNorm"))->getFunctionType()); + CallInst* CI = CallInst::Create(wrapper_tensorBatchNorm, + Args, "", TensorII); + // We can replace the call to hpvm.tensor.batchnorm with the wrapper API call + TensorII->replaceAllUsesWith(CI); + } + break; + + case Intrinsic::visc_tensor_add: + { /* llvm.hpvm.tensor.add */ + DEBUG(errs() << F->getName() << "\t: Handling tensorAdd\n"); + + // Tensor add(a,b) is in place for argument a. + // Value *Op = TensorII->getOperand(0); + // Test the intrinsic operand for in place operation. 
+ // bool inplace = isValidOperandForInPlaceOperation(Op, F, N, IPP); + + // Code generation will not continue if this is false, because the target + // may provide an in place operation(safe choice) + // FIXME: remove this comment - must check for in-place + // assert(inplace && + // "Operand not valid for in place operation. Code gen aborted.\n"); + + + // Argument list for the runtime call + + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + Args.push_back(GEPConst); + + Args.push_back(TensorII->getOperand(0)); + Args.push_back(TensorII->getOperand(1)); + + // Create wrapper API runtime function call + Constant* wrapper_tensorAdd = + M->getOrInsertFunction(StringRef("wrapper_tensorAdd"), + RtM->getFunction(StringRef("wrapper_tensorAdd"))->getFunctionType()); + CallInst::Create(wrapper_tensorAdd, Args, "", TensorII); + // We can replace the call to hpvm.tensor.add with the 1st argument + // that, due to in place operation, now contains the result + TensorII->replaceAllUsesWith(TensorII->getOperand(0)); + } + break; + + case Intrinsic::visc_tensor_pool_max: + case Intrinsic::visc_tensor_pool_mean: + case Intrinsic::visc_tensor_pool_min: + { + DEBUG(errs() << F->getName() << "\t: Handling tensor pooling functions\n"); + + // Argument list for tensor pooling: + // input, poolFunction, window_height, window_width, + // vertical_pad, horizontal_pad, vertical_stride, horizontal_stride + + // Create string for node name, as first argument for wrapper API call + 
Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + Args.push_back(GEPConst); + + Args.push_back(TensorII->getOperand(0)); + + int pool_type = 0; + if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_pool_max) { + pool_type = 0; + } + if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean) { + pool_type = 1; + } + if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_pool_min) { + pool_type = 2; + } + + Constant *constPoolType = + ConstantInt::get(Type::getInt32Ty(M->getContext()), pool_type); + Args.push_back(constPoolType); + + Args.push_back(TensorII->getOperand(1)); + Args.push_back(TensorII->getOperand(2)); + Args.push_back(TensorII->getOperand(3)); + Args.push_back(TensorII->getOperand(4)); + Args.push_back(TensorII->getOperand(5)); + Args.push_back(TensorII->getOperand(6)); + + // Create wrapper API runtime function call + Constant* wrapper_tensorPooling = + M->getOrInsertFunction(StringRef("wrapper_tensorPooling"), + RtM->getFunction(StringRef("wrapper_tensorPooling"))->getFunctionType()); + DEBUG(errs() << *wrapper_tensorPooling); + CallInst* CI = CallInst::Create(wrapper_tensorPooling, Args, "", TensorII); + + // Replacing intrinsic result uses with the result of the tensor runtime operation + TensorII->replaceAllUsesWith(CI); + } + break; + + case Intrinsic::visc_tensor_relu: + case Intrinsic::visc_tensor_clipped_relu: + case Intrinsic::visc_tensor_tanh: + { + DEBUG(errs() << F->getName() << "\t: Handling tensor activation functions\n"); + + // Tensor relu(a) (and others) is in place for argument a. 
+ Value *Op = TensorII->getOperand(0); + + // Test the intrinsic operand for in place operation. + //-- bool inplace = isValidOperandForInPlaceOperation(Op, F, N, IPP); + // Code generation will not continue if this is false, because the target + // may provide an in place operation(safe choice) + //-- assert(inplace && + //-- "Operand not valid for in place operation. Code gen aborted.\n"); + + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + Args.push_back(GEPConst); + + Args.push_back(TensorII->getOperand(0)); + + if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_relu) { + // Create wrapper API runtime function call + Constant* wrapper_tensorRelu = + M->getOrInsertFunction(StringRef("wrapper_tensorRelu"), + RtM->getFunction(StringRef("wrapper_tensorRelu"))->getFunctionType()); + DEBUG(errs() << *wrapper_tensorRelu); + CallInst::Create(wrapper_tensorRelu, Args, "", TensorII); + } + else if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) { + // Create wrapper API runtime function call + Constant* wrapper_tensorClippedRelu = + M->getOrInsertFunction(StringRef("wrapper_tensorClippedRelu"), + RtM->getFunction(StringRef("wrapper_tensorClippedRelu"))->getFunctionType()); + DEBUG(errs() << *wrapper_tensorClippedRelu); + CallInst::Create(wrapper_tensorClippedRelu, Args, "", TensorII); + } + else if (TensorII->getIntrinsicID() == Intrinsic::visc_tensor_tanh) { + // Create wrapper API runtime function call + Constant* wrapper_tensorTanh = + 
M->getOrInsertFunction(StringRef("wrapper_tensorTanh"), + RtM->getFunction(StringRef("wrapper_tensorTanh"))->getFunctionType()); + DEBUG(errs() << *wrapper_tensorTanh); + CallInst::Create(wrapper_tensorTanh, Args, "", TensorII); + } + + // We can replace the call to hpvm.tensor.{relu,clipped relu, tanh} + // with the 1st argument that, due to in place operation, + // now contains the result + TensorII->replaceAllUsesWith(TensorII->getOperand(0)); + } + break; + + case Intrinsic::visc_tensor_softmax: + { /* llvm.visc.tensor.softmax */ + + DEBUG(errs() << F->getName() << "\t: Handling tensor softmax\n"); + // Tensor softmax(a) is in place for argument a. + Value *Op = TensorII->getOperand(0); + + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray = ConstantDataArray::getString(M->getContext(), + strRef, true); + GlobalVariable *GV = new GlobalVariable(*M,ConstArray->getType(), + true, GlobalValue::ExternalLinkage, ConstArray, ""); + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M->getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* GEPConst = + ConstantExpr::getGetElementPtr(GV->getType()->getPointerElementType(), + GV, GEPIndices); + + Args.push_back(GEPConst); + + Args.push_back(TensorII->getOperand(0)); + + // Create wrapper API runtime function call + Constant* wrapper_tensorSoftmax = + M->getOrInsertFunction(StringRef("wrapper_tensorSoftmax"), + RtM->getFunction(StringRef("wrapper_tensorSoftmax"))->getFunctionType()); + DEBUG(errs() << *wrapper_tensorSoftmax); + CallInst::Create(wrapper_tensorSoftmax, Args, "", TensorII); + // We can replace the call to hpvm.tensor.softmax with the 1st argument + // that, due to in place operation, now contains the result + TensorII->replaceAllUsesWith(TensorII->getOperand(0)); + } + break; + + + default: + llvm_unreachable("Unknown VISC Intrinsic!"); + break; + } + + } // No other case exists, since assertion passed 
+ + + // Remove the instructions we translated to the simulator call. + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around. + for (std::vector<IntrinsicInst *>::reverse_iterator ri = IIs.rbegin(), + re = IIs.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + (*ri)->eraseFromParent(); + } + + + for (std::vector<IntrinsicInst *>::reverse_iterator ri = IIs_remove.rbegin(), + re = IIs_remove.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + (*ri)->eraseFromParent(); + } + +} + +// DFG2LLVM_WrapperAPI - The first implementation. + +struct DFG2LLVM_WrapperAPI : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_WrapperAPI() : DFG2LLVM(ID) {} + + +private: + +public: + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addRequired<InPlaceDFGAnalysisWrapper>(); + AU.addPreserved<BuildDFG>(); + AU.addPreserved<InPlaceDFGAnalysisWrapper>(); + } + + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_WrapperAPI : public CodeGenTraversal { + +private: + //Member variables + unsigned nodeID; // Used as a node identifier + + std::string QuantizationInputsFilenameStr; + std::string ConfigurationInputsFilenameStr; + + InPlaceDFGAnalysis::InPlaceDFGParameter *IPP; + + // VISC Runtime API and Tensor runtime API + Constant* llvm_hpvm_initApproxhpvmRt; + Constant* llvm_hpvm_cleanupApproxhpvmRt; + Constant* hpvm_request_tensor; + + Constant* llvm_hpvm_initializeRuntimeController; + Constant* llvm_hpvm_clearRuntimeController; + + // Functions + + // Virtual Functions + void init(); + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + + // Constructor + CGT_WrapperAPI(Module &_M, BuildDFG &_DFG, + InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP, + std::string 
&_QuantizationInputsFilenameStr, + std::string &_ConfigurationInputsFilenameStr) + : CodeGenTraversal(_M, _DFG), IPP(&_IPP), + QuantizationInputsFilenameStr(_QuantizationInputsFilenameStr), + ConfigurationInputsFilenameStr(_ConfigurationInputsFilenameStr) { + nodeID = 0; + initRuntimeAPI(); + } + +}; + + +void CGT_WrapperAPI::init() { + // FIXME: what to do here? If anything? +} + +// Initialize the VISC runtime API. This makes it easier to insert these calls +void CGT_WrapperAPI::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n"); + + // FIXME: set correct path + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_runtime.ll"; + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + if(runtimeModule == nullptr) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n"); + + // Get or insert Global declarations for + // - initialization + // - cleanup + // - request a tensor + DECLARE(llvm_hpvm_initApproxhpvmRt); + DECLARE(llvm_hpvm_cleanupApproxhpvmRt); + DECLARE(hpvm_request_tensor); + + DECLARE(llvm_hpvm_initializeRuntimeController); + DECLARE(llvm_hpvm_clearRuntimeController); + + // Find visc.init and visc.cleanup calls, and add placeholder methods + // for initialization and cleanup of the hpvm tensor runtime + + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n"); + InitCall = cast<Instruction>(*VI->user_begin()); + CallInst::Create(llvm_hpvm_initApproxhpvmRt, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)), + "", InitCall); + + StringRef QRangesStrRef = StringRef(QuantizationInputsFilenameStr); + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray1 = 
ConstantDataArray::getString(M.getContext(), + QRangesStrRef, true); + GlobalVariable *GV1 = new GlobalVariable(M,ConstArray1->getType(), + true, GlobalValue::ExternalLinkage, ConstArray1, ""); + // Create GEP expression to access it + Constant* Int_0 = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + Constant* GEPIndices[] = { Int_0, Int_0 }; + Constant* QRangesGEPConst = + ConstantExpr::getGetElementPtr(GV1->getType()->getPointerElementType(), + GV1, GEPIndices); + + StringRef ConfsStrRef = StringRef(ConfigurationInputsFilenameStr); + // Create string for node name, as first argument for wrapper API call + Constant *ConstArray2 = ConstantDataArray::getString(M.getContext(), + ConfsStrRef, true); + GlobalVariable *GV2 = new GlobalVariable(M,ConstArray2->getType(), + true, GlobalValue::ExternalLinkage, ConstArray2, ""); + Constant* ConfsGEPConst = + ConstantExpr::getGetElementPtr(GV2->getType()->getPointerElementType(), + GV2, GEPIndices); + ArrayRef<Value*> RTCInitArgs = {ConfsGEPConst, QRangesGEPConst}; + CallInst::Create(llvm_hpvm_initializeRuntimeController, RTCInitArgs, "", InitCall); + + Function* VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n"); + CleanupCall = cast<Instruction>(*VC->user_begin()); + CallInst::Create(llvm_hpvm_cleanupApproxhpvmRt, ArrayRef<Value*>(), "", CleanupCall); + CallInst::Create(llvm_hpvm_clearRuntimeController, ArrayRef<Value*>(), "", CleanupCall); + +} + +void CGT_WrapperAPI::codeGen(DFInternalNode* N) { + errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"; + errs () << "Skipping internal node\n"; +} + +void CGT_WrapperAPI::codeGen(DFLeafNode* N) { + + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Abort code generation if it is an allocation node + if(N->isAllocationNode()) { + assert(false && "Allocation Node not expected in ApproxHPVM"); + return; 
+ } + + + // Increment the node ID, for current node. + ++nodeID; + + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); + errs() << "Node Function: " << *F << "\n"; + // Look up if we have visited this function before. If we have, then just + // get the cloned function pointer from DFNode. Otherwise, create the cloned + // function and add it to the DFNode GenFunc. + Function *F_wrapper_api = N->getGenFuncForTarget(visc::PROMISE_TARGET); + + assert((F_wrapper_api == NULL) && + "Error: Visiting a node for which code already generated"); + + // Clone the function + ValueToValueMapTy VMap; + std::string FName(F->getName().data());//Twine FName = F->getName(); + + + F_wrapper_api = CloneFunction(F, VMap); + F_wrapper_api->setName(FName+"_wrapper_api"); + F_wrapper_api->removeFromParent(); + M.getFunctionList().push_back(F_wrapper_api); + + N->addGenFunc(F_wrapper_api, visc::PROMISE_TARGET, true); + + /* Removing HPVM in/out/inout function attributes */ + for(Function::arg_iterator ai = F_wrapper_api->arg_begin(), ae = F_wrapper_api->arg_end(); + ai != ae; ai++){ + Argument *Arg = &*ai; + if(Arg->hasAttribute(Attribute::In)) + Arg->removeAttr(Attribute::In); + if(Arg->hasAttribute(Attribute::Out)) + Arg->removeAttr(Attribute::Out); + if(Arg->hasAttribute(Attribute::InOut)) + Arg->removeAttr(Attribute::InOut); + } + + // Adding nounwind to generated function : FIXME: needed? + DEBUG(errs() << "Adding nounwind to generated function\n"); + F_wrapper_api->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + + // Add llvm_visc_requestTensor calls for every pointer argument of the function + // (they are all expected to be tensors), at the beginning of the function. 
+ // This is the first instruction of the function, insert them before this + Instruction* FI = &*(F_wrapper_api->getEntryBlock().begin()); + + // FIXME: verify that we want 1 as a target device + // In this backend, the target device is GPU, represented by i32 1. + ConstantInt *TargetDeviceID = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 1); + + for (Function::arg_iterator ai = F_wrapper_api->arg_begin(), + ae = F_wrapper_api->arg_end(); ai != ae; ++ai) { + Argument* Arg = &*ai; + if (Arg->getType()->isPointerTy()) { + Value *Args[] = {Arg, TargetDeviceID}; + CallInst::Create(hpvm_request_tensor, + ArrayRef<Value*>(Args, 2), + "", FI); + } + } + + CodeGenStateMachine CGM(&M, runtimeModule.get()); + + for (inst_iterator i = inst_begin(F_wrapper_api), e = inst_end(F_wrapper_api); + i != e; ++i) { + Instruction *I = &(*i); + CGM.transition(dyn_cast<IntrinsicInst>(I)); + } + + errs() << "Node ID string: "<< StringRef(std::to_string(nodeID)) << "\n"; + //CGM.codeGen(N, F_wrapper_api, N->getFuncPointer()->getName(), *IPP); + CGM.codeGen(N, F_wrapper_api, StringRef(std::to_string(nodeID)), *IPP); + + return; +} + +bool DFG2LLVM_WrapperAPI::runOnModule(Module &M) { + + errs() << "\nDFG2LLVM_WrapperAPI PASS\n"; + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // Get the In Place Analysis Results + InPlaceDFGAnalysis::InPlaceDFGParameter IPP = + (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP(); + + + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Visitor for Code Generation Graph Traversal + CGT_WrapperAPI *CGTVisitor = new CGT_WrapperAPI(M, DFG, IPP, + QuantizationInputsFilename, + ConfigurationInputsFilename); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } + + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; + + 
+ return true; +} + + +/****************************************************************************** + * Helper functions * + ******************************************************************************/ + +/* Method needs to be called as part of an analysis pre-step, before code * + * generation is run on a node function, so that the HPVM intrinsics are still * + * in place. */ +bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N, + InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) { + + if (Argument *Arg = dyn_cast<Argument>(Op)) { + DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n"); + assert((Arg->getParent() == Fgen) && + "Extra Parameter in body of Function\n"); + // Candidate parameter is a function argument + // In this case, consult the result of in place analysis + // Find position in arg list + unsigned pos = Arg->getArgNo(); + // If this parameter cannot be used for in place operation + // code gen cannot continue + if (IPP.at(N)[pos]) { + DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n"); + return true; + } else { + DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n"); + return false; + } + } + else { + // If it is not an argument, then it needs to be the result of + // another intrinsic. These are new objects that are allocated, + // and consumed by next intrinsic. 
+ DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n"); + if (dyn_cast<IntrinsicInst>(Op)) { + DEBUG(errs() << *Arg << "\t: local, suitable for in place\n"); + return true; + } else { + DEBUG(errs() << *Arg << "\t: local, not suitable for in place\n"); + return false; + } + } +} + +} // End of namespace + +char DFG2LLVM_WrapperAPI::ID = 0; +static RegisterPass<DFG2LLVM_WrapperAPI> X("dfg2llvm-wrapperapi", + "Dataflow Graph to LLVM for WrapperAPI Pass", + false /* does not modify the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.exports b/lib/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt b/lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt new file mode 100644 index 0000000000..b4ebb8019d --- /dev/null +++ b/lib/DFG2LLVM_WrapperAPI/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/DFG2LLVM_WrapperAPI/LLVMBuild.txt -------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_WrapperAPI +parent = Transforms diff --git a/lib/DFG2LLVM_X86/CMakeLists.txt b/lib/DFG2LLVM_X86/CMakeLists.txt new file mode 100644 index 0000000000..6a78066c44 --- /dev/null +++ b/lib/DFG2LLVM_X86/CMakeLists.txt @@ -0,0 +1,11 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMDFG2LLVM_X86 + DFG2LLVM_X86.cpp + + DEPENDS intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp new file mode 100644 index 0000000000..b693bd0be4 --- /dev/null +++ b/lib/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -0,0 +1,2082 @@ +//===-------------------------- DFG2LLVM_X86.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "DFG2LLVM_X86" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Constant.h" +#include "llvm/SupportVISC/DFG2LLVM.h" + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +// VISC Command line option to use timer or not +static cl::opt<bool> +VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers")); +// Command line option to enable device abstraction or not +static cl::opt<bool> +DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, + cl::desc("Enable visc device abstraction")); + + +namespace { + +// Helper Functions +static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { + if (!isa<CallInst>(I)) + return false; + CallInst *CI = cast<CallInst>(I); + return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion"); +} + +CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { + for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) { + Instruction *I = &*ib; + if (isVISCCall_llvm_visc_policy_getVersion(I)) + return cast<CallInst>(I); + } + return NULL; +} + +// DFG2LLVM_X86 - The first implementation. 
+struct DFG2LLVM_X86 : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_X86() :DFG2LLVM(ID) {} + +private: + // Member variables + + // Functions + +public: + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_X86 : public CodeGenTraversal { + +private: + //Member variables + + Constant* malloc; + // VISC Runtime API + Constant* llvm_visc_x86_launch; + Constant* llvm_visc_x86_wait; + Constant* llvm_visc_x86_argument_ptr; + + Constant* llvm_visc_streamLaunch; + Constant* llvm_visc_streamPush; + Constant* llvm_visc_streamPop; + Constant* llvm_visc_streamWait; + Constant* llvm_visc_createBindInBuffer; + Constant* llvm_visc_createBindOutBuffer; + Constant* llvm_visc_createEdgeBuffer; + Constant* llvm_visc_createLastInputBuffer; + Constant* llvm_visc_createThread; + //Constant* llvm_visc_freeThreads; + Constant* llvm_visc_bufferPush; + Constant* llvm_visc_bufferPop; + Constant* llvm_visc_x86_dstack_push; + Constant* llvm_visc_x86_dstack_pop; + Constant* llvm_visc_x86_getDimLimit; + Constant* llvm_visc_x86_getDimInstance; + + //Functions + std::vector<IntrinsicInst*>* getUseList(Value* LI); + Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = ""); + void addDoWhileLoop(Instruction*, Instruction*, Value*); + void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*); + Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *); + Argument* getArgumentFromEnd(Function* F, unsigned offset); + Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, + Instruction* InsertBefore); + void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); + void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); + StructType* getArgumentListStructTy(DFNode*); + Function* createFunctionFilter(DFNode* C); + void startNodeThread(DFNode*, 
std::vector<Value*>, DenseMap<DFEdge*, Value*>, + Value*, Value*, Instruction*); + Function* createLaunchFunction(DFInternalNode*); + Function* createPushFunction(DFInternalNode*); + Function* createPopFunction(DFInternalNode*); + Function* createWaitFunction(DFInternalNode*); + + // Virtual Functions + void init() { + VISCTimer = VISCTimer_X86; + TargetName = "X86"; + } + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + Function* codeGenStreamPush(DFInternalNode* N); + Function* codeGenStreamPop(DFInternalNode* N); + +public: + // Constructor + CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) { + init(); + initRuntimeAPI(); + } + + void codeGenLaunch(DFInternalNode* Root); + void codeGenLaunchStreaming(DFInternalNode* Root); +}; + +bool DFG2LLVM_X86::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_X86 PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + //DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + + // Visitor for Code Generation Graph Traversal + CGT_X86 *CGTVisitor = new CGT_X86(M, DFG); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + // Go ahead and replace the launch intrinsic with pthread call, otherwise return now. + // TODO: Later on, we might like to do this in a separate pass, which would + // allow us the flexibility to switch between complete static code generation + // for DFG or having a customized runtime+scheduler + + // Do streaming code generation if root node is streaming. 
Usual otherwise + if(rootNode->isChildGraphStreaming()) + CGTVisitor->codeGenLaunchStreaming(rootNode); + else + CGTVisitor->codeGenLaunch(rootNode); + } + + delete CGTVisitor; + return true; +} + +// Initialize the VISC runtime API. This makes it easier to insert these calls +void CGT_X86::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); + + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/../build/projects/visc-rt/visc-rt.ll"; + + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + + if(runtimeModule == NULL) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded visc-rt API module\n"); + + // Get or insert the global declarations for launch/wait functions + DECLARE(llvm_visc_x86_launch); + DECLARE(malloc); + DECLARE(llvm_visc_x86_wait); + DECLARE(llvm_visc_x86_argument_ptr); + DECLARE(llvm_visc_streamLaunch); + DECLARE(llvm_visc_streamPush); + DECLARE(llvm_visc_streamPop); + DECLARE(llvm_visc_streamWait); + DECLARE(llvm_visc_createBindInBuffer); + DECLARE(llvm_visc_createBindOutBuffer); + DECLARE(llvm_visc_createEdgeBuffer); + DECLARE(llvm_visc_createLastInputBuffer); + DECLARE(llvm_visc_createThread); + //DECLARE(llvm_visc_freeThreads); + DECLARE(llvm_visc_bufferPush); + DECLARE(llvm_visc_bufferPop); + DECLARE(llvm_visc_x86_dstack_push); + DECLARE(llvm_visc_x86_dstack_pop); + DECLARE(llvm_visc_x86_getDimLimit); + DECLARE(llvm_visc_x86_getDimInstance); + + // Get or insert timerAPI functions as well if you plan to use timers + initTimerAPI(); + + // Insert init context in main + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); + DEBUG(errs() << "Inserting x86 timer initialization\n"); + Instruction* I = cast<Instruction>(*VI->user_begin()); + initializeTimerSet(I); + 
switchToTimer(visc_TimerID_NONE, I); + // Insert code for initializing the sceduling policy + Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init", + runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType())); + CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); + DEBUG(errs() << *IPCallInst << "\n"); + + // If device abstraction is enabled, we add a runtime call to start the + // device status simulation + if (DeviceAbstraction) { + Function *ID = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType())); + CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I); + DEBUG(errs() << *IDCallInst << "\n"); + } + + // Insert print instruction at visc exit + Function* VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); + + // Insert code for clearing the sceduling policy + I = cast<Instruction>(*VC->user_begin()); + IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear", + runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType())); + IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); + errs() << *IPCallInst << "\n"; + + DEBUG(errs() << "Inserting x86 timer print\n"); + printTimerSet(I); + + // If device abstraction is enabled, we add a runtime call to end the + // device status simulation + if (DeviceAbstraction) { + Function *ID = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType())); + CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I); + DEBUG(errs() << *IDCallInst << "\n"); + } + +} + +/* Returns vector of all wait instructions + */ +std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) { + std::vector<IntrinsicInst*>* UseList = new 
std::vector<IntrinsicInst*>(); + // It must have been loaded from memory somewhere + for(Value::user_iterator ui = GraphID->user_begin(), + ue = GraphID->user_end(); ui!=ue; ++ui) { + if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) { + UseList->push_back(waitI); + } + //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){ + //errs() << "Found PhiNode use of graphID\n"; + //std::vector<IntrinsicInst*>* phiUseList = getUseList(PN); + //UseList->insert(UseList->end(), phiUseList->begin(), phiUseList->end()); + //free(phiUseList); + //} + else { + llvm_unreachable("Error: Operation on Graph ID not supported!\n"); + } + } + return UseList; +} + +/* Traverse the function argument list in reverse order to get argument at a + * distance offset fromt he end of argument list of function F + */ +Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) { + assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) + && "Invalid offset to access arguments!"); + Function::arg_iterator e = F->arg_end(); + // Last element of argument iterator is dummy. Skip it. + e--; + Argument* arg; + for( ; offset != 0; e--) { + offset--; + arg = &*e; + } + return arg; +} + +/* Add Loop around the instruction I + * Algorithm: + * (1) Split the basic block of instruction I into three parts, where the + * middleblock/body would contain instruction I. + * (2) Add phi node before instruction I. 
Add incoming edge to phi node from + * predecessor + * (3) Add increment and compare instruction to index variable + * (4) Replace terminator/branch instruction of body with conditional branch + * which loops over bidy if true and goes to end if false + * (5) Update phi node of body + */ +void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart, + Instruction* BodyEnd, Value* TerminationCond) { + BasicBlock* Entry = CondBlockStart->getParent(); + BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); + BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); + BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); + + // Replace the terminator instruction of conditional with new conditional + // branch which goes to while.body if true and branches to while.end otherwise + BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); + ReplaceInstWithInst(CondBlock->getTerminator(), BI); + + // While Body should jump to condition block + BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock); + ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch); + +} + +Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, + BasicBlock *Body) { + Module *M = Entry->getParent()->getParent(); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + + // Insert a PHI instruction at the beginning of the condition block + Instruction *IB = Cond->getFirstNonPHI(); + PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB); + + ConstantInt *IConst = + ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); + Instruction *CounterIncr = + BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, + "cnt_incr", Body->getTerminator()); + + // Set incoming values for Phi node + IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true); + CounterPhi->addIncoming(IConst, Entry); + CounterPhi->addIncoming(CounterIncr, 
Body); + + // Return the pointer to the created PHI node in the corresponding argument + return CounterPhi; +} + +/* Add Loop around the instruction I + * Algorithm: + * (1) Split the basic block of instruction I into three parts, where the + * middleblock/body would contain instruction I. + * (2) Add phi node before instruction I. Add incoming edge to phi node from + * predecessor + * (3) Add increment and compare instruction to index variable + * (4) Replace terminator/branch instruction of body with conditional branch + * which loops over bidy if true and goes to end if false + * (5) Update phi node of body + */ +void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) { + BasicBlock* Entry = From->getParent(); + BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body"); + + // To Instruction should also belong to the same basic block as the From basic + // block will have a terminator instruction + assert(To->getParent() == ForBody + && "To Instruction should also belong to the same basic block!"); + BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end"); + + // Replace the terminator instruction of for.body with new conditional + // branch which loops over body if true and branches to for.end otherwise + BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond); + ReplaceInstWithInst(ForBody->getTerminator(), BI); + +} + +/* Add Loop around the instruction I + * Algorithm: + * (1) Split the basic block of instruction I into three parts, where the + * middleblock/body would contain instruction I. + * (2) Add phi node before instruction I. 
Add incoming edge to phi node from + * predecessor + * (3) Add increment and compare instruction to index variable + * (4) Replace terminator/branch instruction of body with conditional branch + * which loops over bidy if true and goes to end if false + * (5) Update phi node of body + */ +Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) { + BasicBlock* Entry = I->getParent(); + BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body"); + + BasicBlock::iterator i(I); + ++i; + Instruction* NextI = &*i; + // Next Instruction should also belong to the same basic block as the basic + // block will have a terminator instruction + assert(NextI->getParent() == ForBody + && "Next Instruction should also belong to the same basic block!"); + BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end"); + + + // Add Phi Node for index variable + PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), + 2, "index."+indexName, I); + + // Add incoming edge to phi + IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0), + Entry); + // Increment index variable + BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add, + IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1), + "index."+indexName+".inc", ForBody->getTerminator()); + + // Compare index variable with limit + CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, + limit, "cond."+indexName, ForBody->getTerminator()); + + // Replace the terminator instruction of for.body with new conditional + // branch which loops over body if true and branches to for.end otherwise + BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond); + ReplaceInstWithInst(ForBody->getTerminator(), BI); + + // Add incoming edge to phi node in body + IndexPhi->addIncoming(IndexInc, ForBody); + return IndexPhi; +} + +// Returns a packed struct type. 
The structtype is created by packing the input +// types, output types and isLastInput buffer type. All the streaming +// inputs/outputs are converted to i8*, since this is the type of buffer +// handles. +StructType* CGT_X86::getArgumentListStructTy(DFNode* C) { + std::vector<Type*> TyList; + // Input types + Function* CF = C->getFuncPointer(); + for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end(); + ai != ae; ++ai) { + if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge()) + TyList.push_back(Type::getInt8PtrTy(CF->getContext())); + else + TyList.push_back(ai->getType()); + } + // Output Types + StructType* OutStructTy = cast<StructType>(CF->getReturnType()); + for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) { + // All outputs of a node are streaming edge + assert(C->getOutDFEdgeAt(i)->isStreamingEdge() + && "All output edges of child node have to be streaming"); + TyList.push_back(Type::getInt8PtrTy(CF->getContext())); + } + // isLastInput buffer element + TyList.push_back(Type::getInt8PtrTy(CF->getContext())); + + StructType* STy = StructType::create(CF->getContext(), TyList, + Twine("struct.thread."+CF->getName()).str(), true); + return STy; + +} + +void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*> + EdgeBufferMap, Value* isLastInputBuffer, Value* graphID, + Instruction* IB) { + DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n"); + // Create a filter/pipeline function for the child node + Function* C_Pipeline = createFunctionFilter(C); + Function* CF = C->getFuncPointer(); + + // Get module context and i32 0 constant, as they would be frequently used in + // this function. + LLVMContext& Ctx = IB->getParent()->getContext(); + Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + + // Marshall arguments + // Create a packed struct type with inputs of C followed by outputs and then + // another i8* to indicate isLastInput buffer. 
Streaming inputs are replaced + // by i8* + // + StructType* STy = getArgumentListStructTy(C); + // Allocate the struct on heap *NOT* stack and bitcast i8* to STy* + CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)), + C->getFuncPointer()->getName()+".inputs", IB); + CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB); + //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB); + // Insert elements in the struct + DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n"); + // Marshall Inputs + for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) { + // Create constant int (i) + Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); + // Get Element pointer instruction + Value* GEPIndices[] = { IntZero, Int_i }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, + ArrayRef<Value*>(GEPIndices, 2), + Struct->getName()+".arg_"+Twine(i), + IB); + DFEdge* E = C->getInDFEdgeAt(i); + if (E->getSourceDF()->isEntryNode()) { + // This is a Bind Input Edge + if(E->isStreamingEdge()) { + // Streaming Bind Input edge. Get buffer corresponding to it + assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!"); + new StoreInst(EdgeBufferMap[E], GEP, IB); + } + else { + // Non-streaming Bind edge + new StoreInst(Args[i], GEP, IB); + } + } + else { + // This is an edge between siblings. + // This must be an streaming edge. As it is our assumption that all edges + // between two nodes in a DFG are streaming. 
+ assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!"); + new StoreInst(EdgeBufferMap[E], GEP, IB); + } + } + unsigned numInputs = CF->getFunctionType()->getNumParams(); + unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements(); + // Marshall Outputs + DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n"); + for(unsigned i = 0; i < numOutputs; i++ ) { + // Create constant int (i+numInputs) + Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs); + // Get Element pointer instruction + Value* GEPIndices[] = { IntZero, Int_i }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, + ArrayRef<Value*>(GEPIndices, 2), + Struct->getName()+".out_"+Twine(i), + IB); + DFEdge* E = C->getOutDFEdgeAt(i); + assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes"); + assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!"); + new StoreInst(EdgeBufferMap[E], GEP, IB); + } + // Marshall last argument. 
isLastInput buffer + DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n"); + // Create constant int (i+numInputs) + Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs); + // Get Element pointer instruction + Value* GEPIndices[] = { IntZero, Int_index }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, + ArrayRef<Value*>(GEPIndices, 2), + Struct->getName()+".isLastInput", IB); + new StoreInst(isLastInputBuffer, GEP, IB); + + // AllocaInst AI points to memory with all the arguments packed + // Call runtime to create the thread with these arguments + DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << *llvm_visc_createThread << "\n"); + DEBUG(errs() << *graphID->getType() << "\n"); + DEBUG(errs() << *C_Pipeline->getType() << "\n"); + DEBUG(errs() << *Struct->getType() << "\n"); + // Bitcast AI to i8* + CastInst* BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB); + Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI}; + CallInst* CreateThread = CallInst::Create(llvm_visc_createThread, + ArrayRef<Value*>(CreateThreadArgs, 3), + "", + IB); + +} + +Function* CGT_X86::createLaunchFunction(DFInternalNode* N) { + DEBUG(errs() << "Generating Streaming Launch Function\n"); + // Get Function associated with Node N + Function* NF = N->getFuncPointer(); + + // Map from Streaming edge to buffer + DenseMap<DFEdge*, Value*> EdgeBufferMap; + + /* Now we have all the necessary global declarations necessary to generate the + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. 
+ * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) + * (2) Extract each of inputs from data.addr + * (3) create Buffers for all the streaming edges + * - Put buffers in the context + * (4) Go over each child node + * - marshall its arguments together (use buffers in place of streaming + * arguments) + * - Start the threads + * (5) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ + // (1) Create Launch Function of type void (i8* args, i8* GraphID) + Type* i8Ty = Type::getInt8Ty(M.getContext()); + Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; + FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()), + ArrayRef<Type*>(ArgTypes, 2), false); + Function* LaunchFunc = Function::Create(LaunchFuncTy, + NF->getLinkage(), + NF->getName()+".LaunchFunction", + &M); + DEBUG(errs() << "Generating Code for Streaming Launch Function\n"); + // Give a name to the argument which is used pass data to this thread + Argument* data = &*LaunchFunc->arg_begin(); + Argument* graphID = &*(++LaunchFunc->arg_begin()); + data->setName("data.addr"); + graphID->setName("graphID"); + // Add a basic block to this empty function and a return null statement to it + DEBUG(errs() << *LaunchFunc->getReturnType() << "\n"); + BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); + ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(), + BB); + + DEBUG(errs() << "Created Empty Launch Function\n"); + + // (2) Extract each of inputs from data.addr + std::vector<Type*> TyList; + std::vector<std::string> names; + std::vector<Value*> Args; + + for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end(); + ai != ae; ++ai) { + if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) { + TyList.push_back(i8Ty->getPointerTo()); + names.push_back(Twine(ai->getName()+"_buffer").str()); + continue; + } + 
TyList.push_back(ai->getType()); + names.push_back(ai->getName()); + } + Args = extractElements(data, TyList, names, RI); + DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc << "\n"); + // (3) Create buffers for all the streaming edges + for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), + de = N->getChildGraph()->dfedge_end(); di != de; ++di) { + DFEdge* Edge = *di; + DEBUG(errs() << *Edge->getType() << "\n"); + Value* size = ConstantExpr::getSizeOf(Edge->getType()); + Value* CallArgs[] = {graphID, size}; + if (Edge->isStreamingEdge()) { + CallInst* CI; + // Create a buffer call + if(Edge->getSourceDF()->isEntryNode()) { + // Bind Input Edge + Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()), + Edge->getSourcePosition()); + Value* BindInCallArgs[] = {graphID, size, Int_ArgNo}; + CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3), + "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(), + RI); + } + else if(Edge->getDestDF()->isExitNode()) { + // Bind Output Edge + CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2), + "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(), + RI); + } + else { + // Streaming Edge + CI = CallInst::Create(llvm_visc_createEdgeBuffer, + ArrayRef<Value*>(CallArgs, 2), + Edge->getSourceDF()->getFuncPointer()->getName()+"." 
+                             +Edge->getDestDF()->getFuncPointer()->getName(),
+                             RI);
+      }
+      EdgeBufferMap[Edge] = CI;
+    }
+  }
+  // Create buffer for isLastInput for all the child nodes
+  DFGraph* G = N->getChildGraph();
+  DenseMap<DFNode*, Value*> NodeLastInputMap;
+  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
+    DFNode* child = *ci;
+    if(child->isDummyNode())
+      continue;
+    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value* CallArgs[] = {graphID, size};
+    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
+                                    "BindIn.isLastInput."+child->getFuncPointer()->getName(),
+                                    RI);
+    NodeLastInputMap[child] = CI;
+  }
+  DEBUG(errs() << "Start Each child node filter\n");
+  // (4) Marshall arguments for each child node and start the thread with its
+  // pipeline function
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    // Marshall all the arguments for this node into an i8*
+    // Pass to the runtime to create the thread
+    // Start the thread for child node C
+    startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
+  }
+
+  DEBUG(errs() << "Launch function:\n");
+  DEBUG(errs() << *LaunchFunc << "\n");
+
+  return LaunchFunc;
+}
+
+
+// Stub: streaming Push codegen is not implemented yet. Return nullptr
+// explicitly (the previous uninitialized local was undefined behavior) so
+// callers can reliably detect the missing implementation.
+Function* CGT_X86::createPushFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Push function\n");
+  Function* PushFunc = nullptr;
+  return PushFunc;
+}
+
+// Stub: streaming Pop codegen is not implemented yet (see createPushFunction).
+Function* CGT_X86::createPopFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Pop function\n");
+  Function* PushFunc = nullptr;
+  return PushFunc;
+}
+
+// Stub: streaming Wait codegen is not implemented yet (see createPushFunction).
+Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Wait function\n");
+  Function* PushFunc = nullptr;
+  return PushFunc;
+}
+/* This function does the steps necessary to launch a streaming graph
+ * Steps
+ * Create Pipeline/Filter function for each node in child graph
of Root + * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait + * Modify each of the instrinsic in host code + * Launch, Push, Pop, Wait + */ +void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) { + IntrinsicInst* LI = Root->getInstruction(); + Function* RootLaunch = createLaunchFunction(Root); + //Function* RootPush = createPushFunction(Root); + //Function* RootPop = createPopFunction(Root); + //Function* RootWait = createWaitFunction(Root); + // Substitute launch intrinsic main + DEBUG(errs() << "Substitute launch intrinsic\n"); + Value* LaunchInstArgs[] = {RootLaunch, + LI->getArgOperand(1) + }; + CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch, + ArrayRef<Value*>(LaunchInstArgs,2), + "graph"+Root->getFuncPointer()->getName(), LI); + //ReplaceInstWithInst(LI, LaunchInst); + + DEBUG(errs() << *LaunchInst << "\n"); + // Replace all wait instructions with x86 specific wait instructions + DEBUG(errs() << "Substitute wait, push, pop intrinsics\n"); + std::vector<IntrinsicInst*>* UseList = getUseList(LI); + for(unsigned i=0; i < UseList->size(); ++i) { + IntrinsicInst* II = UseList->at(i); + CallInst* CI; + Value* PushArgs[] = {LaunchInst, II->getOperand(1)}; + switch(II->getIntrinsicID()) { + case Intrinsic::visc_wait: + CI = CallInst::Create(llvm_visc_streamWait, + ArrayRef<Value*>(LaunchInst), + ""); + break; + case Intrinsic::visc_push: + CI = CallInst::Create(llvm_visc_streamPush, + ArrayRef<Value*>(PushArgs, 2), + ""); + break; + case Intrinsic::visc_pop: + CI = CallInst::Create(llvm_visc_streamPop, + ArrayRef<Value*>(LaunchInst), + ""); + break; + default: + llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); + }; + DEBUG(errs() << "Replace:\n\t" << *II << "\n"); + ReplaceInstWithInst(II, CI); + DEBUG(errs() << "\twith " << *CI << "\n"); + } + + +} + +void CGT_X86::codeGenLaunch(DFInternalNode* Root) { + // TODO: Place an assert to check if the constant passed by launch intrinsic + // as the number of 
arguments to DFG is same as the number of arguments of the + // root of DFG + DEBUG(errs() << "Generating Launch Function\n"); + // Get Launch Instruction + IntrinsicInst* LI = Root->getInstruction(); + switchToTimer(visc_TimerID_PTHREAD_CREATE, LI); + DEBUG(errs() << "Generating Launch Function\n"); + + /* Now we have all the necessary global declarations necessary to generate the + * Launch function, pointer to which can be passed to pthread utils to execute + * DFG. The Launch function has just one input: i8* data.addr + * This is the address of the all the input data that needs to be passed to + * this function. In our case it contains the input arguments of the Root + * function in the correct order. + * (1) Create an empty Launch function of type i8*(i8*) + * (2) Extract each of inputs from data.addr and pass them as arguments to the + * call to Root function + * (3) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ + // Create Launch Function of type i8*(i8*) which calls the root function + Type* i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), + ArrayRef<Type*>(i8Ty->getPointerTo()), + false); + Function* AppFunc = Function::Create(AppFuncTy, + Root->getFuncPointer()->getLinkage(), + "LaunchDataflowGraph", + &M); + DEBUG(errs() << "Generating Launch Function\n"); + // Give a name to the argument which is used pass data to this thread + Value* data = &*AppFunc->arg_begin(); + data->setName("data.addr"); + // Add a basic block to this empty function and a return null statement to it + BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc); + ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(), + Constant::getNullValue(AppFunc->getReturnType()), + BB); + switchToTimer(visc_TimerID_ARG_UNPACK, RI); + + DEBUG(errs() << "Created Empty Launch Function\n"); + // Find the X86 function generated for Root and +// Function* 
RootF_X86 = Root->getGenFunc(); + Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); + assert(RootF_X86 && "Error: No generated CPU function for Root node\n"); + assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) && + "Error: Generated Function for Root node with no x86 wrapper\n"); + + // Generate a call to RootF_X86 with null parameters for now + std::vector<Value*>Args; + for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) { + Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i))); + } + CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI); + + // Extract input data from i8* data.addr and patch them to correct argument of + // call to RootF_X86. For each argument + std::vector<Type*> TyList; + std::vector<std::string> names; + for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end(); + ai != ae; ++ai) { + TyList.push_back(ai->getType()); + names.push_back(ai->getName()); + } + std::vector<Value*> elements = extractElements(data, TyList, names, CI); + // Patch the elements to the call arguments + for(unsigned i=0; i<CI->getNumArgOperands(); i++) + CI->setArgOperand(i, elements[i]); + + // Add timers around Call to RootF_X86 function + switchToTimer(visc_TimerID_COMPUTATION, CI); + switchToTimer(visc_TimerID_OUTPUT_PACK, RI); + + // Code for returning the output + CastInst* OutputAddrCast = CastInst::CreatePointerCast(data, + CI->getType()->getPointerTo(), + CI->getName()+".addr", + RI); + new StoreInst(CI, OutputAddrCast, RI); + switchToTimer(visc_TimerID_NONE, RI); + + DEBUG(errs() << "Application specific function:\n"); + DEBUG(errs() << *AppFunc << "\n"); + + // Substitute launch intrinsic main + Value* LaunchInstArgs[] = {AppFunc, + LI->getArgOperand(1) + }; + CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch, + ArrayRef<Value*>(LaunchInstArgs,2), + "graph"+Root->getFuncPointer()->getName(), LI); + //ReplaceInstWithInst(LI, 
LaunchInst); + + DEBUG(errs() << *LaunchInst << "\n"); + // Replace all wait instructions with x86 specific wait instructions + std::vector<IntrinsicInst*>* UseList = getUseList(LI); + for(unsigned i=0; i < UseList->size(); ++i) { + IntrinsicInst* II = UseList->at(i); + CallInst* CI; + switch(II->getIntrinsicID()) { + case Intrinsic::visc_wait: + CI = CallInst::Create(llvm_visc_x86_wait, + ArrayRef<Value*>(LaunchInst), + ""); + break; + case Intrinsic::visc_push: + CI = CallInst::Create(llvm_visc_bufferPush, + ArrayRef<Value*>(LaunchInst), + ""); + break; + case Intrinsic::visc_pop: + CI = CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value*>(LaunchInst), + ""); + break; + default: + llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); + }; + ReplaceInstWithInst(II, CI); + DEBUG(errs() << *CI << "\n"); + } + +} + +Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) { + // TODO: Assumption is that each input port of a node has just one + // incoming edge. May change later on. + + // Find the incoming edge at the requested input port + DFEdge* E = Child->getInDFEdgeAt(i); + assert(E && "No incoming edge or binding for input element!"); + // Find the Source DFNode associated with the incoming edge + DFNode* SrcDF = E->getSourceDF(); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition()); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a sibling + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) + && "Source node call not found. 
Dependency violation!");
+
+    // Find CallInst associated with the Source DFNode using OutputMap
+    Value* CI = OutputMap[SrcDF];
+
+    // Extract element at source position from this call instruction
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                           "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+// Generate a call to the CPU (x86) version of child node C inside F_X86.
+// Input values are resolved through C's incoming dataflow edges; six i64
+// zeros are appended as placeholders for the index/limit arguments that the
+// loop-insertion code below fills in via setArgOperand.
+void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
+                              ValueToValueMapTy &VMap,Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+//  Function* CF_X86 = C->getGenFunc();
+  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(CF_X86 != NULL
+         && "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function to be called from x86 backend is not an x86 function\n");
+  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
+
+  std::vector<Value*> Args;
+  // Create argument list to pass to call instruction
+  // First find the correct values using the edges
+  // The remaining six values are inserted as constants for now.
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_X86, IB));
+  }
+
+  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
+    Args.push_back(I64Zero);
+
+  errs() << "Gen Function type: " << *CF_X86->getType() << "\n";
+  errs() << "Node Function type: " << *CF->getType() << "\n";
+  errs() << "Arguments: " << Args.size() << "\n";
+
+  // Call the F_X86 function associated with this node
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
+  DEBUG(errs() << *CI << "\n");
+  OutputMap[C] = CI;
+
+  // Find num of dimensions this node is replicated in.
+ // Based on number of dimensions, insert loop instructions + std::string varNames[3] = {"x", "y", "z"}; + unsigned numArgs = CI->getNumArgOperands(); + for(unsigned j=0; j < C->getNumOfDim(); j++) { + Value* indexLimit = NULL; + // Limit can either be a constant or an arguement of the internal node. + // In case of constant we can use that constant value directly in the + // new F_X86 function. In case of an argument, we need to get the mapped + // value using VMap + if(isa<Constant>(C->getDimLimits()[j])) { + indexLimit = C->getDimLimits()[j]; + DEBUG(errs() << "In Constant case:\n" + << " indexLimit type = " << *indexLimit->getType() << "\n"); + } + else { + indexLimit = VMap[C->getDimLimits()[j]]; + DEBUG(errs() << "In VMap case:" + <<" indexLimit type = " << *indexLimit->getType() << "\n"); + } + assert(indexLimit && "Invalid dimension limit!"); + // Insert loop + Value* indexVar = addLoop(CI, indexLimit, varNames[j]); + DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n"); + // Insert index variable and limit arguments + CI->setArgOperand(numArgs-6+j, indexVar); + CI->setArgOperand(numArgs-3+j, indexLimit); + } + // Insert call to runtime to push the dim limits and instanceID on the depth + // stack + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim + CI->getArgOperand(numArgs-3+0), // limitX + CI->getArgOperand(numArgs-6+0), // iX + CI->getArgOperand(numArgs-3+1), // limitY + CI->getArgOperand(numArgs-6+1), // iY + CI->getArgOperand(numArgs-3+2), // limitZ + CI->getArgOperand(numArgs-6+2) // iZ + }; + + CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI); + DEBUG(errs() << "Push on stack: " << *Push << "\n"); + // Insert call to runtime to pop the dim limits and instanceID from the depth + // stack + BasicBlock::iterator i(CI); + ++i; + Instruction* NextI = &*i; + // Next Instruction should also belong to the same basic block as the basic + // 
block will have a terminator instruction + assert(NextI->getParent() == CI->getParent() + && "Next Instruction should also belong to the same basic block!"); + + CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); + DEBUG(errs() << "Pop from stack: " << *Pop << "\n"); + DEBUG(errs() << *CI->getParent()->getParent()); +} + +/* This function takes a DFNode, and creates a filter function for it. By filter + * function we mean a function which keeps on getting input from input buffers, + * applying the function on the inputs and then pushes data on output buffers + */ +// Create a function with void* (void*) type. +// Create a new basic block +// Add a return instruction to the basic block +// extract arguments from the aggregate data input. Type list would be +// Replace the streaming inputs with i8* types signifying handle to +// corresponding buffers +// Add a boolean argument isLastInput +// Add runtime API calls to get input for each of the streaming inputs +// Add a call to the generated function of the child node +// Add runtime API calls to push output for each of the streaming outputs +// Add loop around the basic block, which exits the loop if isLastInput is false + +Function* CGT_X86::createFunctionFilter(DFNode* C) { + DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n"); + + /* Create a function with same argument list as child.*/ + DEBUG(errs() << "\tCreate a function with the same argument list as child\n"); + // Get the generated function for child node + Function* CF = C->getFuncPointer(); + // Create Filter Function of type i8*(i8*) which calls the root function + Type* i8Ty = Type::getInt8Ty(M.getContext()); + FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(), + ArrayRef<Type*>(i8Ty->getPointerTo()), + false); + Function* CF_Pipeline = Function::Create(CF_PipelineTy, + CF->getLinkage(), + CF->getName()+"_Pipeline", + &M); + DEBUG(errs() << "Generating 
Pipline Function\n"); + // Give a name to the argument which is used pass data to this thread + Value* data = &*CF_Pipeline->arg_begin(); + data->setName("data.addr"); + // Create a new basic block + DEBUG(errs() << "\tCreate new BB and add a return function\n"); + // Add a basic block to this empty function + BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline); + // Add a return instruction to the basic block + ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(), + UndefValue::get(CF_Pipeline->getReturnType()), BB); + + + /* Extract the elements from the aggregate argument to the function. + * Replace the streaming inputs with i8* types signifying handle to + * corresponding buffers + * Add outputs to the list as well + * Add isLastInput to the list + */ + DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n"); + // These Args will be used when passing arguments to the generated function + // inside loop, and reading outputs as well. + std::vector<Value*> Args; + std::vector<Type*> TyList; + std::vector<std::string> names; + // Adding inputs + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + TyList.push_back(i8Ty->getPointerTo()); + names.push_back((Twine(i->getName())+"_buffer").str()); + } + else { + TyList.push_back(i->getType()); + names.push_back(i->getName()); + } + } + // Adding outputs. 
FIXME: Since we assume all outputs to be streaming edges,
+  // because we get their buffer handles
+  StructType* RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
+    TyList.push_back(i8Ty->getPointerTo());
+    names.push_back("out");
+  }
+  /* Add a boolean argument isLastInput */
+  DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
+  TyList.push_back(i8Ty->getPointerTo());
+  names.push_back("isLastInput_buffer");
+
+  // Extract the inputs, outputs and isLastInput
+  Args = extractElements(data, TyList, names, RI);
+  for(unsigned i=0; i<Args.size(); i++) {
+    DEBUG(errs() << *Args[i] << "\n");
+  }
+
+  // Split the Args vector into input, output and isLastInput
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = RetTy->getNumElements();
+  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
+  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
+
+  /* Add runtime API calls to get input for each of the streaming input edges */
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
+  // First read the termination condition variable isLastInput
+  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
+                                              ArrayRef<Value*>(isLastInput),
+                                              "",
+                                              RI);
+
+  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
+                                Type::getInt64Ty(CF_Pipeline->getContext()),
+                                false,
+                                "isLastInput",
+                                RI);
+  isLastInput = BI;
+  // Create a loop termination condition
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
+      RI);
+
+  // Get input from buffers of all the incoming streaming edges
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    
if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value*>(InputArgs[i->getArgNo()]), + "", + RI); + CastInst* BI; + if(i->getType()->isPointerTy()) { + BI = CastInst::Create(CastInst::IntToPtr, + bufferIn, + i->getType(), + i->getName()+".addr", + RI); + } + else if(i->getType()->isFloatTy()) { + BI = CastInst::CreateFPCast(bufferIn, + i->getType(), + i->getName()+".addr", + RI); + } + else { + BI = CastInst::CreateIntegerCast(bufferIn, + i->getType(), + false, + i->getName()+".addr", + RI); + } + // Replace the argument in Args vector. We would be using the vector as + // parameters passed to the call + InputArgs[i->getArgNo()] = BI; + } + } + /* Add a call to the generated function of the child node */ + DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); +// DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); +// CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, +// C->getGenFunc()->getName()+".output", RI); + Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); + DEBUG(errs() << "Type: " + << *CGenF->getType() + << "\n"); + CallInst* CI = CallInst::Create(CGenF, + InputArgs, + CGenF->getName()+".output", + RI); + + /* Add runtime API calls to push output for each of the streaming outputs */ + // FIXME: Assumption + // All edges between siblings are streaming edges + DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n"); + for (unsigned i=0; i< numOutputs; i++) { + // Extract output + ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), + "",RI); + // Convert to i64 + CastInst* BI; + if(EI->getType()->isPointerTy()) + BI = CastInst::Create(CastInst::PtrToInt,EI, + Type::getInt64Ty(CF_Pipeline->getContext()), + "", + RI); + else + BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()), + false, "", RI); + // Push to Output buffer + Value* 
bufferOutArgs[] = {OutputArgs[i], BI}; + CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush, + ArrayRef<Value*>(bufferOutArgs, 2), + "", + RI); + } + + // Add loop around the basic block, which exits the loop if isLastInput is false + //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond); +// addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(), +// RI, Cond); + + // Add loop around the basic block, which exits the loop if isLastInput is false + // Pointers to keep the created loop structure + BasicBlock *EntryBB, *CondBB, *BodyBB; + Instruction *CondStartI = cast<Instruction>(isLastInputPop); + Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode(); + EntryBB = CondStartI->getParent(); + + addWhileLoop(CondStartI, BodyStartI, RI, Cond); + CondBB = CondStartI->getParent(); + BodyBB = CI->getParent(); + Instruction *CntI = NULL; + CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF); + + // If the node function calls the visc runtime call to get policy, we update + // it with the counter information. 
This means we need to pass an additional + // argument to the generated function, that is the iteration number, and then + // use it as an argument to the policy_getVersion call + if (GetPolicyCI) { + CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB); + assert(CntI && "Counter instruction not found\n"); + + // Create new function type (with additional argument for iteration number) + Type *NewRetTy = CGenF->getFunctionType()->getReturnType(); + std::vector<Type*> NewArgTypes; + for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end(); + ai != ae ; ++ai) { + NewArgTypes.push_back(ai->getType()); + } + NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); + FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false); + Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false); + // At least one (the last) argument exists (we added it) + Function::arg_iterator ae = NewCGenF->arg_end(); + --ae; + Argument *CntArg = &*ae; + CntArg->setName("iteration"); + // Replace the old cpu gen func with this one + C->addGenFunc(NewCGenF, visc::CPU_TARGET, true); + + // Add counter to the actual parameter list, to create the new call + InputArgs.push_back(CntI); + CallInst* newCI = CallInst::Create(NewCGenF, + InputArgs, + NewCGenF->getName()+".output"); + ReplaceInstWithInst(CI, newCI); + + // Set second operand of the policy_getVersion call to the last function + // argument + GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF); + GetPolicyCI->setArgOperand(1, CntArg); + } + + // Return the Function pointer + DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n"); + DEBUG(errs() << *CF_Pipeline << "\n"); + return CF_Pipeline; +} + +void CGT_X86::codeGen(DFInternalNode* N) { + // Check if N is root node and its graph is streaming. We do not do codeGen + // for Root in such a case + if(N->isRoot() && N->isChildGraphStreaming()) + return; + + // Check if clone already exists. 
If it does, it means we have visited this + // function before and nothing else needs to be done for this leaf node. +// if(N->getGenFunc() != NULL) +// return; + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; + return; + } + + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + "Error: Visiting a node for which code already generated\n"); + + // Sort children in topological order before code generation + N->getChildGraph()->sortChildren(); + + // Only process if all children have a CPU x86 function + // Otherwise skip to end + bool codeGen = true; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) { + errs() << "No CPU x86 version for child node " + << C->getFuncPointer()->getName() + << "\n Skip code gen for parent node " + << N->getFuncPointer()->getName() << "\n"; + codeGen = false; + } + } + + if (codeGen) { + Function* F = N->getFuncPointer(); + // Create of clone of F with no instructions. Only the type is the same as F + // without the extra arguments. + Function* F_X86; + + // Clone the function, if we are seeing this function for the first time. We + // only need a clone in terms of type. + ValueToValueMapTy VMap; + + // Create new function with the same type + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + + // Loop over the arguments, copying the names of arguments over. + Function::arg_iterator dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); // Copy the name over... 
+ // Increment dest iterator + ++dest_iterator; + } + + // Add a basic block to this empty function + BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86); + ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), + UndefValue::get(F_X86->getReturnType()), BB); + + // Add Index and Dim arguments except for the root node and the child graph of + // parent node is not streaming + if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + F_X86 = addIdxDimArgs(F_X86); + + BB = &*F_X86->begin(); + RI = cast<ReturnInst>(BB->getTerminator()); + + //Add generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); + + // Loop over the arguments, to create the VMap. + dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + // Add mapping and increment dest iterator + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + + // Iterate over children in topological order + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + // Create calls to CPU function of child node + invokeChild_X86(C, F_X86, VMap, RI); + + } + + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); + // Generate code for output bindings + // Get Exit node + DFNode* C = N->getChildGraph()->getExit(); + // Get OutputType of this node + StructType* OutTy = N->getOutputType(); + Value *retVal = UndefValue::get(F_X86->getReturnType()); + // Find all the input edges to exit node + for (unsigned i=0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); + // Find the incoming edge at the requested input port + DFEdge* E = C->getInDFEdgeAt(i); + + assert(E && "No Binding for output element!"); + // Find the Source DFNode associated with the incoming edge + 
DFNode* SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(F_X86, i); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a internal node + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); + + // Find Output Value associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); + inputVal = EI; + } + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + } + DEBUG(errs() << "Extracted all\n"); + retVal->setName("output"); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); + + } + + //-------------------------------------------------------------------------// + // Here, we need to check if this node (N) has more than one versions + // If so, we query the policy and have a call to each version + // If not, we see which version exists, check that it is in fact an x86 + // function and save it as the CPU_TARGET function + + // TODO: visc_id per node, so we can use this for id for policies + // For now, use node function name and change it later + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool 
CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + + if (N->getTag() == visc::None) { + // No code is available for this node. This (usually) means that this + // node is a node that + // - from the accelerator backends has been mapped to an intermediate + // node, and thus they have not produced a genFunc + // - a child node had no CPU hint, thus no code gen for CPU could + // take place + errs() << "No GenFunc - Skipping CPU code generation for node " + << N->getFuncPointer()->getName() << "\n"; + } else if (viscUtils::isSingleTargetTag(N->getTag())) { + // There is a single version for this node according to code gen hints. + // Therefore, we do not need to check the policy, we simply use the + // available implementation, whichever target it is for. 
+ + // Sanity check - to be removed TODO + switch (N->getTag()) { + case visc::CPU_TARGET: + assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::GPU_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::SPIR_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && ""); + break; + default: + assert(false && "Unreachable: we checked that tag was single target!\n"); + break; + } + + // If device abstraction is enabled, then we may need to edit the node + // function. 
In case this is a GPU or SPIR gen func, we issue a call to + // the runtime that waits for the device to be available + if (DeviceAbstraction) { + Function *NodeGenFunc = NULL; + switch (N->getTag()) { + case visc::GPU_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); + break; + case visc::SPIR_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET); + break; + default: + break; + } + + if (NodeGenFunc) { + // If we found a function to edit, we add the call to the runtime as + // its first statement + BasicBlock *BB = &*NodeGenFunc->begin(); + std::vector<Value *> Args; // TODO: add the device type as argument? + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); + } + + } + + Function *Ftmp = N->getGenFuncForTarget(N->getTag()); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->removeGenFuncForTarget(visc::SPIR_TARGET); + N->setTag(visc::None); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); + + // Sanity checks - to be removed TODO + CF = N->getGenFuncForTarget(visc::CPU_TARGET); + GF = N->getGenFuncForTarget(visc::GPU_TARGET); + SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "After editing\n"; + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? 
SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + // assert(false && "got to the point where we have to select\n"); + } else { + // We have more than one targets + + errs() << "Node Name (for policy) : " + << N->getFuncPointer()->getName() << "\n"; + + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + // These assertions express what we can support with the current runtime. + // Code generation works the same way even for other target combinations. + // For now, we want either CPU and GPU, or CPU and SPIR + assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n"); + assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) && + "Generated functions without appropriate x86 wrapper\n"); + + FunctionType *FT = CF->getFunctionType(); + if (GF) + assert(FT == GF->getFunctionType() && + "Type mismatch between generated functions for GPU and CPU targets.\n"); + if (SF) + assert(FT == SF->getFunctionType() && + "Type mismatch between generated functions for SPIR and CPU targets.\n"); + + // Code generation of wrapper function + Function *F_wrapper; + ValueToValueMapTy VMap; + F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M); + + // Copy argument names over + Function::arg_iterator dest_iterator = F_wrapper->arg_begin(); + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + // Gather all arguments of wrapper in a vector, to prepare the call to + // the individual gen functions + std::vector<Value *> GenFuncCallArgs; + for (Function::arg_iterator i = 
F_wrapper->arg_begin(), e = F_wrapper->arg_end(); + i != e; ++i) { + GenFuncCallArgs.push_back(&*i); + } + + BasicBlock *BBcurrent, *BBtrue, *BBfalse; + + BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper); + + StringRef FName = N->getFuncPointer()->getName(); + size_t nameSize = FName.size()+1; + std::vector<Constant *> NameV; + for (char c: FName) { + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c)); + } + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0')); + ArrayType *NameType = + ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize); + AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent); + Constant *NameConst = ConstantArray::get(NameType, NameV); + StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent); + CastInst *BI = BitCastInst::CreatePointerCast(AI, + Type::getInt8PtrTy(M.getContext()), "", BBcurrent); + std::vector<Value *> Args; + Args.push_back(BI); + Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion", + runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent); + + ConstantInt *CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true); + CmpInst *CmpI = CmpInst::Create(Instruction::ICmp, + CmpInst::ICMP_EQ, + RTFInst, CmpConst, + "", BBcurrent); + + BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper); + BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue); + ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + + // Switch basic block pointers + BBcurrent = BBfalse; + if (GF) { + // We have a GPU version. 
Generate policy check and call + CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true); + CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + RTFInst, CmpConst, "", BBcurrent); + BBtrue = BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper); + BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue); + RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + + if (DeviceAbstraction) { + // Prepare arguments and function for call to wait for device runtime call + std::vector<Value *> Args; // TODO: add the device type as argument? + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); + } + } + + // Switch basic block pointers + BBcurrent = BBfalse; + if (SF) { + // We have a GPU version. Generate policy check and call + CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true); + CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + RTFInst, CmpConst, "", BBcurrent); + BBtrue = BasicBlock::Create(M.getContext(), "version_spir", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper); + BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue); + RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + + if (DeviceAbstraction) { + // Prepare arguments and function for call to wait for device runtime call + std::vector<Value *> Args; // TODO: add the device type as argument? 
+ Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); + } + } + + RI = ReturnInst::Create(M.getContext(), + UndefValue::get(FT->getReturnType()), BBfalse); + + // Now, make the node cpu gen func to be this one + // Remove all other versions and update the tag + N->addGenFunc(F_wrapper, visc::CPU_TARGET, true); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->removeGenFuncForTarget(visc::SPIR_TARGET); + N->setTag(visc::CPU_TARGET); + + // assert(false && "got to the point where we have to combine\n"); + } + +} + +// Code generation for leaf nodes +void CGT_X86::codeGen(DFLeafNode* N) { + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // At this point, the X86 backend does not support code generation for + // the case where allocation node is used, so we skip. This means that a + // CPU version will not be created, and therefore code generation will + // only succeed if another backend (nvptx or spir) has been invoked to + // generate a node function for the node including the allocation node. + if (N->isAllocationNode()) { + DEBUG(errs() << "Skipping allocation node\n"); + return; + } + + // Check if clone already exists. If it does, it means we have visited this + // function before and nothing else needs to be done for this leaf node. +// if(N->getGenFunc() != NULL) +// return; + + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; + + errs() << "Check for cudnn or promise hint for node " + << N->getFuncPointer()->getName() << "\n"; + + switch (N->getTag()) { + case visc::CUDNN_TARGET: { + errs() << "CUDNN hint found. 
Store CUDNN function as CPU funtion.\n"; + // Make sure there is a generated x86 function for cudnn + assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && ""); + // Store the CUDNN x86 function as the CPU generated function + Function *Ftmp = N->getGenFuncForTarget(N->getTag()); + // after adding the required number of arguments + if (!N->getParent()->isChildGraphStreaming()) + Ftmp = addIdxDimArgs(Ftmp); + + N->removeGenFuncForTarget(visc::CUDNN_TARGET); + N->setTag(visc::None); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); + break; + } + case visc::PROMISE_TARGET: { + errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n"; + // Make sure there is a generated x86 function for promise + assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && ""); + // Store the PROMISE x86 function as the CPU generated function + Function *Ftmp = N->getGenFuncForTarget(N->getTag()); + // after adding the required number of arguments + if (!N->getParent()->isChildGraphStreaming()) + Ftmp = addIdxDimArgs(Ftmp); + + N->setTag(visc::None); + N->removeGenFuncForTarget(visc::PROMISE_TARGET); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); + break; + } + case visc::GPU_TARGET: + // A leaf node should not have an x86 function for GPU + // by design of DFG2LLVM_NVPTX backend + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + break; + case visc::SPIR_TARGET: + // A leaf node should not have an x86 function for SPIR + // by design of DFG2LLVM_SPIR backend + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + default: + break; + } + + return; + } + + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + "Error: Visiting a node for which code already generated\n"); + + std::vector<IntrinsicInst *> IItoRemove; + std::vector<std::pair<IntrinsicInst 
*, Value *> > IItoReplace; + BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; + + // Get the function associated woth the dataflow node + Function *F = N->getFuncPointer(); + + // Clone the function, if we are seeing this function for the first time. + Function *F_X86; + ValueToValueMapTy VMap; + F_X86 = CloneFunction(F, VMap); + F_X86->removeFromParent(); + // Insert the cloned function into the module + M.getFunctionList().push_back(F_X86); + + // Add the new argument to the argument list. Add arguments only if the cild + // graph of parent node is not streaming + if(!N->getParent()->isChildGraphStreaming()) + F_X86 = addIdxDimArgs(F_X86); + + // Add generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); + + // Go through the arguments, and any pointer arguments with in attribute need + // to have x86_argument_ptr call to get the x86 ptr of the argument + // Insert these calls in a new BB which would dominate all other BBs + // Create new BB + BasicBlock* EntryBB = &*F_X86->begin(); + BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); + BranchInst* Terminator = BranchInst::Create(EntryBB, BB); + // Insert calls + for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); + ai != ae; ++ai) { + if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) { + assert(ai->getType()->isPointerTy() + && "Only pointer arguments can have visc in/out attributes "); + Function::arg_iterator aiNext = ai; + ++aiNext; + Argument* size = &*aiNext; + assert(size->getType() == Type::getInt64Ty(M.getContext()) + && "Next argument after a pointer should be an i64 type"); + CastInst* BI = BitCastInst::CreatePointerCast(&*ai, + Type::getInt8PtrTy(M.getContext()), + ai->getName()+".i8ptr", + Terminator); + Value* ArgPtrCallArgs[] = {BI, size}; + CallInst::Create(llvm_visc_x86_argument_ptr, + ArrayRef<Value*>(ArgPtrCallArgs, 2), + "", + Terminator); + 
+ } + } + errs() << *BB << "\n"; + + // Go through all the instructions + for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { + Instruction *I = &(*i); + DEBUG(errs() << *I << "\n"); + // Leaf nodes should not contain VISC graph intrinsics or launch + assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + + if (BuildDFG::isViscQueryIntrinsic(I)) { + IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; + + /*********************************************************************** + * Handle VISC Query intrinsics * + ***********************************************************************/ + switch (II->getIntrinsicID()) { + /**************************** llvm.visc.getNode() *******************/ + case Intrinsic::visc_getNode: { + // add mapping <intrinsic, this node> to the node-specific map + Leaf_HandleToDFNodeMap[II] = N; + IItoRemove.push_back(II); + break; + } + /************************* llvm.visc.getParentNode() ****************/ + case Intrinsic::visc_getParentNode: { + // get the parent node of the arg node + // get argument node + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + // get the parent node of the arg node + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // Add mapping <intrinsic, parent node> to the node-specific map + // the argument node must have been added to the map, orelse the + // code could not refer to it + Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); + IItoRemove.push_back(II); + break; + } + /*************************** llvm.visc.getNumDims() *****************/ + case Intrinsic::visc_getNumDims: { + // get node from map + // get the appropriate field + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim(); + IntegerType* IntTy = 
Type::getInt32Ty(M.getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); + + II->replaceAllUsesWith(numOfDimConstant); + IItoRemove.push_back(II); + break; + } + /*********************** llvm.visc.getNodeInstanceID() **************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + case Intrinsic::visc_getNodeInstanceID_z: { + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_X86 + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); + + if(parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 + (3-dim); + // Traverse argument list of F_X86 in reverse order to find the + // correct index or dim argument. + Argument* indexVal = getArgumentFromEnd(F_X86, offset); + assert(indexVal && "Index argument not found. 
Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *indexVal << "\n"); + + II->replaceAllUsesWith(indexVal); + IItoRemove.push_back(II); + } + else { + // Case when query is for an ancestor + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance, + ArrayRef<Value*>(args, 2), + "nodeInstanceID", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + break; + } + /********************** llvm.visc.getNumNodeInstances() *************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { + + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNumNodeInstances_[xyz]. 
Check Intrinsic ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_X86 + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); + + if(parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 - dim; + // Traverse argument list of F_X86 in reverse order to find the + // correct index or dim argument. + Argument* limitVal = getArgumentFromEnd(F_X86, offset); + assert(limitVal && "Limit argument not found. Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *limitVal << "\n"); + + II->replaceAllUsesWith(limitVal); + IItoRemove.push_back(II); + } + else { + // Case when query is from the ancestor + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit, + ArrayRef<Value*>(args, 2), + "numNodeInstances", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + + break; + } + default: + DEBUG(errs() << "Found unknown intrinsic with ID = " << + II->getIntrinsicID() << "\n"); + assert(false && "Unknown VISC Intrinsic!"); + break; + } + + } else { + //TODO: how to handle address space qualifiers in load/store + } + + } + + //TODO: + // When to replace the uses? + // In which order is it safe to replace the instructions in + // IItoReplace? 
+ // Probably in the reverse order in the vectors + // It is a good idea to have them in one vector and chech the type + // using dyn_cast in order to determine if we replace with inst or value + + + //TODO: maybe leave these instructions to be removed by a later DCE pass + for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin(); + i != IItoRemove.end(); ++i) { + (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType())); + (*i)->eraseFromParent(); + } + + DEBUG(errs() << *F_X86); +} + +} // End of namespace + +char DFG2LLVM_X86::ID = 0; +static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86", + "Dataflow Graph to LLVM for X86 backend", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); + diff --git a/lib/DFG2LLVM_X86/DFG2LLVM_X86.exports b/lib/DFG2LLVM_X86/DFG2LLVM_X86.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/DFG2LLVM_X86/LLVMBuild.txt b/lib/DFG2LLVM_X86/LLVMBuild.txt new file mode 100644 index 0000000000..1e82065bf0 --- /dev/null +++ b/lib/DFG2LLVM_X86/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/DFG2LLVM_X86/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_X86 +parent = Transforms diff --git a/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt b/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt new file mode 100644 index 0000000000..75569addda --- /dev/null +++ b/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt @@ -0,0 +1,13 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( DFG2LLVM_X86_dsoc + DFG2LLVM_X86_dsoc.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) + diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp new file mode 100644 index 0000000000..fbe5e4f6bd --- /dev/null +++ b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp @@ -0,0 +1,2128 @@ +//===-------------------------- DFG2LLVM_X86.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "DFG2LLVM_X86" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Constant.h" +#include "llvm/SupportVISC/DFG2LLVM.h" + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +// VISC Command line option to use timer or not +static cl::opt<bool> +VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers")); +// Command line option to enable device abstraction or not +static cl::opt<bool> +DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, + cl::desc("Enable visc device abstraction")); + + +namespace { + +// Helper Functions +static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { + if (!isa<CallInst>(I)) + return false; + CallInst *CI = cast<CallInst>(I); + return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion"); +} + +CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { + for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) { + Instruction *I = &*ib; + if (isVISCCall_llvm_visc_policy_getVersion(I)) + return cast<CallInst>(I); + } + return NULL; +} + +// DFG2LLVM_X86 - The first implementation. 
+struct DFG2LLVM_X86 : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_X86() :DFG2LLVM(ID) {} + +private: + // Member variables + + // Functions + +public: + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_X86 : public CodeGenTraversal { + +private: + //Member variables + + Constant* malloc; + // VISC Runtime API + Constant* llvm_visc_x86_launch; + Constant* llvm_visc_x86_wait; + Constant* llvm_visc_x86_argument_ptr; + + Constant* llvm_visc_streamLaunch; + Constant* llvm_visc_streamPush; + Constant* llvm_visc_streamPop; + Constant* llvm_visc_streamWait; + Constant* llvm_visc_createBindInBuffer; + Constant* llvm_visc_createBindOutBuffer; + Constant* llvm_visc_createEdgeBuffer; + Constant* llvm_visc_createLastInputBuffer; + Constant* llvm_visc_createThread; + //Constant* llvm_visc_freeThreads; + Constant* llvm_visc_bufferPush; + Constant* llvm_visc_bufferPop; + Constant* llvm_visc_x86_dstack_push; + Constant* llvm_visc_x86_dstack_pop; + Constant* llvm_visc_x86_getDimLimit; + Constant* llvm_visc_x86_getDimInstance; + + //Functions + std::vector<IntrinsicInst*>* getUseList(Value* LI); + Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = ""); + void addDoWhileLoop(Instruction*, Instruction*, Value*); + void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*); + Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *); + Argument* getArgumentFromEnd(Function* F, unsigned offset); + Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, + Instruction* InsertBefore); + void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); + void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); + StructType* getArgumentListStructTy(DFNode*); + Function* createFunctionFilter(DFNode* C); + void startNodeThread(DFNode*, 
                    std::vector<Value*>, DenseMap<DFEdge*, Value*>,
                    Value*, Value*, Instruction*);
  Function* createLaunchFunction(DFInternalNode*);
  Function* createPushFunction(DFInternalNode*);
  Function* createPopFunction(DFInternalNode*);
  Function* createWaitFunction(DFInternalNode*);

  // Virtual Functions
  void init() {
    VISCTimer = VISCTimer_X86;
    TargetName = "X86";
  }
  void initRuntimeAPI();
  void codeGen(DFInternalNode* N);
  void codeGen(DFLeafNode* N);
  Function* codeGenStreamPush(DFInternalNode* N);
  Function* codeGenStreamPop(DFInternalNode* N);

public:
  // Constructor
  CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
    init();
    initRuntimeAPI();
  }

  void codeGenLaunch(DFInternalNode* Root);
  void codeGenLaunchStreaming(DFInternalNode* Root);
};

// Pass entry point: runs CPU (x86) code generation over every dataflow graph
// found by BuildDFG, then replaces the launch/wait/push/pop intrinsics of each
// root with calls into the visc runtime.
bool DFG2LLVM_X86::runOnModule(Module &M) {
  errs() << "\nDFG2LLVM_X86 PASS\n";

  // Get the BuildDFG Analysis Results:
  // - Dataflow graph
  // - Maps from i8* handles to DFNode and DFEdge
  BuildDFG &DFG = getAnalysis<BuildDFG>();

  //DFInternalNode *Root = DFG.getRoot();
  std::vector<DFInternalNode*> Roots = DFG.getRoots();
  // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
  // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();

  // Visitor for Code Generation Graph Traversal
  CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);

  // Iterate over all the DFGs and produce code for each one of them
  for (auto rootNode: Roots) {
    // Initiate code generation for root DFNode
    CGTVisitor->visit(rootNode);
    // Go ahead and replace the launch intrinsic with pthread call, otherwise return now.
    // TODO: Later on, we might like to do this in a separate pass, which would
    // allow us the flexibility to switch between complete static code generation
    // for DFG or having a customized runtime+scheduler

    // Do streaming code generation if root node is streaming. Usual (blocking
    // pthread launch) otherwise.
    if(rootNode->isChildGraphStreaming())
      CGTVisitor->codeGenLaunchStreaming(rootNode);
    else
      CGTVisitor->codeGenLaunch(rootNode);
  }

  delete CGTVisitor;
  return true;
}

// Initialize the VISC runtime API. This makes it easier to insert these calls.
// Loads the visc-rt IR module from disk, declares every runtime entry point
// this backend emits calls to, and instruments llvm.visc.init/cleanup with
// timer and scheduling-policy setup/teardown.
void CGT_X86::initRuntimeAPI() {

  // Load Runtime API Module
  SMDiagnostic Err;

  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");

  // FIXME: hardcoded path to 'build_dsoc' - should probably be a environment variable
  Twine llvmSrcRoot = LLVM_SRC_ROOT;
  Twine runtimeAPI = llvmSrcRoot+"/../build_dsoc/projects/visc-rt/visc-rt.ll";

  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());

  // NOTE(review): on failure this only logs under DEBUG and continues; later
  // unconditional uses of runtimeModule would then dereference null — verify
  // whether a hard error is intended here.
  if(runtimeModule == NULL)
    DEBUG(errs() << Err.getMessage());
  else
    DEBUG(errs() << "Successfully loaded visc-rt API module\n");

  // Get or insert the global declarations for launch/wait functions
  DECLARE(llvm_visc_x86_launch);
  DECLARE(malloc);
  DECLARE(llvm_visc_x86_wait);
  DECLARE(llvm_visc_x86_argument_ptr);
  DECLARE(llvm_visc_streamLaunch);
  DECLARE(llvm_visc_streamPush);
  DECLARE(llvm_visc_streamPop);
  DECLARE(llvm_visc_streamWait);
  DECLARE(llvm_visc_createBindInBuffer);
  DECLARE(llvm_visc_createBindOutBuffer);
  DECLARE(llvm_visc_createEdgeBuffer);
  DECLARE(llvm_visc_createLastInputBuffer);
  DECLARE(llvm_visc_createThread);
  //DECLARE(llvm_visc_freeThreads);
  DECLARE(llvm_visc_bufferPush);
  DECLARE(llvm_visc_bufferPop);
  DECLARE(llvm_visc_x86_dstack_push);
  DECLARE(llvm_visc_x86_dstack_pop);
  DECLARE(llvm_visc_x86_getDimLimit);
  DECLARE(llvm_visc_x86_getDimInstance);

  // Get or insert timerAPI functions as well if you plan to use timers
  initTimerAPI();

  // Insert init context in main
  Function* VI = M.getFunction("llvm.visc.init");
  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
  DEBUG(errs() << "Inserting x86 timer initialization\n");
  Instruction* I = cast<Instruction>(*VI->user_begin());
  initializeTimerSet(I);
  switchToTimer(visc_TimerID_NONE, I);
  // Insert code for initializing the scheduling policy
  Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init",
      runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()));
  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
  DEBUG(errs() << *IPCallInst << "\n");

  // If device abstraction is enabled, we add a runtime call to start the
  // device status simulation
  if (DeviceAbstraction) {
    Function *ID =
      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start",
          runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType()));
    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
    DEBUG(errs() << *IDCallInst << "\n");
  }

  // Insert print instruction at visc exit
  Function* VC = M.getFunction("llvm.visc.cleanup");
  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");

  // Insert code for clearing the scheduling policy
  I = cast<Instruction>(*VC->user_begin());
  IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear",
      runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()));
  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
  errs() << *IPCallInst << "\n";

  DEBUG(errs() << "Inserting x86 timer print\n");
  printTimerSet(I);

  // If device abstraction is enabled, we add a runtime call to end the
  // device status simulation
  if (DeviceAbstraction) {
    Function *ID =
      cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end",
          runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType()));
    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
    DEBUG(errs() << *IDCallInst << "\n");
  }

}

/* Returns vector of all wait instructions
 */
std::vector<IntrinsicInst*>*
CGT_X86::getUseList(Value* GraphID) {
  // NOTE(review): the returned vector is heap-allocated and the visible
  // callers never delete it — possible leak; confirm intended ownership.
  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
  // It must have been loaded from memory somewhere
  for(Value::user_iterator ui = GraphID->user_begin(),
      ue = GraphID->user_end(); ui!=ue; ++ui) {
    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
      UseList->push_back(waitI);
    }
    //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){
    //errs() << "Found PhiNode use of graphID\n";
    //std::vector<IntrinsicInst*>* phiUseList = getUseList(PN);
    //UseList->insert(UseList->end(), phiUseList->begin(), phiUseList->end());
    //free(phiUseList);
    //}
    else {
      // Any non-intrinsic user of a graph handle is unsupported.
      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
    }
  }
  return UseList;
}

/* Traverse the function argument list in reverse order to get argument at a
 * distance offset from the end of argument list of function F
 */
Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
         && "Invalid offset to access arguments!");
  Function::arg_iterator e = F->arg_end();
  // Last element of argument iterator is dummy. Skip it.
  // NOTE(review): arg_end() is one-past-the-end in LLVM; this decrement lands
  // on the last real argument — confirm the "dummy" comment matches intent.
  e--;
  Argument* arg;
  for( ; offset != 0; e--) {
    offset--;
    arg = &*e;
  }
  return arg;
}

/* Add Loop around the instruction I
 * Algorithm:
 * (1) Split the basic block of instruction I into three parts, where the
 * middleblock/body would contain instruction I.
 * (2) Add phi node before instruction I. Add incoming edge to phi node from
 * predecessor
 * (3) Add increment and compare instruction to index variable
 * (4) Replace terminator/branch instruction of body with conditional branch
 * which loops over body if true and goes to end if false
 * (5) Update phi node of body
 */
void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
                           Instruction* BodyEnd, Value* TerminationCond) {
  BasicBlock* Entry = CondBlockStart->getParent();
  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");

  // Replace the terminator instruction of conditional with new conditional
  // branch which goes to while.body if true and branches to while.end otherwise.
  // NOTE(review): operand order is (WhileEnd, WhileBody, TerminationCond), i.e.
  // a *true* condition exits the loop — callers must pass an exit condition.
  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
  ReplaceInstWithInst(CondBlock->getTerminator(), BI);

  // While Body should jump to condition block
  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
  ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);

}

// Inserts an i64 loop counter for a while-loop built by addWhileLoop: a phi in
// the condition block (0 from Entry, incremented value from Body) plus an NSW
// add in the body. Returns the phi node.
Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
                                          BasicBlock *Body) {
  Module *M = Entry->getParent()->getParent();
  Type *Int64Ty = Type::getInt64Ty(M->getContext());

  // Insert a PHI instruction at the beginning of the condition block
  Instruction *IB = Cond->getFirstNonPHI();
  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);

  ConstantInt *IConst =
    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
  Instruction *CounterIncr =
    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
                              "cnt_incr", Body->getTerminator());

  // Set incoming values for Phi node
  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
  CounterPhi->addIncoming(IConst, Entry);
  CounterPhi->addIncoming(CounterIncr, Body);

  // Return the pointer to the created PHI node in the corresponding argument
  return CounterPhi;
}

/* Add Loop around the instruction I
 * Algorithm:
 * (1) Split the basic block of instruction I into three parts, where the
 * middleblock/body would contain instruction I.
 * (2) Add phi node before instruction I. Add incoming edge to phi node from
 * predecessor
 * (3) Add increment and compare instruction to index variable
 * (4) Replace terminator/branch instruction of body with conditional branch
 * which loops over body if true and goes to end if false
 * (5) Update phi node of body
 */
void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) {
  BasicBlock* Entry = From->getParent();
  BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body");

  // To Instruction should also belong to the same basic block as the From basic
  // block will have a terminator instruction
  assert(To->getParent() == ForBody
         && "To Instruction should also belong to the same basic block!");
  BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end");

  // Replace the terminator instruction of for.body with new conditional
  // branch which loops over body if true and branches to for.end otherwise.
  // NOTE(review): here a *true* TerminationCond branches to for.end (exit).
  BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond);
  ReplaceInstWithInst(ForBody->getTerminator(), BI);

}

/* Add Loop around the instruction I
 * Algorithm:
 * (1) Split the basic block of instruction I into three parts, where the
 * middleblock/body would contain instruction I.
 * (2) Add phi node before instruction I. Add incoming edge to phi node from
 * predecessor
 * (3) Add increment and compare instruction to index variable
 * (4) Replace terminator/branch instruction of body with conditional branch
 * which loops over body if true and goes to end if false
 * (5) Update phi node of body
 */
Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
  BasicBlock* Entry = I->getParent();
  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");

  BasicBlock::iterator i(I);
  ++i;
  Instruction* NextI = &*i;
  // Next Instruction should also belong to the same basic block as the basic
  // block will have a terminator instruction
  assert(NextI->getParent() == ForBody
         && "Next Instruction should also belong to the same basic block!");
  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");


  // Add Phi Node for index variable
  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
                        2, "index."+indexName, I);

  // Add incoming edge to phi
  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
                        Entry);
  // Increment index variable
  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
      IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
      "index."+indexName+".inc", ForBody->getTerminator());

  // Compare index variable with limit (unsigned compare: index.inc < limit)
  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
                                  limit, "cond."+indexName, ForBody->getTerminator());

  // Replace the terminator instruction of for.body with new conditional
  // branch which loops over body if true and branches to for.end otherwise
  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
  ReplaceInstWithInst(ForBody->getTerminator(), BI);

  // Add incoming edge to phi node in body
  IndexPhi->addIncoming(IndexInc, ForBody);
  return IndexPhi;
}

// Returns a packed struct type.
// The structtype is created by packing the input
// types, output types and isLastInput buffer type. All the streaming
// inputs/outputs are converted to i8*, since this is the type of buffer
// handles.
StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
  std::vector<Type*> TyList;
  // Input types
  Function* CF = C->getFuncPointer();
  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
      ai != ae; ++ai) {
    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
      TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
    else
      TyList.push_back(ai->getType());
  }
  // Output Types
  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
  for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
    // All outputs of a node are streaming edge
    assert(C->getOutDFEdgeAt(i)->isStreamingEdge()
           && "All output edges of child node have to be streaming");
    TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
  }
  // isLastInput buffer element
  TyList.push_back(Type::getInt8PtrTy(CF->getContext()));

  // Packed (isPacked=true) so the runtime can index fields byte-exactly.
  StructType* STy = StructType::create(CF->getContext(), TyList,
                      Twine("struct.thread."+CF->getName()).str(), true);
  return STy;

}

// Creates the pipeline/filter function for child node C, marshals C's inputs,
// output buffers and the isLastInput buffer into a heap-allocated packed
// struct, and asks the runtime to start a thread running the filter.
// NOTE(review): the malloc'd argument struct appears to be handed off to the
// created thread; no free is visible here — confirm the runtime owns it.
void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
                              Instruction* IB) {
  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
  // Create a filter/pipeline function for the child node
  Function* C_Pipeline = createFunctionFilter(C);
  Function* CF = C->getFuncPointer();

  // Get module context and i32 0 constant, as they would be frequently used in
  // this function.
  LLVMContext& Ctx = IB->getParent()->getContext();
  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);

  // Marshall arguments
  // Create a packed struct type with inputs of C followed by outputs and then
  // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
  // by i8*
  //
  StructType* STy = getArgumentListStructTy(C);
  // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
                                  C->getFuncPointer()->getName()+".inputs", IB);
  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
  // Insert elements in the struct
  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
  // Marshall Inputs
  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
    // Create constant int (i)
    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
    // Get Element pointer instruction
    Value* GEPIndices[] = { IntZero, Int_i };
    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
                               ArrayRef<Value*>(GEPIndices, 2),
                               Struct->getName()+".arg_"+Twine(i),
                               IB);
    DFEdge* E = C->getInDFEdgeAt(i);
    if (E->getSourceDF()->isEntryNode()) {
      // This is a Bind Input Edge
      if(E->isStreamingEdge()) {
        // Streaming Bind Input edge. Get buffer corresponding to it
        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
        new StoreInst(EdgeBufferMap[E], GEP, IB);
      }
      else {
        // Non-streaming Bind edge
        new StoreInst(Args[i], GEP, IB);
      }
    }
    else {
      // This is an edge between siblings.
      // This must be an streaming edge. As it is our assumption that all edges
      // between two nodes in a DFG are streaming.
      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
      new StoreInst(EdgeBufferMap[E], GEP, IB);
    }
  }
  unsigned numInputs = CF->getFunctionType()->getNumParams();
  unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
  // Marshall Outputs
  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
  for(unsigned i = 0; i < numOutputs; i++ ) {
    // Create constant int (i+numInputs)
    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
    // Get Element pointer instruction
    Value* GEPIndices[] = { IntZero, Int_i };
    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
                               ArrayRef<Value*>(GEPIndices, 2),
                               Struct->getName()+".out_"+Twine(i),
                               IB);
    DFEdge* E = C->getOutDFEdgeAt(i);
    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
    new StoreInst(EdgeBufferMap[E], GEP, IB);
  }
  // Marshall last argument. isLastInput buffer
  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
  // Create constant int (numInputs+numOutputs) — last field of the struct
  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
  // Get Element pointer instruction
  Value* GEPIndices[] = { IntZero, Int_index };
  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
                             ArrayRef<Value*>(GEPIndices, 2),
                             Struct->getName()+".isLastInput", IB);
  new StoreInst(isLastInputBuffer, GEP, IB);

  // AllocaInst AI points to memory with all the arguments packed
  // Call runtime to create the thread with these arguments
  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
  DEBUG(errs() << *llvm_visc_createThread << "\n");
  DEBUG(errs() << *graphID->getType() << "\n");
  DEBUG(errs() << *C_Pipeline->getType() << "\n");
  DEBUG(errs() << *Struct->getType() << "\n");
  // Bitcast AI to i8*
  CastInst* BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
  CallInst* CreateThread = CallInst::Create(llvm_visc_createThread,
                             ArrayRef<Value*>(CreateThreadArgs, 3),
                             "",
                             IB);

}

Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
  DEBUG(errs() << "Generating Streaming Launch Function\n");
  // Get Function associated with Node N
  Function* NF = N->getFuncPointer();

  // Map from Streaming edge to buffer
  DenseMap<DFEdge*, Value*> EdgeBufferMap;

  /* Now we have all the necessary global declarations necessary to generate the
   * Launch function, pointer to which can be passed to pthread utils to execute
   * DFG. The Launch function has just one input: i8* data.addr
   * This is the address of the all the input data that needs to be passed to
   * this function. In our case it contains the input arguments of the Root
   * function in the correct order.
+ * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) + * (2) Extract each of inputs from data.addr + * (3) create Buffers for all the streaming edges + * - Put buffers in the context + * (4) Go over each child node + * - marshall its arguments together (use buffers in place of streaming + * arguments) + * - Start the threads + * (5) The return value from Root is stored in memory, pointer to which is + * passed to pthread_exit call. + */ + // (1) Create Launch Function of type void (i8* args, i8* GraphID) + Type* i8Ty = Type::getInt8Ty(M.getContext()); + Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; + FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()), + ArrayRef<Type*>(ArgTypes, 2), false); + Function* LaunchFunc = Function::Create(LaunchFuncTy, + NF->getLinkage(), + NF->getName()+".LaunchFunction", + &M); + DEBUG(errs() << "Generating Code for Streaming Launch Function\n"); + // Give a name to the argument which is used pass data to this thread + Argument* data = &*LaunchFunc->arg_begin(); + Argument* graphID = &*(++LaunchFunc->arg_begin()); + data->setName("data.addr"); + graphID->setName("graphID"); + // Add a basic block to this empty function and a return null statement to it + DEBUG(errs() << *LaunchFunc->getReturnType() << "\n"); + BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); + ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(), + BB); + + DEBUG(errs() << "Created Empty Launch Function\n"); + + // (2) Extract each of inputs from data.addr + std::vector<Type*> TyList; + std::vector<std::string> names; + std::vector<Value*> Args; + + for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end(); + ai != ae; ++ai) { + if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) { + TyList.push_back(i8Ty->getPointerTo()); + names.push_back(Twine(ai->getName()+"_buffer").str()); + continue; + } + 
TyList.push_back(ai->getType()); + names.push_back(ai->getName()); + } + Args = extractElements(data, TyList, names, RI); + DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc << "\n"); + // (3) Create buffers for all the streaming edges + for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), + de = N->getChildGraph()->dfedge_end(); di != de; ++di) { + DFEdge* Edge = *di; + DEBUG(errs() << *Edge->getType() << "\n"); + Value* size = ConstantExpr::getSizeOf(Edge->getType()); + Value* CallArgs[] = {graphID, size}; + if (Edge->isStreamingEdge()) { + CallInst* CI; + // Create a buffer call + if(Edge->getSourceDF()->isEntryNode()) { + // Bind Input Edge + Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()), + Edge->getSourcePosition()); + Value* BindInCallArgs[] = {graphID, size, Int_ArgNo}; + CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3), + "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(), + RI); + } + else if(Edge->getDestDF()->isExitNode()) { + // Bind Output Edge + CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2), + "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(), + RI); + } + else { + // Streaming Edge + CI = CallInst::Create(llvm_visc_createEdgeBuffer, + ArrayRef<Value*>(CallArgs, 2), + Edge->getSourceDF()->getFuncPointer()->getName()+"." 
+ +Edge->getDestDF()->getFuncPointer()->getName(), + RI); + } + EdgeBufferMap[Edge] = CI; + } + } + // Create buffer for isLastInput for all the child nodes + DFGraph* G = N->getChildGraph(); + DenseMap<DFNode*, Value*> NodeLastInputMap; + for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) { + DFNode* child = *ci; + if(child->isDummyNode()) + continue; + Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); + Value* CallArgs[] = {graphID, size}; + CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2), + "BindIn.isLastInput."+child->getFuncPointer()->getName(), + RI); + NodeLastInputMap[child] = CI; + } + DEBUG(errs() << "Start Each child node filter\n"); + // (4) Marshall arguments for each child node and start the thread with its + // pipeline funtion + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + // Marshall all the arguments for this node into an i8* + // Pass to the runtime to create the thread + // Start the thread for child node C + startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI); + } + + DEBUG(errs() << "Launch function:\n"); + DEBUG(errs() << *LaunchFunc << "\n"); + + return LaunchFunc; +} + + +Function* CGT_X86::createPushFunction(DFInternalNode* N) { + DEBUG(errs() << "Generating Push function\n"); + Function* PushFunc; + return PushFunc; +} + +Function* CGT_X86::createPopFunction(DFInternalNode* N) { + DEBUG(errs() << "Generating Pop function\n"); + Function* PushFunc; + return PushFunc; +} + +Function* CGT_X86::createWaitFunction(DFInternalNode* N) { + DEBUG(errs() << "Generating Wait function\n"); + Function* PushFunc; + return PushFunc; +} +/* This fuction does the steps necessary to launch a streaming graph + * Steps + * Create Pipeline/Filter function for each node in child graph 
 * of Root
 * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait
 * Modify each of the intrinsic in host code
 * Launch, Push, Pop, Wait
 */
void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
  IntrinsicInst* LI = Root->getInstruction();
  Function* RootLaunch = createLaunchFunction(Root);
  //Function* RootPush = createPushFunction(Root);
  //Function* RootPop = createPopFunction(Root);
  //Function* RootWait = createWaitFunction(Root);
  // Substitute launch intrinsic main
  DEBUG(errs() << "Substitute launch intrinsic\n");
  Value* LaunchInstArgs[] = {RootLaunch,
                             LI->getArgOperand(1)
                            };
  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
                                          ArrayRef<Value*>(LaunchInstArgs,2),
                                          "graph"+Root->getFuncPointer()->getName(), LI);
  //ReplaceInstWithInst(LI, LaunchInst);

  DEBUG(errs() << *LaunchInst << "\n");
  // Replace all wait instructions with x86 specific wait instructions.
  // NOTE(review): UseList is heap-allocated by getUseList and not freed here.
  DEBUG(errs() << "Substitute wait, push, pop intrinsics\n");
  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
  for(unsigned i=0; i < UseList->size(); ++i) {
    IntrinsicInst* II = UseList->at(i);
    CallInst* CI;
    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
    switch(II->getIntrinsicID()) {
    case Intrinsic::visc_wait:
      CI = CallInst::Create(llvm_visc_streamWait,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    case Intrinsic::visc_push:
      CI = CallInst::Create(llvm_visc_streamPush,
                            ArrayRef<Value*>(PushArgs, 2),
                            "");
      break;
    case Intrinsic::visc_pop:
      CI = CallInst::Create(llvm_visc_streamPop,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    default:
      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
    };
    DEBUG(errs() << "Replace:\n\t" << *II << "\n");
    ReplaceInstWithInst(II, CI);
    DEBUG(errs() << "\twith " << *CI << "\n");
  }


}

void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
  // TODO: Place an assert to check if the constant passed by launch intrinsic
  // as the number of arguments to DFG is same as the number of arguments of the
  // root of DFG
  DEBUG(errs() << "Generating Launch Function\n");
  // Get Launch Instruction
  IntrinsicInst* LI = Root->getInstruction();
  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
  DEBUG(errs() << "Generating Launch Function\n");

  /* Now we have all the necessary global declarations necessary to generate the
   * Launch function, pointer to which can be passed to pthread utils to execute
   * DFG. The Launch function has just one input: i8* data.addr
   * This is the address of the all the input data that needs to be passed to
   * this function. In our case it contains the input arguments of the Root
   * function in the correct order.
   * (1) Create an empty Launch function of type i8*(i8*)
   * (2) Extract each of inputs from data.addr and pass them as arguments to the
   * call to Root function
   * (3) The return value from Root is stored in memory, pointer to which is
   * passed to pthread_exit call.
   */
  // Create Launch Function of type i8*(i8*) which calls the root function
  Type* i8Ty = Type::getInt8Ty(M.getContext());
  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
                              ArrayRef<Type*>(i8Ty->getPointerTo()),
                              false);
  Function* AppFunc = Function::Create(AppFuncTy,
                                       Root->getFuncPointer()->getLinkage(),
                                       "LaunchDataflowGraph",
                                       &M);
  DEBUG(errs() << "Generating Launch Function\n");
  // Give a name to the argument which is used pass data to this thread
  Value* data = &*AppFunc->arg_begin();
  data->setName("data.addr");
  // Add a basic block to this empty function and a return null statement to it
  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
                                      Constant::getNullValue(AppFunc->getReturnType()),
                                      BB);
  switchToTimer(visc_TimerID_ARG_UNPACK, RI);

  DEBUG(errs() << "Created Empty Launch Function\n");
  // Find the X86 function generated for Root and
//  Function* RootF_X86 = Root->getGenFunc();
  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
  assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
         "Error: Generated Function for Root node with no x86 wrapper\n");

  // Generate a call to RootF_X86 with null parameters for now; real arguments
  // are patched in below once they are extracted from data.addr.
  std::vector<Value*>Args;
  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
  }
  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);

  // Extract input data from i8* data.addr and patch them to correct argument of
  // call to RootF_X86. For each argument
  std::vector<Type*> TyList;
  std::vector<std::string> names;
  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
      ai != ae; ++ai) {
    TyList.push_back(ai->getType());
    names.push_back(ai->getName());
  }
  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
  // Patch the elements to the call arguments
  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
    CI->setArgOperand(i, elements[i]);

  // Add timers around Call to RootF_X86 function
  switchToTimer(visc_TimerID_COMPUTATION, CI);
  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);

  // Code for returning the output: store the call result back through
  // data.addr so the caller can read it after the thread joins.
  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
                               CI->getType()->getPointerTo(),
                               CI->getName()+".addr",
                               RI);
  new StoreInst(CI, OutputAddrCast, RI);
  switchToTimer(visc_TimerID_NONE, RI);

  DEBUG(errs() << "Application specific function:\n");
  DEBUG(errs() << *AppFunc << "\n");

  // Substitute launch intrinsic main
  Value* LaunchInstArgs[] = {AppFunc,
                             LI->getArgOperand(1)
                            };
  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
                                          ArrayRef<Value*>(LaunchInstArgs,2),
                                          "graph"+Root->getFuncPointer()->getName(), LI);
  //ReplaceInstWithInst(LI, LaunchInst);

  DEBUG(errs() << *LaunchInst << "\n");
  // Replace all wait instructions with x86 specific wait instructions.
  // NOTE(review): UseList is heap-allocated by getUseList and not freed here.
  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
  for(unsigned i=0; i < UseList->size(); ++i) {
    IntrinsicInst* II = UseList->at(i);
    CallInst* CI;
    switch(II->getIntrinsicID()) {
    case Intrinsic::visc_wait:
      CI = CallInst::Create(llvm_visc_x86_wait,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    case Intrinsic::visc_push:
      CI = CallInst::Create(llvm_visc_bufferPush,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    case Intrinsic::visc_pop:
      CI = CallInst::Create(llvm_visc_bufferPop,
                            ArrayRef<Value*>(LaunchInst),
                            "");
      break;
    default:
      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
    };
    ReplaceInstWithInst(II, CI);
    DEBUG(errs() << *CI << "\n");
  }

}

// Returns the Value feeding input port i of Child: either the parent
// function's argument (edge from entry/dummy node) or an extractvalue from the
// already-generated sibling call recorded in OutputMap.
Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
  // TODO: Assumption is that each input port of a node has just one
  // incoming edge. May change later on.

  // Find the incoming edge at the requested input port
  DFEdge* E = Child->getInDFEdgeAt(i);
  assert(E && "No incoming edge or binding for input element!");
  // Find the Source DFNode associated with the incoming edge
  DFNode* SrcDF = E->getSourceDF();

  // If Source DFNode is a dummyNode, edge is from parent. Get the
  // argument from argument list of this internal node
  Value* inputVal;
  if(SrcDF->isEntryNode()) {
    inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
    DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n");
  }
  else {
    // edge is from a sibling
    // Check - code should already be generated for this source dfnode
    assert(OutputMap.count(SrcDF)
           && "Source node call not found. Dependency violation!");

    // Find CallInst associated with the Source DFNode using OutputMap
    Value* CI = OutputMap[SrcDF];

    // Extract element at source position from this call instruction
    std::vector<unsigned> IndexList;
    IndexList.push_back(E->getSourcePosition());
    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
                             "", InsertBefore);
    inputVal = EI;
  }
  return inputVal;
}

// Emits the call to child node C's generated CPU function inside F_X86,
// wrapping it in replication loops and bracketing it with the runtime's
// dimension-stack push/pop.
void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
                              ValueToValueMapTy &VMap,Instruction* IB) {
  Function* CF = C->getFuncPointer();

//  Function* CF_X86 = C->getGenFunc();
  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
  assert(CF_X86 != NULL
         && "Found leaf node for which code generation has not happened yet!\n");
  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
         "The generated function to be called from x86 backend is not an x86 function\n");
  DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");

  std::vector<Value*> Args;
  // Create argument list to pass to call instruction
  // First find the correct values using the edges
  // The remaining six values (3 index vars + 3 dim limits) are inserted as
  // constants for now and patched with loop index/limit values below.
  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
    Args.push_back(getInValueAt(C, i, F_X86, IB));
  }

  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
  for(unsigned j=0; j<6; j++)
    Args.push_back(I64Zero);

  errs() << "Gen Function type: " << *CF_X86->getType() << "\n";
  errs() << "Node Function type: " << *CF->getType() << "\n";
  errs() << "Arguments: " << Args.size() << "\n";

  // Call the F_X86 function associated with this node
  CallInst* CI = CallInst::Create(CF_X86, Args,
                                  CF_X86->getName()+"_output",
                                  IB);
  DEBUG(errs() << *CI << "\n");
  OutputMap[C] = CI;

  // Find num of dimensions this node is replicated in.
  // Based on number of dimensions, insert loop instructions
  std::string varNames[3] = {"x", "y", "z"};
  unsigned numArgs = CI->getNumArgOperands();
  for(unsigned j=0; j < C->getNumOfDim(); j++) {
    Value* indexLimit = NULL;
    // Limit can either be a constant or an argument of the internal node.
    // In case of constant we can use that constant value directly in the
    // new F_X86 function. In case of an argument, we need to get the mapped
    // value using VMap
    if(isa<Constant>(C->getDimLimits()[j])) {
      indexLimit = C->getDimLimits()[j];
      DEBUG(errs() << "In Constant case:\n"
            << " indexLimit type = " << *indexLimit->getType() << "\n");
    }
    else {
      indexLimit = VMap[C->getDimLimits()[j]];
      DEBUG(errs() << "In VMap case:"
            <<" indexLimit type = " << *indexLimit->getType() << "\n");
    }
    assert(indexLimit && "Invalid dimension limit!");
    // Insert loop
    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
    // Insert index variable and limit arguments: the last six call operands
    // are (iX, iY, iZ, limitX, limitY, limitZ).
    CI->setArgOperand(numArgs-6+j, indexVar);
    CI->setArgOperand(numArgs-3+j, indexLimit);
  }
  // Insert call to runtime to push the dim limits and instanceID on the depth
  // stack
  Value* args[] = {
    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
    CI->getArgOperand(numArgs-3+0), // limitX
    CI->getArgOperand(numArgs-6+0), // iX
    CI->getArgOperand(numArgs-3+1), // limitY
    CI->getArgOperand(numArgs-6+1), // iY
    CI->getArgOperand(numArgs-3+2), // limitZ
    CI->getArgOperand(numArgs-6+2)  // iZ
  };

  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
  DEBUG(errs() << "Push on stack: " << *Push << "\n");
  // Insert call to runtime to pop the dim limits and instanceID from the depth
  // stack
  BasicBlock::iterator i(CI);
  ++i;
  Instruction* NextI = &*i;
  // Next Instruction should also belong to the same basic block as the basic
  // block will have a terminator instruction
  assert(NextI->getParent() == CI->getParent()
         && "Next Instruction should also belong to the same basic block!");

  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
  DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
  DEBUG(errs() << *CI->getParent()->getParent());
}

/* This function takes a DFNode, and creates a filter function for it. By filter
 * function we mean a function which keeps on getting input from input buffers,
 * applying the function on the inputs and then pushes data on output buffers
 */
// Create a function with void* (void*) type.
// Create a new basic block
// Add a return instruction to the basic block
// extract arguments from the aggregate data input. Type list would be
// Replace the streaming inputs with i8* types signifying handle to
// corresponding buffers
// Add a boolean argument isLastInput
// Add runtime API calls to get input for each of the streaming inputs
// Add a call to the generated function of the child node
// Add runtime API calls to push output for each of the streaming outputs
// Add loop around the basic block, which exits the loop if isLastInput is false

Function* CGT_X86::createFunctionFilter(DFNode* C) {
  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");

  /* Create a function with same argument list as child.*/
  DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
  // Get the generated function for child node
  Function* CF = C->getFuncPointer();
  // Create Filter Function of type i8*(i8*) which calls the root function
  Type* i8Ty = Type::getInt8Ty(M.getContext());
  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
                                  ArrayRef<Type*>(i8Ty->getPointerTo()),
                                  false);
  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
                                           CF->getLinkage(),
                                           CF->getName()+"_Pipeline",
                                           &M);
  DEBUG(errs() << "Generating Pipline Function\n");
  // Give a name to the argument which is used pass data to this thread
  Value* data = &*CF_Pipeline->arg_begin();
  data->setName("data.addr");
  // Create a new basic block
  DEBUG(errs() << "\tCreate new BB and add a return function\n");
  // Add a basic block to this empty function
  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
  // Add a return instruction to the basic block
  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
                     UndefValue::get(CF_Pipeline->getReturnType()), BB);


  /* Extract the elements from the aggregate argument to the function.
   * Replace the streaming inputs with i8* types signifying handle to
   * corresponding buffers
   * Add outputs to the list as well
   * Add isLastInput to the list
   */
  DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
  // These Args will be used when passing arguments to the generated function
  // inside loop, and reading outputs as well.
  std::vector<Value*> Args;
  std::vector<Type*> TyList;
  std::vector<std::string> names;
  // Adding inputs
  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
       i != e; ++i) {
    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
      TyList.push_back(i8Ty->getPointerTo());
      names.push_back((Twine(i->getName())+"_buffer").str());
    }
    else {
      TyList.push_back(i->getType());
      names.push_back(i->getName());
    }
  }
  // Adding outputs.
FIXME: Since we assume all outputs to be streaming edges, + // because we get there buffer handles + StructType* RetTy = cast<StructType>(CF->getReturnType()); + for (unsigned i=0; i<RetTy->getNumElements(); i++) { + TyList.push_back(i8Ty->getPointerTo()); + names.push_back("out"); + } + /* Add a boolean argument isLastInput */ + DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n"); + TyList.push_back(i8Ty->getPointerTo()); + names.push_back("isLastInput_buffer"); + + // Extract the inputs, outputs and + Args = extractElements(data, TyList, names, RI); + for(unsigned i=0; i<Args.size(); i++) { + DEBUG(errs() << *Args[i] << "\n"); + } + + // Split the Args vector into, input output and isLastInput + unsigned numInputs = CF->getFunctionType()->getNumParams(); + unsigned numOutputs = RetTy->getNumElements(); + std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs); + std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs); + Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]); + + /* Add runtime API calls to get input for each of the streaming input edges */ + DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n"); + // First read the termination condition variable islastInput + CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value*>(isLastInput), + "", + RI); + + CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop, + Type::getInt64Ty(CF_Pipeline->getContext()), + false, + "isLastInput", + RI); + isLastInput = BI; + // Create a loop termination condition + CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, + isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero", + RI); + + // Get input from buffers of all the incoming streaming edges + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + 
if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { + CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop, + ArrayRef<Value*>(InputArgs[i->getArgNo()]), + "", + RI); + CastInst* BI; + if(i->getType()->isPointerTy()) { + BI = CastInst::Create(CastInst::IntToPtr, + bufferIn, + i->getType(), + i->getName()+".addr", + RI); + } + else if(i->getType()->isFloatTy()) { + BI = CastInst::CreateFPCast(bufferIn, + i->getType(), + i->getName()+".addr", + RI); + } + else { + BI = CastInst::CreateIntegerCast(bufferIn, + i->getType(), + false, + i->getName()+".addr", + RI); + } + // Replace the argument in Args vector. We would be using the vector as + // parameters passed to the call + InputArgs[i->getArgNo()] = BI; + } + } + /* Add a call to the generated function of the child node */ + DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); +// DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); +// CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, +// C->getGenFunc()->getName()+".output", RI); + Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); + DEBUG(errs() << "Type: " + << *CGenF->getType() + << "\n"); + CallInst* CI = CallInst::Create(CGenF, + InputArgs, + CGenF->getName()+".output", + RI); + + /* Add runtime API calls to push output for each of the streaming outputs */ + // FIXME: Assumption + // All edges between siblings are streaming edges + DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n"); + for (unsigned i=0; i< numOutputs; i++) { + // Extract output + ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), + "",RI); + // Convert to i64 + CastInst* BI; + if(EI->getType()->isPointerTy()) + BI = CastInst::Create(CastInst::PtrToInt,EI, + Type::getInt64Ty(CF_Pipeline->getContext()), + "", + RI); + else + BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()), + false, "", RI); + // Push to Output buffer + Value* 
bufferOutArgs[] = {OutputArgs[i], BI}; + CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush, + ArrayRef<Value*>(bufferOutArgs, 2), + "", + RI); + } + + // Add loop around the basic block, which exits the loop if isLastInput is false + //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond); +// addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(), +// RI, Cond); + + // Add loop around the basic block, which exits the loop if isLastInput is false + // Pointers to keep the created loop structure + BasicBlock *EntryBB, *CondBB, *BodyBB; + Instruction *CondStartI = cast<Instruction>(isLastInputPop); + Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode(); + EntryBB = CondStartI->getParent(); + + addWhileLoop(CondStartI, BodyStartI, RI, Cond); + CondBB = CondStartI->getParent(); + BodyBB = CI->getParent(); + Instruction *CntI = NULL; + CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF); + + // If the node function calls the visc runtime call to get policy, we update + // it with the counter information. 
This means we need to pass an additional + // argument to the generated function, that is the iteration number, and then + // use it as an argument to the policy_getVersion call + if (GetPolicyCI) { + CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB); + assert(CntI && "Counter instruction not found\n"); + + // Create new function type (with additional argument for iteration number) + Type *NewRetTy = CGenF->getFunctionType()->getReturnType(); + std::vector<Type*> NewArgTypes; + for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end(); + ai != ae ; ++ai) { + NewArgTypes.push_back(ai->getType()); + } + NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); + FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false); + Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false); + // At least one (the last) argument exists (we added it) + Function::arg_iterator ae = NewCGenF->arg_end(); + --ae; + Argument *CntArg = &*ae; + CntArg->setName("iteration"); + // Replace the old cpu gen func with this one + C->addGenFunc(NewCGenF, visc::CPU_TARGET, true); + + // Add counter to the actual parameter list, to create the new call + InputArgs.push_back(CntI); + CallInst* newCI = CallInst::Create(NewCGenF, + InputArgs, + NewCGenF->getName()+".output"); + ReplaceInstWithInst(CI, newCI); + + // Set second operand of the policy_getVersion call to the last function + // argument + GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF); + GetPolicyCI->setArgOperand(1, CntArg); + } + + // Return the Function pointer + DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n"); + DEBUG(errs() << *CF_Pipeline << "\n"); + return CF_Pipeline; +} + +void CGT_X86::codeGen(DFInternalNode* N) { + // Check if N is root node and its graph is streaming. We do not do codeGen + // for Root in such a case + if(N->isRoot() && N->isChildGraphStreaming()) + return; + + // Check if clone already exists. 
If it does, it means we have visited this + // function before and nothing else needs to be done for this leaf node. +// if(N->getGenFunc() != NULL) +// return; + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; + return; + } + + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + "Error: Visiting a node for which code already generated\n"); + + // Sort children in topological order before code generation + N->getChildGraph()->sortChildren(); + + // Only process if all children have a CPU x86 function + // Otherwise skip to end + bool codeGen = true; + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) { + errs() << "No CPU x86 version for child node " + << C->getFuncPointer()->getName() + << "\n Skip code gen for parent node " + << N->getFuncPointer()->getName() << "\n"; + codeGen = false; + } + } + + if (codeGen) { + Function* F = N->getFuncPointer(); + // Create of clone of F with no instructions. Only the type is the same as F + // without the extra arguments. + Function* F_X86; + + // Clone the function, if we are seeing this function for the first time. We + // only need a clone in terms of type. + ValueToValueMapTy VMap; + + // Create new function with the same type + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + + // Loop over the arguments, copying the names of arguments over. + Function::arg_iterator dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); // Copy the name over... 
+ // Increment dest iterator + ++dest_iterator; + } + + // Add a basic block to this empty function + BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86); + ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), + UndefValue::get(F_X86->getReturnType()), BB); + + // Add Index and Dim arguments except for the root node and the child graph of + // parent node is not streaming + if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + F_X86 = addIdxDimArgs(F_X86); + + BB = &*F_X86->begin(); + RI = cast<ReturnInst>(BB->getTerminator()); + + //Add generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); + + // Loop over the arguments, to create the VMap. + dest_iterator = F_X86->arg_begin(); + for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); + i != e; ++i) { + // Add mapping and increment dest iterator + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + + // Iterate over children in topological order + for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), + ce = N->getChildGraph()->end(); ci != ce; ++ci) { + DFNode* C = *ci; + // Skip dummy node call + if (C->isDummyNode()) + continue; + + // Create calls to CPU function of child node + invokeChild_X86(C, F_X86, VMap, RI); + + } + + DEBUG(errs() << "*** Generating epilogue code for the function****\n"); + // Generate code for output bindings + // Get Exit node + DFNode* C = N->getChildGraph()->getExit(); + // Get OutputType of this node + StructType* OutTy = N->getOutputType(); + Value *retVal = UndefValue::get(F_X86->getReturnType()); + // Find all the input edges to exit node + for (unsigned i=0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); + // Find the incoming edge at the requested input port + DFEdge* E = C->getInDFEdgeAt(i); + + assert(E && "No Binding for output element!"); + // Find the Source DFNode associated with the incoming edge + 
DFNode* SrcDF = E->getSourceDF(); + + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(F_X86, i); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a internal node + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); + + // Find Output Value associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); + inputVal = EI; + } + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + } + DEBUG(errs() << "Extracted all\n"); + retVal->setName("output"); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); + + } + + //-------------------------------------------------------------------------// + // Here, we need to check if this node (N) has more than one versions + // If so, we query the policy and have a call to each version + // If not, we see which version exists, check that it is in fact an x86 + // function and save it as the CPU_TARGET function + + // TODO: visc_id per node, so we can use this for id for policies + // For now, use node function name and change it later + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool 
CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + + if (N->getTag() == visc::None) { + // No code is available for this node. This (usually) means that this + // node is a node that + // - from the accelerator backends has been mapped to an intermediate + // node, and thus they have not produced a genFunc + // - a child node had no CPU hint, thus no code gen for CPU could + // take place + errs() << "No GenFunc - Skipping CPU code generation for node " + << N->getFuncPointer()->getName() << "\n"; + } else if (viscUtils::isSingleTargetTag(N->getTag())) { + // There is a single version for this node according to code gen hints. + // Therefore, we do not need to check the policy, we simply use the + // available implementation, whichever target it is for. 
+ + // Sanity check - to be removed TODO + switch (N->getTag()) { + case visc::CPU_TARGET: + assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::GPU_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::SPIR_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && ""); + break; + default: + assert(false && "Unreachable: we checked that tag was single target!\n"); + break; + } + + // If device abstraction is enabled, then we may need to edit the node + // function. 
In case this is a GPU or SPIR gen func, we issue a call to + // the runtime that waits for the device to be available + if (DeviceAbstraction) { + Function *NodeGenFunc = NULL; + switch (N->getTag()) { + case visc::GPU_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); + break; + case visc::SPIR_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET); + break; + default: + break; + } + + if (NodeGenFunc) { + // If we found a function to edit, we add the call to the runtime as + // its first statement + BasicBlock *BB = &*NodeGenFunc->begin(); + std::vector<Value *> Args; // TODO: add the device type as argument? + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); + } + + } + + Function *Ftmp = N->getGenFuncForTarget(N->getTag()); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->removeGenFuncForTarget(visc::SPIR_TARGET); + N->setTag(visc::None); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); + + // Sanity checks - to be removed TODO + CF = N->getGenFuncForTarget(visc::CPU_TARGET); + GF = N->getGenFuncForTarget(visc::GPU_TARGET); + SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "After editing\n"; + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? 
SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + // assert(false && "got to the point where we have to select\n"); + } else { + // We have more than one targets + + errs() << "Node Name (for policy) : " + << N->getFuncPointer()->getName() << "\n"; + + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + // These assertions express what we can support with the current runtime. + // Code generation works the same way even for other target combinations. + // For now, we want either CPU and GPU, or CPU and SPIR + assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n"); + assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) && + "Generated functions without appropriate x86 wrapper\n"); + + FunctionType *FT = CF->getFunctionType(); + if (GF) + assert(FT == GF->getFunctionType() && + "Type mismatch between generated functions for GPU and CPU targets.\n"); + if (SF) + assert(FT == SF->getFunctionType() && + "Type mismatch between generated functions for SPIR and CPU targets.\n"); + + // Code generation of wrapper function + Function *F_wrapper; + ValueToValueMapTy VMap; + F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M); + + // Copy argument names over + Function::arg_iterator dest_iterator = F_wrapper->arg_begin(); + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + // Gather all arguments of wrapper in a vector, to prepare the call to + // the individual gen functions + std::vector<Value *> GenFuncCallArgs; + for (Function::arg_iterator i = 
F_wrapper->arg_begin(), e = F_wrapper->arg_end(); + i != e; ++i) { + GenFuncCallArgs.push_back(&*i); + } + + BasicBlock *BBcurrent, *BBtrue, *BBfalse; + + BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper); + + StringRef FName = N->getFuncPointer()->getName(); + size_t nameSize = FName.size()+1; + std::vector<Constant *> NameV; + for (char c: FName) { + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c)); + } + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0')); + ArrayType *NameType = + ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize); + AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent); + Constant *NameConst = ConstantArray::get(NameType, NameV); + StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent); + CastInst *BI = BitCastInst::CreatePointerCast(AI, + Type::getInt8PtrTy(M.getContext()), "", BBcurrent); + std::vector<Value *> Args; + Args.push_back(BI); + Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion", + runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent); + + ConstantInt *CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true); + CmpInst *CmpI = CmpInst::Create(Instruction::ICmp, + CmpInst::ICMP_EQ, + RTFInst, CmpConst, + "", BBcurrent); + + BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper); + BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue); + ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + + // Switch basic block pointers + BBcurrent = BBfalse; + if (GF) { + // We have a GPU version. 
Generate policy check and call + CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true); + CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + RTFInst, CmpConst, "", BBcurrent); + BBtrue = BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper); + BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue); + RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + + if (DeviceAbstraction) { + // Prepare arguments and function for call to wait for device runtime call + std::vector<Value *> Args; // TODO: add the device type as argument? + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); + } + } + + // Switch basic block pointers + BBcurrent = BBfalse; + if (SF) { + // We have a GPU version. Generate policy check and call + CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true); + CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + RTFInst, CmpConst, "", BBcurrent); + BBtrue = BasicBlock::Create(M.getContext(), "version_spir", F_wrapper); + BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper); + BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); + + GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue); + RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); + + if (DeviceAbstraction) { + // Prepare arguments and function for call to wait for device runtime call + std::vector<Value *> Args; // TODO: add the device type as argument? 
+ Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); + } + } + + RI = ReturnInst::Create(M.getContext(), + UndefValue::get(FT->getReturnType()), BBfalse); + + // Now, make the node cpu gen func to be this one + // Remove all other versions and update the tag + N->addGenFunc(F_wrapper, visc::CPU_TARGET, true); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->removeGenFuncForTarget(visc::SPIR_TARGET); + N->setTag(visc::CPU_TARGET); + + // assert(false && "got to the point where we have to combine\n"); + } + +} + +// Code generation for leaf nodes +void CGT_X86::codeGen(DFLeafNode* N) { + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // At this point, the X86 backend does not support code generation for + // the case where allocation node is used, so we skip. This means that a + // CPU version will not be created, and therefore code generation will + // only succeed if another backend (nvptx or spir) has been invoked to + // generate a node function for the node including the allocation node. + if (N->isAllocationNode()) { + DEBUG(errs() << "Skipping allocation node\n"); + return; + } + + // Check if clone already exists. If it does, it means we have visited this + // function before and nothing else needs to be done for this leaf node. +// if(N->getGenFunc() != NULL) +// return; + + if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { + errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << + " : skipping it\n"; + + errs() << "Check for cudnn or promise hint for node " + << N->getFuncPointer()->getName() << "\n"; + + switch (N->getTag()) { + case visc::CUDNN_TARGET: { + errs() << "CUDNN hint found. 
Store CUDNN function as CPU funtion.\n"; + // Make sure there is a generated x86 function for cudnn + assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && ""); + // Store the CUDNN x86 function as the CPU generated function + Function *Ftmp = N->getGenFuncForTarget(N->getTag()); + // after adding the required number of arguments + if (!N->getParent()->isChildGraphStreaming()) + Ftmp = addIdxDimArgs(Ftmp); + + N->removeGenFuncForTarget(visc::CUDNN_TARGET); + N->setTag(visc::None); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); + break; + } + case visc::PROMISE_TARGET: { + errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n"; + // Make sure there is a generated x86 function for promise + assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && ""); + // Store the PROMISE x86 function as the CPU generated function + Function *Ftmp = N->getGenFuncForTarget(N->getTag()); + // after adding the required number of arguments + if (!N->getParent()->isChildGraphStreaming()) + Ftmp = addIdxDimArgs(Ftmp); + + N->setTag(visc::None); + N->removeGenFuncForTarget(visc::PROMISE_TARGET); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); + break; + } + case visc::GPU_TARGET: + // A leaf node should not have an x86 function for GPU + // by design of DFG2LLVM_NVPTX backend + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + break; + case visc::SPIR_TARGET: + // A leaf node should not have an x86 function for SPIR + // by design of DFG2LLVM_SPIR backend + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + default: + break; + } + + return; + } + + assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && + "Error: Visiting a node for which code already generated\n"); + + std::vector<IntrinsicInst *> IItoRemove; + std::vector<std::pair<IntrinsicInst 
*, Value *> > IItoReplace; + BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; + + // Get the function associated woth the dataflow node + Function *F = N->getFuncPointer(); + + // Clone the function, if we are seeing this function for the first time. + Function *F_X86; + ValueToValueMapTy VMap; + F_X86 = CloneFunction(F, VMap); + F_X86->removeFromParent(); + // Insert the cloned function into the module + M.getFunctionList().push_back(F_X86); + + // Add the new argument to the argument list. Add arguments only if the cild + // graph of parent node is not streaming + if(!N->getParent()->isChildGraphStreaming()) + F_X86 = addIdxDimArgs(F_X86); + + // Add generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); + N->addGenFunc(F_X86, visc::CPU_TARGET, true); + + /*** FIXME: HACK FOR DSSOC DEMO -- BEGIN ***/ + /* This part of the code is meant to handle turning the CPU backend into an + "accelerator" backend for ApproxHPVM. For this reason, the HPVM runtime + needs to be essentially deactivated. */ + + /* We look into the leaf node's function for function call starting from + "tensor". These are functions with which we replaced the ApproxHPVM + intrinsics, and for which we have LLVM implementations. If found, it means + we are dealing with an AproxHPVM program. */ + bool isApproxHPVMnode = false; + for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { + Instruction *I = &(*i); + DEBUG(errs() << *I << "\n"); + + if (CallInst *CI = dyn_cast<CallInst>(I)) { + if ((CI->getCalledFunction()->getName()).startswith("tensor")) { + isApproxHPVMnode = true; + break; + } + } + } + + /*As in CUDNN backend, we remove the in out attributes of tensor operations, + aiming to deactivate the HPVM runtime calls. This has been tested through + CUDNN backend for the internal node codegen, and should ensure that code + does not insert llvm_visc_x86_argument_ptr in the generated function for + leaf node codegen as well. 
*/ + + /* Removing HPVM in/out/inout function attributes */ + if (isApproxHPVMnode) { + for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); ai != ae; ai++) { + Argument *Arg = &*ai; + if(Arg->hasAttribute(Attribute::In)) + Arg->removeAttr(Attribute::In); + if(Arg->hasAttribute(Attribute::Out)) + Arg->removeAttr(Attribute::Out); + if(Arg->hasAttribute(Attribute::InOut)) + Arg->removeAttr(Attribute::InOut); + } + }else{ + printf("****** NO REMOVEAL *** \n\n"); + } + + /*** FIXME: HACK FOR DSSOC DEMO -- END ***/ + + // Go through the arguments, and any pointer arguments with in attribute need + // to have x86_argument_ptr call to get the x86 ptr of the argument + // Insert these calls in a new BB which would dominate all other BBs + // Create new BB + BasicBlock* EntryBB = &*F_X86->begin(); + BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); + BranchInst* Terminator = BranchInst::Create(EntryBB, BB); + // Insert calls + for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); + ai != ae; ++ai) { + if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) { + assert(ai->getType()->isPointerTy() + && "Only pointer arguments can have visc in/out attributes "); + Function::arg_iterator aiNext = ai; + ++aiNext; + Argument* size = &*aiNext; + assert(size->getType() == Type::getInt64Ty(M.getContext()) + && "Next argument after a pointer should be an i64 type"); + CastInst* BI = BitCastInst::CreatePointerCast(&*ai, + Type::getInt8PtrTy(M.getContext()), + ai->getName()+".i8ptr", + Terminator); + Value* ArgPtrCallArgs[] = {BI, size}; + CallInst::Create(llvm_visc_x86_argument_ptr, + ArrayRef<Value*>(ArgPtrCallArgs, 2), + "", + Terminator); + + } + } + errs() << *BB << "\n"; + + // Go through all the instructions + for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { + Instruction *I = &(*i); + DEBUG(errs() << *I << "\n"); + // Leaf nodes should not contain VISC 
graph intrinsics or launch + assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + + if (BuildDFG::isViscQueryIntrinsic(I)) { + IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; + + /*********************************************************************** + * Handle VISC Query intrinsics * + ***********************************************************************/ + switch (II->getIntrinsicID()) { + /**************************** llvm.visc.getNode() *******************/ + case Intrinsic::visc_getNode: { + // add mapping <intrinsic, this node> to the node-specific map + Leaf_HandleToDFNodeMap[II] = N; + IItoRemove.push_back(II); + break; + } + /************************* llvm.visc.getParentNode() ****************/ + case Intrinsic::visc_getParentNode: { + // get the parent node of the arg node + // get argument node + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + // get the parent node of the arg node + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // Add mapping <intrinsic, parent node> to the node-specific map + // the argument node must have been added to the map, orelse the + // code could not refer to it + Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); + IItoRemove.push_back(II); + break; + } + /*************************** llvm.visc.getNumDims() *****************/ + case Intrinsic::visc_getNumDims: { + // get node from map + // get the appropriate field + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim(); + IntegerType* IntTy = Type::getInt32Ty(M.getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); + + II->replaceAllUsesWith(numOfDimConstant); + IItoRemove.push_back(II); + break; + } + /*********************** 
llvm.visc.getNodeInstanceID() **************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + case Intrinsic::visc_getNodeInstanceID_z: { + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_X86 + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); + + if(parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 + (3-dim); + // Traverse argument list of F_X86 in reverse order to find the + // correct index or dim argument. + Argument* indexVal = getArgumentFromEnd(F_X86, offset); + assert(indexVal && "Index argument not found. 
Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *indexVal << "\n"); + + II->replaceAllUsesWith(indexVal); + IItoRemove.push_back(II); + } + else { + // Case when query is for an ancestor + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance, + ArrayRef<Value*>(args, 2), + "nodeInstanceID", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + break; + } + /********************** llvm.visc.getNumNodeInstances() *************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { + + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNumNodeInstances_[xyz]. 
Check Intrinsic ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_X86 + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); + + if(parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 - dim; + // Traverse argument list of F_X86 in reverse order to find the + // correct index or dim argument. + Argument* limitVal = getArgumentFromEnd(F_X86, offset); + assert(limitVal && "Limit argument not found. Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *limitVal << "\n"); + + II->replaceAllUsesWith(limitVal); + IItoRemove.push_back(II); + } + else { + // Case when query is from the ancestor + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit, + ArrayRef<Value*>(args, 2), + "numNodeInstances", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + + break; + } + default: + DEBUG(errs() << "Found unknown intrinsic with ID = " << + II->getIntrinsicID() << "\n"); + assert(false && "Unknown VISC Intrinsic!"); + break; + } + + } else { + //TODO: how to handle address space qualifiers in load/store + } + + } + + //TODO: + // When to replace the uses? + // In which order is it safe to replace the instructions in + // IItoReplace? 
+ // Probably in the reverse order in the vectors + // It is a good idea to have them in one vector and chech the type + // using dyn_cast in order to determine if we replace with inst or value + + + //TODO: maybe leave these instructions to be removed by a later DCE pass + for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin(); + i != IItoRemove.end(); ++i) { + (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType())); + (*i)->eraseFromParent(); + } + + DEBUG(errs() << *F_X86); +} + +} // End of namespace + +char DFG2LLVM_X86::ID = 0; +static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86-dsoc", + "Dataflow Graph to LLVM for X86 backend (DSOCC version)", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); + diff --git a/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt b/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt new file mode 100644 index 0000000000..a6c4de9537 --- /dev/null +++ b/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = DFG2LLVM_X86_dsoc +parent = Transforms + diff --git a/lib/ExtractHPVMLeafNodes/CMakeLists.txt b/lib/ExtractHPVMLeafNodes/CMakeLists.txt new file mode 100644 index 0000000000..6421b528d7 --- /dev/null +++ b/lib/ExtractHPVMLeafNodes/CMakeLists.txt @@ -0,0 +1,13 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( ExtractHPVMLeafNodes + ExtractHPVMLeafNodes.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) + diff --git a/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp b/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp new file mode 100644 index 0000000000..cd7ead9f6c --- /dev/null +++ b/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.cpp @@ -0,0 +1,246 @@ +//===------------------- ExtractHPVMLeafNodeGenFunctions.cpp -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ExtractHPVMLeafNodes" + +#include "llvm/Support/SourceMgr.h" +#include "llvm/Pass.h" +#include "llvm/SupportVISC/DFGTreeTraversal.h" +#include "llvm/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/FileSystem.h" + +using namespace llvm; +using namespace builddfg; +using namespace extracthpvmleaf; +using namespace dfg2llvm; + +namespace { + +class PrintLeafNodes : public DFGTreeTraversal { + public: + virtual void process(DFInternalNode* N) override; + virtual void process(DFLeafNode* N) override; + + // Constructor + PrintLeafNodes(Module &_M, BuildDFG &_DFG) : DFGTreeTraversal(_M, _DFG) {} + +}; + +} + +void PrintLeafNodes::process(DFInternalNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); + return; // nothing to do +} + +void PrintLeafNodes::process(DFLeafNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); + if((N->isDummyNode())) { + DEBUG(errs() << "Skipping Dummy Node: " << N->getFuncPointer()->getName() << "\n"); + return; + } + + // Find function generated for node + Function *F = N->getGenFuncForTarget(visc::CPU_TARGET); + assert(F != NULL + && "This pass is invoked after code generation for x86 is completed.\nFound leaf node for which code generation has not happened!\n"); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && + "The generated function from x86 pass is not an x86 function\n"); + + std::string module_name = std::string("./build/") + std::string(F->getName().str().c_str()) + std::string("_module.ll"); + Twine tw(module_name); + // Create a new module for the node function + //Twine tw = Twine(F->getName()).concat(Twine("_module.ll")); + Module *m = new 
Module(tw.str(), F->getParent()->getContext()); + // Create a new function for F. It will be written to a new module. + ValueToValueMapTy VMap; + Function *ClonedF = CloneFunction(F, VMap); + // Remove it from current module + ClonedF->removeFromParent(); + // Insert it to the newly created module for it + m->getFunctionList().push_back(ClonedF); + + std::vector<Instruction*> ItoRemove; + + for (inst_iterator i = inst_begin(ClonedF), e = inst_end(ClonedF); i != e; ++i) { + Instruction *I = &(*i); + errs() << *I << "\n"; + + if (CallInst *CI = dyn_cast<CallInst>(I)) { + errs() << "Found call instruction\n"; + + Function *CalledF = CI->getCalledFunction(); + StringRef CallName = CalledF->getName(); + errs() << "CallName: " << CallName << "\n"; + +// if (CallName.startswith("llvm_visc")) { //TODO + if ((CallName.startswith("llvm_visc")) || (CallName.startswith("tensor"))) { //TODO +// errs() << "This is an HPVM runtime call. Include its declaration.\n"; + errs() << "This is an HPVM runtime call or tensor. Include its declaration.\n"; + + FunctionType *CalledFType = CalledF->getFunctionType(); + + std::vector<Value*> Fargs; + for (unsigned argno = 0; argno < CI->getNumArgOperands(); argno++) { + Fargs.push_back(CI->getArgOperand(argno)); + } + Function *FDecl = cast<Function>(m->getOrInsertFunction(CallName, CalledFType)); + CallInst *NewCI = CallInst::Create(CalledFType, FDecl, Fargs, CallName, CI); + errs() << "NewCI: " << *NewCI << "\n"; + CI->replaceAllUsesWith(NewCI); + ItoRemove.push_back(CI); + } + } + } + + for (unsigned i = 0; i < ItoRemove.size() ; i++) { + ItoRemove[i]->eraseFromParent(); + } + + ItoRemove.clear(); + + // Print new module + legacy::PassManager Passes; + + errs() << "Writing to File --- " << tw.str() << "\n"; + std::error_code EC; + tool_output_file Out(tw.str(), EC, sys::fs::F_None); + if (EC) { + errs() << EC.message() << '\n'; + } + + Passes.add(createPrintModulePass(Out.os())); + Passes.run(*m); + // Declare success. 
+ Out.keep(); + + // Any call that is to F, needs to call the new external function + // Edit initial module to do so + // This is the name with which the function is called now + StringRef FName = ClonedF->getName(); + FunctionType *FType = F->getFunctionType(); + + // This is a node function, so it is only called through the dataflow graph + assert(F->hasOneUse() && "F is an HPVM node function\n"); + +/* + errs() << "F uses: " << F->getNumUses() << "\n" ; + for(Value::user_iterator ui = F->user_begin(), + ue = F->user_end(); ui!=ue; ++ui) { + errs() << "use : "<< **ui << "\n"; + } +*/ + + // Get the parent node's generated x86 function + DFInternalNode *ParentNode = N->getParent(); + Function *PGenF = ParentNode->getGenFuncForTarget(visc::CPU_TARGET); + assert(PGenF != NULL + && "This pass is invoked after code generation for x86 is completed.\nFound node for which code generation has not happened!\n"); + assert(ParentNode->hasX86GenFuncForTarget(visc::CPU_TARGET) && + "The generated function from x86 pass is not an x86 function\n"); + + for (inst_iterator i = inst_begin(PGenF), e = inst_end(PGenF); i != e; ++i) { + Instruction *I = &(*i); + errs() << *I << "\n"; + + if (CallInst *CI = dyn_cast<CallInst>(I)) { + errs() << "Found call instruction\n"; + + StringRef CallName = CI->getCalledFunction()->getName(); + errs() << "CallName: " << CallName << "\n"; + errs() << "F->getName(): " << F->getName() << "\n"; + + if (CallName == F->getName()) { + // Found the call to the leaf node function we moved to the other module. 
+ // Replace the call + std::vector<Value*> Fargs; + for (unsigned argno = 0; argno < CI->getNumArgOperands(); argno++) { + Fargs.push_back(CI->getArgOperand(argno)); + } + Function *FDecl = cast<Function>(M.getOrInsertFunction(FName, FType)); + CallInst *NewCI = CallInst::Create(FType, FDecl, Fargs, FName, CI); + errs() << "NewCI: " << *NewCI << "\n"; + CI->replaceAllUsesWith(NewCI); + ItoRemove.push_back(CI); + } + } + } + + for (unsigned i = 0; i < ItoRemove.size() ; i++) { + ItoRemove[i]->eraseFromParent(); + } + + // Clean up + ClonedF->eraseFromParent(); + delete m; + + F->replaceAllUsesWith(UndefValue::get(F->getType())); + F->eraseFromParent(); + + return; +} + +void ExtractHPVMLeafNodeFunctions::run(Module &M, BuildDFG &DFG) { + + errs() << "\nEXTRACT HPVM LEAF NODE FUNCTIONS PASS\n"; + + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Visitor for Graph Traversal + PrintLeafNodes *LeafVisitor = new PrintLeafNodes(M, DFG); + + // Iterate over all the DFGs + // Analyse the edges for parameters that are valid to be used in place + for (auto rootNode: Roots) { + LeafVisitor->visit(rootNode); + } + + delete LeafVisitor; + return; +} + +namespace { +struct ExtractHPVMLeafNodeGenFunctionsWrapper : public ModulePass { + static char ID; + ExtractHPVMLeafNodeGenFunctionsWrapper() : ModulePass(ID) {} + + bool runOnModule(Module &) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; +} // end anonymous namespace + +void ExtractHPVMLeafNodeGenFunctionsWrapper::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); +} + +bool ExtractHPVMLeafNodeGenFunctionsWrapper::runOnModule(Module &M) { + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + ExtractHPVMLeafNodeFunctions ELNF; + ELNF.run(M, DFG); + + return false; +} + +char ExtractHPVMLeafNodeGenFunctionsWrapper::ID = 0; +static RegisterPass<ExtractHPVMLeafNodeGenFunctionsWrapper> 
X( + "hpvm-extract-leaf-gen", + "Pass to extract leaf nodes to modules in HPVM", + false /* does not modify the CFG */, +true /* transformation, not just analysis */); diff --git a/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.exports b/lib/ExtractHPVMLeafNodes/ExtractHPVMLeafNodes.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/ExtractHPVMLeafNodes/LLVMBuild.txt b/lib/ExtractHPVMLeafNodes/LLVMBuild.txt new file mode 100644 index 0000000000..9862f559e5 --- /dev/null +++ b/lib/ExtractHPVMLeafNodes/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = ExtractHPVMLeafNodes +parent = Transforms + diff --git a/lib/FuseHPVMTensorNodes/CMakeLists.txt b/lib/FuseHPVMTensorNodes/CMakeLists.txt new file mode 100644 index 0000000000..374f3b26f1 --- /dev/null +++ b/lib/FuseHPVMTensorNodes/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMFuseHPVMTensorNodes + FuseHPVMTensorNodes.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp b/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp new file mode 100644 index 0000000000..541efe4e1d --- /dev/null +++ b/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp @@ -0,0 +1,1007 @@ +//=== FuseHPVMTensorNodes.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// 
This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "FuseTensorNodes" + +#include "llvm/IR/ValueMap.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include "llvm/FuseHPVMTensorNodes/FuseHPVMTensorNodes.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/SupportVISC/VISCUtils.h" + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; +using namespace viscUtils; + +namespace tensorfuse { +/*** Classes ***/ + +/*** Helper Functions ***/ + +/* Return the constant integer represented by value V */ +static unsigned getNumericValue(Value* V) { + assert(isa<ConstantInt>(V) + && "Value indicating the number of arguments should be a constant integer"); + return cast<ConstantInt>(V)->getZExtValue(); +} + +/* Query the kind of edge described by a createEdge intrinsic IIe * + * with respect to node handle IIn */ +static bool isIncomingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) { + Value* Src = IIe->getArgOperand(1); + IntrinsicInst* ArgII = cast<IntrinsicInst>(Src); + assert(ArgII && "First argument of createEdge is not an intrinsic"); + return (ArgII == IIn); +} +static bool isOutgoingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) { + Value* Src = IIe->getArgOperand(0); + IntrinsicInst* ArgII = cast<IntrinsicInst>(Src); + assert(ArgII && "First argument of createEdge is not an intrinsic"); + return (ArgII == IIn); +} + +/* Populates vector with all incoming edge intrinsics to node II */ +static void getIncomingEdgeIntrinsicList(IntrinsicInst *II, + std::vector<IntrinsicInst*> &EdgeList) { + for(Value::user_iterator ui = II->user_begin(), + ue = II->user_end(); ui!=ue; ++ui) { + IntrinsicInst* useI = dyn_cast<IntrinsicInst>(*ui); + assert(useI && + "HPVM graph intrinsic used in non HPVM intrinsic instruction\n"); + if (useI->getIntrinsicID() != 
Intrinsic::visc_createEdge) + continue; // Skip all non edge intrinsics + + // For edge intrinsics, test the descination operand + if (useI->getOperand(1) == II) { // Argument is the destination + EdgeList.push_back(useI); + } + } + return; +} + +/* Returns true if argument at position argno is coming from a dataflow edge * + * in the vector EdgeList */ +static bool isIncomingEdgeArgument(unsigned argno, + std::vector<IntrinsicInst*> &EdgeList) { + for (IntrinsicInst *ii : EdgeList) { + if (getNumericValue(ii->getOperand(4)) == argno) + return true; + } + return false; +} + + +// Check that this is a valid HPVM Tensor Node (starts with an HPVM intrinsic) +// Return the node intrinsic function +static IntrinsicInst *isValidHPVMTensorNode(DFNode *N) { + + Function *F = N->getFuncPointer(); + //IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*(inst_begin(F))); + + IntrinsicInst *II; + for (auto I = inst_begin(F), E = inst_end(F); I != E; I++){ + + if(dyn_cast<IntrinsicInst>(&*I)){ + II = dyn_cast<IntrinsicInst>(&*I); + if ((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor")){ + errs()<<"** Tensor Intrinsic = " << *II << "\n"; + } + + } + } + + //assert(II && + // "HPVM tensor intrinsic expected as first instruction of HPVM tensor node\n"); + + //assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") && + // "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + + return II; +} + + +// Returns the next node in a node sequence, or NULL if it does not exist. 
+// We consider two nodes a sequence if SrcN has a single successor, DstN, +// and DstN a single predeccessor, SrcN (other than the Root node) +static DFNode *findNextNodeInSequence(DFNode *SrcN) { + + DFNode *DstN = NULL; + + for (DFNode::successor_iterator si = SrcN->successors_begin(), + se = SrcN->successors_end(); si != se; ++si) { + DFNode *N = *si; + if (N->isDummyNode()) { + continue; + } + if (!DstN) + DstN = N; + if (DstN != N) { + errs() << "Found different destination nodes: no node sequence.\n"; + return NULL; + } + } + + // If we reach this point, DstN is the unique successor of SrcN + + // Now, test that the DstN has a single predeccessor except Root (dummy) + for (DFNode::indfedge_iterator eb = DstN->indfedge_begin(), + ee = DstN->indfedge_end(); eb != ee; ++eb) { + DFNode *SN = (*eb)->getSourceDF(); + if ((SN != SrcN) && (!(SN->isDummyNode()))) { + // Does not satisfy requirement + return NULL; + } + } + + return DstN; +} + +/*** Methods ***/ + +/* Create an identical bind (in or out, depending on the argument intrinsic) * + * with different src (true) or dst (false) port */ +IntrinsicInst* FuseHPVMTensorNodes::createIdenticalBindWithDifferentPort( + IntrinsicInst* II, unsigned port, bool srcport) { + // Argument of the function to be called + ConstantInt* PortConstant = + ConstantInt::get(Type::getInt32Ty(II->getContext()), port); + Value* SrcPort = (srcport) ? PortConstant: II->getArgOperand(1); + Value* DstPort = (srcport) ? 
II->getArgOperand(2): PortConstant; + + Value* BindArgs[] = {II->getArgOperand(0), + SrcPort, + DstPort, + II->getArgOperand(3) + }; + Function* BindF = II->getCalledFunction(); + CallInst* BindInst = CallInst::Create(BindF, + ArrayRef<Value*>(BindArgs, 4), + ""); + IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst); + + return newII; +} + +/* Given two createNode intrinsics describing connected nodes, this function * + * returns the argument list type of the fused function */ +void FuseHPVMTensorNodes::createArgTypes(IntrinsicInst* II1, + IntrinsicInst* II2, + std::vector<Type*> &ArgTypes) { + Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts()); + + // Arguments of the first node are simply added + for(auto& arg: F1->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + ArgTypes.push_back(arg.getType()); + } + + // Arguments of the second node are added only if they are not the output of + // the previous node + + // Find all incoming edges. 
+ std::vector<IntrinsicInst *> IncomingEdgeList; + getIncomingEdgeIntrinsicList(II2, IncomingEdgeList); + + // Their source must be the first fusion node, otherwise they would not have + // been fusion candidates + for (IntrinsicInst *ii : IncomingEdgeList) { + assert((ii->getOperand(0) == II1) && "Unexpected source operand\n"); + } + + // Add argument type to the new function only if it is not incoming from + // an edge + for(auto& arg: F2->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + unsigned inport = arg.getArgNo(); + if (isIncomingEdgeArgument(inport, IncomingEdgeList)) + continue; + ArgTypes.push_back(arg.getType()); + } +} + +/* Get the return type of the function for fused node II1-II2 */ +StructType* FuseHPVMTensorNodes::createReturnType(IntrinsicInst* II1, + IntrinsicInst* II2) { + Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts()); + + // Based on the HPVM tensor node assumptions and the patterns we want to + // support, when two nodes are fused the result will always be the result + // of the second node. 
+ StructType* F1RetTy = dyn_cast<StructType>(F1->getReturnType()); + assert(F1RetTy && "Return Type must always be a struct"); + StructType* F2RetTy = dyn_cast<StructType>(F2->getReturnType()); + assert(F2RetTy && "Return Type must always be a struct"); + + return F2RetTy; +} + +/* Copy argument names, from functions of II1 and II2 to F */ +void FuseHPVMTensorNodes::copyArgumentNames(IntrinsicInst* II1, + IntrinsicInst* II2, + Function* F) { + Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts()); + + Function::arg_iterator dest_it = F->arg_begin(); + + // Argument names of the first node are simply copied + for(auto& arg: F1->getArgumentList()) { + dest_it->setName("s_" + arg.getName()); + dest_it++; + } + + // For the second node, we ignore those arguments that are incoming edges + // (from II1) + // Find all incoming edges. + std::vector<IntrinsicInst *> IncomingEdgeList; + getIncomingEdgeIntrinsicList(II2, IncomingEdgeList); + + // Their source must be the first fusion node, otherwise they would not have + // been fusion candidates + for (IntrinsicInst *ii : IncomingEdgeList) { + assert((ii->getOperand(0) == II1) && "Unexpected source operand\n"); + } + + // Copy argument name to the new function only if it is not incoming from + // an edge + for(auto& arg: F2->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + unsigned inport = arg.getArgNo(); + if (isIncomingEdgeArgument(inport, IncomingEdgeList)) + continue; + + dest_it->setName("d_" + arg.getName()); + dest_it++; + } + assert((dest_it == F->arg_end()) && + "Argument list of fused function not fully traversed\n"); + return; +} + +/* Copy attributes, from functions of II1 and II2 to F */ +void FuseHPVMTensorNodes::copyAttrList(IntrinsicInst* II1, + IntrinsicInst* II2, + Function* F) { + Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = 
cast<Function>((II2->getOperand(0))->stripPointerCasts()); + + Function::arg_iterator f1_ai = F1->arg_begin(), f1_ae = F1->arg_end(); + Function::arg_iterator f2_ai = F2->arg_begin(), f2_ae = F2->arg_end(); + Function::arg_iterator f_ai = F->arg_begin(), f_ae = F->arg_end(); + + // For the second node, we have to ignore the arguments that are incoming + // edges (from II1) + // Find all incoming edges. + std::vector<IntrinsicInst *> IncomingEdgeList; + getIncomingEdgeIntrinsicList(II2, IncomingEdgeList); + + // Their source must be the first fusion node, otherwise they would not have + // been fusion candidates + for (IntrinsicInst *ii : IncomingEdgeList) { + assert((ii->getOperand(0) == II1) && "Unexpected source operand\n"); + } + + // Copy attributes of F1 + for(; f1_ai != f1_ae && f_ai != f_ae; ++f1_ai, ++f_ai) { + AttributeSet AS = F1->getAttributes(); + DEBUG(errs() << "Copying attributes from " + << F1->getName() << " at " << f1_ai->getArgNo() << "\n"); + AttrBuilder AB(AS, f1_ai->getArgNo()+1); + AttributeSet argAS = AttributeSet::get(F1->getContext(), + f_ai->getArgNo()+1, AB); + F->addAttributes(f_ai->getArgNo()+1, argAS); + } + + // Copy needed attributes of F2 + for(; f2_ai != f2_ae && f_ai != f_ae; ++f2_ai) { + unsigned inport = f2_ai->getArgNo(); + if (isIncomingEdgeArgument(inport, IncomingEdgeList)) + continue; + + AttributeSet AS = F2->getAttributes(); + DEBUG(errs() << "Copying attributes from " + << F2->getName() << " at " << f2_ai->getArgNo() << "\n"); + AttrBuilder AB(AS, f2_ai->getArgNo()+1); + AttributeSet argAS = AttributeSet::get(F2->getContext(), + f_ai->getArgNo()+1, AB); + F->addAttributes(f_ai->getArgNo()+1, argAS); + ++f_ai;; + } + return; +} + +/* Creates and inserts an empty function of the rght type for the fused node */ +Function* FuseHPVMTensorNodes::createEmptyDFNodeFunction(IntrinsicInst* II1, + IntrinsicInst* II2, + Module &M) { + Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = 
cast<Function>((II2->getOperand(0))->stripPointerCasts()); + + DEBUG(errs () << "Constructing argument list\n"); + // Construct argument list + std::vector<Type*> ArgTypes; + createArgTypes(II1, II2, ArgTypes); + + DEBUG(errs () << "Constructing return type\n"); + // Construct return type + StructType* FRetTy = createReturnType(II1, II2); + + FunctionType* FTy = FunctionType::get(FRetTy, ArgTypes, false); + // Create a function with the new type + Function* F = Function::Create(FTy, F1->getLinkage(), + F1->getName()+"_"+F2->getName(), &M); + + DEBUG(errs () << "Copying argument names\n"); + // Copy argument names from original functions + copyArgumentNames(II1, II2, F); + // Copy argument attributes from original functions + copyAttrList(II1, II2, F); + + return F; +} + +/* Inline first node function, updating required mappings * + * - F1: first node function * + * - M: module containing the node function * + * - Ffused: fused node function * + * - VMap: maps values used in the body of F1 to those that mst be used in * + the body of the fused function instead * + * OutVs: This maps the output struct field index to the stored value */ +void FuseHPVMTensorNodes::inlineFirstNodeFunction(Module &M, Function *F1, + Function *Ffused, + ValueMap<Value*, Value*> &VMap, + std::vector<Value*> &OutVs) { + + ReturnInst *RI = cast<ReturnInst>(Ffused->getEntryBlock().getTerminator()); + + inst_iterator f1_i = inst_begin(F1); + // First, we copy the HPVM intrinsics of F1 into Ffused, applying the mapping + for (inst_iterator f1_e = inst_end(F1); f1_i != f1_e; ++f1_i) { + Instruction *I = &(*f1_i); + if (!(BuildDFG::isViscIntrinsic(I))) { + // We are done with the node computation + break; + } + + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert ( ((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") + || (II->getCalledFunction()->getName()).startswith("llvm.visc.node.id") ) + && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + + 
std::vector<Value*> Args; + for(unsigned i = 0; i < II->getNumArgOperands(); i++) { + Value *V = II->getArgOperand(i); + if (isa<Constant>(V)) { // Constants can be reused + Args.push_back(V); + } else { + assert((VMap.find(V) != VMap.end()) && + "Attempted to use value without existing mapping in VMap"); + Args.push_back(VMap[V]); + } + } + + Function *F = Intrinsic::getDeclaration(&M, II->getIntrinsicID()); + CallInst* CI = + CallInst::Create(F, Args, + F->getReturnType()->isVoidTy()? "" : "s_"+II->getName(), RI); + // Update the map with the newly created value + VMap[II] = CI; + } + + // We continue with gathering information about the return values + for (inst_iterator f1_e = inst_end(F1); f1_i != f1_e; ++f1_i) { + Instruction *I = &(*f1_i); + InsertValueInst* IV = dyn_cast<InsertValueInst>(I); + if (!IV) { + // End of insertvalue instructions. This should be a return statement + assert((dyn_cast<ReturnInst>(I)) && "Unexpected Instruction\n"); + break; // Done processing this function + } + OutVs.push_back(IV->getOperand(1)); + } + return; +} + +/* Inline second node function, updating required mappings * + * - F2: second node function * + * - M: module containing the node function * + * - Ffused: fused node function * + * - VMap: maps values used in the body of F2 to those that must be used in * + the body of the fused function instead */ +void FuseHPVMTensorNodes::inlineSecondNodeFunction(Module &M, Function *F2, + Function *Ffused, ValueMap<Value*, Value*> &VMap) { + + ReturnInst *RI = cast<ReturnInst>(Ffused->getEntryBlock().getTerminator()); + + // Copy the body of F2 into Ffused, applying the mapping + inst_iterator f2_i = inst_begin(F2); + for (inst_iterator f2_e = inst_end(F2); f2_i != f2_e; ++f2_i) { + Instruction *I = &(*f2_i); + if ((BuildDFG::isViscIntrinsic(I))) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert( ((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") + || 
(II->getCalledFunction()->getName()).startswith("llvm.visc.node.id")) + && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + + if ( (II->getCalledFunction()->getName()).startswith("llvm.visc.node.id")) { + continue; // Skip adding visc.node.id calls in nodes other than first node + } + + std::vector<Value*> Args; + for(unsigned i = 0; i < II->getNumArgOperands(); i++) { + Value *V = II->getArgOperand(i); + if (isa<Constant>(V)) { // Constants can be reused + Args.push_back(V); + } else { + assert((VMap.find(V) != VMap.end()) && + "Attempted to use value without existing mapping in VMap"); + Args.push_back(VMap[V]); + } + } + Function *F = Intrinsic::getDeclaration(&M, II->getIntrinsicID()); + CallInst* CI = + CallInst::Create(F, Args, + F->getReturnType()->isVoidTy()? "" : II->getName(), + RI); + // Update the map with the newly created value + VMap[II] = CI; + } else if (InsertValueInst* IV = dyn_cast<InsertValueInst>(I)) { + Value *AggOp = IV->getAggregateOperand(); + Value *InsOp = IV->getInsertedValueOperand(); + assert(((VMap.find(AggOp) != VMap.end()) || + (isa<Constant>(AggOp)) ) && + "Attempted to use value without existing mapping in VMap"); + assert(((VMap.find(InsOp) != VMap.end()) || + (isa<Constant>(InsOp))) && + "Attempted to use value without existing mapping in VMap"); + InsertValueInst* IVI = InsertValueInst::Create( + (isa<Constant>(AggOp)) ? AggOp : VMap[AggOp], + (isa<Constant>(InsOp)) ? 
InsOp : VMap[InsOp], + IV->getIndices(), + IV->getName(), + RI); + // Update the map with the newly created value + VMap[IV] = IVI; + } else { + ReturnInst* RetI = dyn_cast<ReturnInst>(I); + assert(RetI && "Unexpected Instruction\n"); + Value *RetVal = RetI->getOperand(0); + ReturnInst *newRI = ReturnInst::Create(Ffused->getContext(), + VMap[RetVal]); + ReplaceInstWithInst(RI, newRI); + } + } + return; +} + +/* Create function of leaf node after fusion * + * - create type * + * - create empty function of the type * + * - inline body of first function (applying and updating appropriate * + * mappings) * + * - inline body of second function (applying and updating appropriate * + * mappings) */ +Function* FuseHPVMTensorNodes::createLeafDFNodeFunction(IntrinsicInst* II1, + IntrinsicInst* II2, + Module &M) { + DEBUG(errs () << "Creating function signature\n"); + + /* Create empty node function of the correct type */ + Function* Ffused = createEmptyDFNodeFunction(II1, II2, M); + + // Get return type, needed for building the assignments to the return struct + StructType* FfusedRetTy = cast<StructType>(Ffused->getReturnType()); + + /* Mapping information required for using the correct values in the body of * + * the fused node function */ + + // This map maps the values used in the original function bodies with + // the ones that need to be used in the fused function body. 
+ ValueMap<Value*, Value*> FusedValueMap; + + // Intermediate information saved for return values of first node function + // This maps the output port to the value returned through the outgoing edge + std::vector<Value*> OutValues; + + DEBUG(errs () << "Creating function body\n"); + + // Add a basic block to the new, empty function + BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", Ffused); + ReturnInst::Create(M.getContext(), UndefValue::get(FfusedRetTy), BB); + + // Get the node functions + Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts()); + + // Initially, update FusedValueMap: it is populated with the arguments of F1 + Function::arg_iterator fused_arg_it = Ffused->arg_begin(); + // Argument names of the first node are simply copied + for(auto& arg: F1->getArgumentList()) { + FusedValueMap[&arg] = &*fused_arg_it; + ++fused_arg_it; + } + + + // for(const auto& v: FusedValueMap) { + // errs() << "key = " << *(v.first) << "\t"; + // errs() << "value = " << *(v.second) << "\n"; + // } + + // Invoke function that inlines F1 into Ffused, using and updating mappings + inlineFirstNodeFunction(M, F1, Ffused, FusedValueMap, OutValues); + + // Compute mapping between inputs of F2 and outputs of F1 + std::vector<IntrinsicInst *> IncomingEdgeList; + getIncomingEdgeIntrinsicList(II2, IncomingEdgeList); + std::vector<unsigned> PortMap(IncomingEdgeList.size(), 0); + for (IntrinsicInst * ii : IncomingEdgeList) { + unsigned srcPort = getNumericValue(ii->getOperand(3)); + unsigned dstPort = getNumericValue(ii->getOperand(4)); + PortMap[dstPort] = srcPort; + } + + // FusedValueMap is now populated with the arguments of F2 as well + for(auto& arg: F2->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + unsigned inport = arg.getArgNo(); + if (isIncomingEdgeArgument(inport, IncomingEdgeList)) { + // Get the mappings of the return values of F1 if incoming edge argument + 
Value *V = OutValues[PortMap[inport]]; + FusedValueMap[&arg] = (isa<Constant>(V)) ? V: FusedValueMap[V]; + } + else { + // Get new argument otherwise + FusedValueMap[&arg] = &*fused_arg_it; + ++fused_arg_it; + } + } + + // Invoke function that inlines F2 into Ffused, using and updating mappings + inlineSecondNodeFunction(M, F2, Ffused, FusedValueMap); + + // Done with fused node function + return Ffused; +} + +/* Updates parent of fused nodes to use the new node intrinsic */ +void FuseHPVMTensorNodes::updateParentNodeFunction(IntrinsicInst* II1, + IntrinsicInst* II2, + IntrinsicInst* IInew) { + + // Compute the required shifting of positions for edges/binds to the second + // fusion node. No shifting is required for the first fusion node. + Function* F1 = cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts()); + std::vector<unsigned> ShiftMap(F2->getFunctionType()->getNumParams(), 0); + unsigned shiftCount = F1->getFunctionType()->getNumParams(); + + // Find all incoming edges. 
+ std::vector<IntrinsicInst *> IncomingEdgeList; + getIncomingEdgeIntrinsicList(II2, IncomingEdgeList); + // Their source must be the first fusion node, otherwise they would not have + // been fusion candidates + for (IntrinsicInst *ii : IncomingEdgeList) { + assert((ii->getOperand(0) == II1) && "Unexpected source operand\n"); + } + + // Compute shift map for n2: maps position in F2 arg list to Ffused arg list + for(auto& arg: F2->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + unsigned inport = arg.getArgNo(); + if (isIncomingEdgeArgument(inport, IncomingEdgeList)) + continue; + + ShiftMap[inport] = shiftCount; + shiftCount++; + } + + std::vector<IntrinsicInst*> IItoRemove; + + // First, iterate over uses of the first node's createNode intrinsic + for (Value::user_iterator i = II1->user_begin(), ie = II1->user_end(); + i != ie; ++i) { + Instruction *VI = dyn_cast<Instruction>(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI); + assert(II && "Use of a node handle outside of a visc intrinsic\n"); + + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createEdge: + { + if (isOutgoingEdgeIntrinsic(II,II1)) { + assert(isIncomingEdgeIntrinsic(II,II2) && + "Outgoing edge of node 1 should only go to node 2\n"); + IItoRemove.push_back(II); + } + } + break; + case Intrinsic::visc_bind_input: + { + } + break; + case Intrinsic::visc_bind_output: + { + assert(false && + "Source node of node fusion not expected in bind.out\n"); + } + break; + default: + llvm_unreachable("Unknown use of HPVM createNode handle\n"); + break; + } + } + + // Delete gathered instructions - they are the edges between n1-n2 + for (std::vector<IntrinsicInst *>::iterator ib = IItoRemove.begin(), + ie = IItoRemove.end(); ib != ie; ++ib) { + DEBUG(errs() << "Erasing: " << **ib << "\n"); + (*ib)->eraseFromParent(); + } + II1->replaceAllUsesWith(IInew); + II1->eraseFromParent(); + + IItoRemove.clear(); + + // Then, iterate over uses of the second node's createNode intrinsic + for 
(Value::user_iterator i = II2->user_begin(), ie = II2->user_end(); + i != ie; ++i) { + Instruction *VI = dyn_cast<Instruction>(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI); + assert(II && "Use of a node handle outside of a visc intrinsic\n"); + + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createEdge: + { + assert(isOutgoingEdgeIntrinsic(II,II2) && + "Node 2 is expected to have only outgoing edges at this point\n"); + } + break; + case Intrinsic::visc_bind_input: + { + /* The index must be updated to the matching argument position of * + * the fused function, using ShiftMap */ + unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + IntrinsicInst *newII = + createIdenticalBindWithDifferentPort(II, + ShiftMap[dstPos], + false); + newII->insertBefore(II); + IItoRemove.push_back(II); + } + break; + case Intrinsic::visc_bind_output: + { + assert(false && + "Source node of node fusion not expected in bind.out\n"); + } + break; + default: + llvm_unreachable("Unknown use of HPVM createNode handle\n"); + break; + } + } + + // Delete gathered instructions - they are the old bindings of n2 + for (std::vector<IntrinsicInst *>::iterator ib = IItoRemove.begin(), + ie = IItoRemove.end(); ib != ie; ++ib) { + DEBUG(errs() << "Erasing: " << **ib << "\n"); + (*ib)->eraseFromParent(); + } + + II2->replaceAllUsesWith(IInew); + II2->eraseFromParent(); + + return; +} + +/* Performs all operations required at the IR level for fusion of HPVM tensor * + * nodes with intrinsic instructions II1 and II2 * + * - Creates fused node function * + * - Creates createNode intrinsic for it and returns it * + * - Updates parent function: * + * - - adds new intrinsic * + * - - edges and binds consistently use the new intrinsic * + * - Removes old functions */ +IntrinsicInst* FuseHPVMTensorNodes::FuseHPVMTensorNodesStep(IntrinsicInst* II1, + IntrinsicInst* II2, + Module &M) { + // Get the node functions + Function* F1 = 
cast<Function>((II1->getOperand(0))->stripPointerCasts()); + Function* F2 = cast<Function>((II2->getOperand(0))->stripPointerCasts()); + + // Create fused node function + Function *Ffused = createLeafDFNodeFunction(II1, II2, M); + addHint(Ffused, getPreferredTarget(F1)); + + // FIX PARENT DFNode'S FUNCTION + + // Generate createNode Intrinsic for fused node and insert it + Function* CreateNodeF = Intrinsic::getDeclaration(&M, + Intrinsic::visc_createNode); + Constant* Fp = ConstantExpr::getPointerCast(Ffused, + Type::getInt8PtrTy(M.getContext())); + CallInst *CI = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(Fp), + Ffused->getName()+".node"); + IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI); + CreateNodeII->insertBefore(II1); + + // By the assumptions about the fusion pattern structure, all edges that have + // II1 as source will have II2 as destination and vice versa. + // We can simply delete them. + + // All createEdge intrinsics with destination argument = II1 need to use + // CreateNodeII instead. + // Similarly with bind.in + + // All createEdge intrinsics with source argument = II1 need to use + // CreateNodeII instead + // Similarly with bind.out + + // By the assumptions about the fusion pattern structure, the first node + // cannot be the argument of a bind.out + // The second node can be the argument of a bind.in. + // For the bind.in, we need to adjust the destination port. + updateParentNodeFunction(II1, II2, CreateNodeII); + + // Remove old node functions + removeHint(F1, getPreferredTarget(F1)); + removeHint(F2, getPreferredTarget(F2)); + F1->replaceAllUsesWith(UndefValue::get(F1->getType())); + F1->eraseFromParent(); + F2->replaceAllUsesWith(UndefValue::get(F2->getType())); + F2->eraseFromParent(); + + return CreateNodeII; +} + +/* Fuse node sequence described by createNode intrinsics in IIs. * + * Contents of IIs are cleared. 
*/ +void FuseHPVMTensorNodes::FuseHPVMTensorNodeSequence( + std::vector<IntrinsicInst*> &IIs, Module &M) { + for (IntrinsicInst *II : IIs) { + assert((II->getIntrinsicID() == Intrinsic::visc_createNode) && + "Expected createNode intrinsic in fuse intrinsic sequence\n"); + } + + if (IIs.size() < 2) { + errs() << "Warning: Attempted to fuse fewer than 2 nodes\n"; + return; + } + + for (unsigned i = 0; i + 1 < IIs.size(); i++) { + IntrinsicInst *II1 = IIs[i]; + IntrinsicInst *II2 = IIs[i+1]; + IIs[i+1] = FuseHPVMTensorNodesStep(II1, II2, M); + } + IIs.clear(); + return; +} + +/* Run method for FuseHPVMTensorNodes class, simply invokes fusion of all the * + * sequences in member variable FTs. */ +void FuseHPVMTensorNodes::run(Module &M, FusionTargets &FTs) { + for (unsigned i = 0; i < FTs.size(); i++) { + FuseHPVMTensorNodeSequence(FTs[i], M); + } + return; +} + +// Print Fusion Targets. The argument vector contains createNode intrinsics +// of nodes to be fused. +void FuseHPVMTensorNodes::printFusionTargets(FusionTargets &FTs) { + errs() << "Print Fusion Targets\n"; + errs() << "Found " << FTs.size() << " targets\n"; + for (FuseHPVMTensorNodes::FusionTargets::iterator ii = FTs.begin(), + ie = FTs.end(); ii != ie ; ++ii) { + errs() << "Target:\n"; + std::vector<IntrinsicInst*> IIv = *ii; + for (std::vector< IntrinsicInst*>::iterator pi = IIv.begin(), + pe = IIv.end(); pi != pe; ++pi) { + errs() << "\t" << *((*pi)->getOperand(0)) << "\n"; + } + } + return; +} + +void FindFusionTargetsTraversal::codeGen(DFInternalNode *N) { + DEBUG(errs() << "Skipping Internal Node: " + << N->getFuncPointer()->getName() << "\n"); + return; +} + + +void FindFusionTargetsTraversal::codeGen(DFLeafNode *N) { + DEBUG(errs() << "Inside leaf node: " + << N->getFuncPointer()->getName() << "\n"); + + // Skip fusion check if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + + if(!preferredTargetIncludes(N, visc::PROMISE_TARGET)) { + // Only 
fuse if we plan to target PROMISE/Layers API + // The CUDNN backend would be able to generate calls for the fused node, + // but not the other way around + DEBUG(errs() << "No PROMISE hint. Skipping node: " + << N->getFuncPointer()->getName() << "\n"); + return; + } + + visc::Target StartNodePreferredTarget = getPreferredTarget(N); + // Make sure that this is a valid HPVM Tensor Node + // Find first instruction, and check that it is an HPVM tensor intrinsic + IntrinsicInst *II = isValidHPVMTensorNode(N); + + std::vector<IntrinsicInst*> CurrentNodeSequence; + + switch(II->getIntrinsicID()) { + + /*case Intrinsic::visc_node_id: + { // Found beginning of pattern conv-bias-activation-pooling. + + } + break; + */ + + case Intrinsic::visc_tensor_convolution: + { // Found beginning of pattern conv-bias-activation-pooling. + // Look for the rest + CurrentNodeSequence.push_back(N->getInstruction()); + + // Look for bias + DFNode *SN = findNextNodeInSequence(N); + if (!SN) { + return; // Did not find a node sequence starting at N. Simply return. + } + if (getPreferredTarget(SN) != StartNodePreferredTarget) { + return; // Node in sequence has different hint. Simply return. + } + IntrinsicInst *SII = isValidHPVMTensorNode(SN); + if (SII->getIntrinsicID() != Intrinsic::visc_tensor_add) { + // Successor is not the bias operation, thus does not fit the pattern. + return; + } + // Otherwise, push this node to the current sequence + CurrentNodeSequence.push_back(SN->getInstruction()); + + // This is a valid sequence. + // We still need to fuse activation and/or pooling if we find them + // Continue with next node, looking for activation (relu, clipped relu, tanh) + SN = findNextNodeInSequence(SN); + if (!SN) { + // Did not find a node sequence starting at N. Use current sequence. + break; + } + if (getPreferredTarget(SN) != StartNodePreferredTarget) { + break; // Node in sequence has different hint. Use current sequence. 
+ } + SII = isValidHPVMTensorNode(SN); + + if ((SII->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) || + (SII->getIntrinsicID() == Intrinsic::visc_tensor_relu) || + (SII->getIntrinsicID() == Intrinsic::visc_tensor_tanh)) { + // Successor is activation. Push this node to the current sequence. + CurrentNodeSequence.push_back(SN->getInstruction()); + + // Will continue, looking for pooling in the next node + SN = findNextNodeInSequence(SN); + if (!SN) { + break; // No node in sequence. Use currently found sequence. + } + if (getPreferredTarget(SN) != StartNodePreferredTarget) { + break; // Node in sequence has different hint. Use current sequence. + } + SII = isValidHPVMTensorNode(SN); + } //else {} // Look for pooling in this node + + if ((SII->getIntrinsicID() == Intrinsic::visc_tensor_pool_max) || + (SII->getIntrinsicID() == Intrinsic::visc_tensor_pool_min) || + (SII->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean)) { + // Successor is a pool operation. Use currently found sequence. + CurrentNodeSequence.push_back(SN->getInstruction()); + } + } + break; + case Intrinsic::visc_tensor_mul: + { // Found beginning of pattern gemm-bias-activation. Look for the rest + CurrentNodeSequence.push_back(N->getInstruction()); + // Look for bias + DFNode *SN = findNextNodeInSequence(N); + if (!SN) { + return; // Did not find a node sequence starting at N. Simply return. + } + if (getPreferredTarget(SN) != StartNodePreferredTarget) { + return; // Node in sequence has different hint. Simply return. + } + IntrinsicInst *SII = isValidHPVMTensorNode(SN); + if (SII->getIntrinsicID() != Intrinsic::visc_tensor_add) { + // Successor is not the bias operation, thus does not fit the pattern. + return; + } + // Otherwise, push this node to the current sequence + CurrentNodeSequence.push_back(SN->getInstruction()); + // This is a possible fuse target, gemm-add. + // We need to reach the end of the function, where the found sequence + // is added. 
+ + // If the next operation is activation, we fuse that as well. + // Continue with next node, looking for activation (relu, clipped relu, tanh) + SN = findNextNodeInSequence(SN); + if (SN) { + if (getPreferredTarget(SN) == StartNodePreferredTarget) { + SII = isValidHPVMTensorNode(SN); + if ((SII->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu) || + (SII->getIntrinsicID() == Intrinsic::visc_tensor_relu) || + (SII->getIntrinsicID() == Intrinsic::visc_tensor_tanh)) { + // We found activation in sequence. Push in vector as well. + CurrentNodeSequence.push_back(SN->getInstruction()); + } + } + } + } + break; + default: + DEBUG(errs() << "No pattern begins at this node\n"); + break; + } + + if (CurrentNodeSequence.size() != 0) { + // A sequence was found. Store the node sequence in FTs. + FTs.push_back(CurrentNodeSequence); + } + + return; +} + +bool FuseHPVMTensorNodesWrapper::runOnModule(Module &M) { + + errs() << "\nFUSE HPVM TENSOR NODES PASS\n"; + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // Visitor for Fuse Target Detection Graph Traversal + FindFusionTargetsTraversal *FTTVisitor = + new FindFusionTargetsTraversal(M, DFG); + + errs() << "Find targets\n"; + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + FTTVisitor->visit(rootNode); + } + + FuseHPVMTensorNodes::FusionTargets &FTs = FTTVisitor->getFusionTargets(); + + FuseHPVMTensorNodes Fuse; + // Fuse.printFusionTargets(FTs); + + Fuse.run(M, FTs); + + delete FTTVisitor; + + return true; +} + +char FuseHPVMTensorNodesWrapper::ID = 0; +static RegisterPass<FuseHPVMTensorNodesWrapper> X("hpvm-fuse", + "Fuse HPVM Tensor Nodes Pass", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); + +} // End of namespace + diff --git 
a/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.exports b/lib/FuseHPVMTensorNodes/FuseHPVMTensorNodes.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/FuseHPVMTensorNodes/LLVMBuild.txt b/lib/FuseHPVMTensorNodes/LLVMBuild.txt new file mode 100644 index 0000000000..55a6ee5150 --- /dev/null +++ b/lib/FuseHPVMTensorNodes/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/LocalMem/LLVMBuild.txt ------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = FuseHPVMTensorNodes +parent = Transforms diff --git a/lib/GenVISC/CMakeLists.txt b/lib/GenVISC/CMakeLists.txt new file mode 100644 index 0000000000..710e8f2729 --- /dev/null +++ b/lib/GenVISC/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMGenVISC + GenVISC.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/GenVISC/GenVISC.cpp b/lib/GenVISC/GenVISC.cpp new file mode 100644 index 0000000000..a4d9f2c2a4 --- /dev/null +++ b/lib/GenVISC/GenVISC.cpp @@ -0,0 +1,1590 @@ +//=== GenVISC.cpp - Implements "Hierarchical Dataflow Graph Builder Pass" ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "genvisc" +#include "llvm/GenVISC/GenVISC.h" + +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCUtils.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/SupportVISC/VISCUtils.h" + + +using namespace llvm; +using namespace viscUtils; + + +namespace genvisc { + +// Helper Functions + +static Function* transformReturnTypeToStruct(Function* F); +static Type* getReturnTypeFromReturnInst(Function* F); + +// Check if the dummy function call is a __visc__node call +#define IS_VISC_CALL(callName) \ + static bool isVISCCall_##callName(Instruction* I) { \ + if(!isa<CallInst>(I)) \ + return false; \ + CallInst* CI = cast<CallInst>(I); \ + return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \ + } + +static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) { + // Check if the instruction is Call Instruction + assert(isa<CallInst>(I) && "Expecting CallInst"); + CallInst* CI = cast<CallInst>(I); + DEBUG(errs() << "Found call: " << *CI << "\n"); + + // Find the correct intrinsic call + Module* M = CI->getParent()->getParent()->getParent(); + Function* F; + std::vector<Type*> ArgTypes; + std::vector<Value*> args; + if(Intrinsic::isOverloaded(IntrinsicID)) { + // This is an overloaded intrinsic. The types must exactly match. 
Get the + // argument types + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + ArgTypes.push_back(CI->getArgOperand(i)->getType()); + args.push_back(CI->getArgOperand(i)); + } + F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes); + DEBUG(errs() << *F << "\n"); + } + else { // Non-overloaded intrinsic + F = Intrinsic::getDeclaration(M, IntrinsicID); + FunctionType* FTy = F->getFunctionType(); + DEBUG(errs() << *F << "\n"); + + // Create argument list + assert(CI->getNumArgOperands() == FTy->getNumParams() + && "Number of arguments of call do not match with Intrinsic"); + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + Value* V = CI->getArgOperand(i); + // Either the type should match or both should be of pointer type + assert((V->getType() == FTy->getParamType(i) || + (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())) + && "Dummy function call argument does not match with Intrinsic argument!"); + // If the types do not match, then both must be pointer type and pointer + // cast needs to be performed + if(V->getType() != FTy->getParamType(i)) { + V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); + } + args.push_back(V); + } + } + // Insert call instruction + CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? 
"" : CI->getName(), CI); + + DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); + + CI->replaceAllUsesWith(Inst); + // If the previous instruction needs to be erased, insert it in the vector + // Erased + if(Erase != NULL) + Erase->push_back(CI); +} + +IS_VISC_CALL(launch) /* Exists but not required */ +IS_VISC_CALL(edge) /* Exists but not required */ +IS_VISC_CALL(createNodeND) +//IS_VISC_CALL(createNode) +//IS_VISC_CALL(createNode1D) +//IS_VISC_CALL(createNode2D) +//IS_VISC_CALL(createNode3D) +IS_VISC_CALL(bindIn) +IS_VISC_CALL(bindOut) +IS_VISC_CALL(push) +IS_VISC_CALL(pop) +IS_VISC_CALL(getNode) +IS_VISC_CALL(getParentNode) +IS_VISC_CALL(barrier) +IS_VISC_CALL(malloc) +IS_VISC_CALL(return) +IS_VISC_CALL(getNodeInstanceID_x) +IS_VISC_CALL(getNodeInstanceID_y) +IS_VISC_CALL(getNodeInstanceID_z) +IS_VISC_CALL(getNumNodeInstances_x) +IS_VISC_CALL(getNumNodeInstances_y) +IS_VISC_CALL(getNumNodeInstances_z) +// Atomics +IS_VISC_CALL(atomic_cmpxchg) +IS_VISC_CALL(atomic_add) +IS_VISC_CALL(atomic_sub) +IS_VISC_CALL(atomic_xchg) +IS_VISC_CALL(atomic_inc) +IS_VISC_CALL(atomic_dec) +IS_VISC_CALL(atomic_min) +IS_VISC_CALL(atomic_max) +IS_VISC_CALL(atomic_umin) +IS_VISC_CALL(atomic_umax) +IS_VISC_CALL(atomic_and) +IS_VISC_CALL(atomic_or) +IS_VISC_CALL(atomic_xor) +// Misc Fn +IS_VISC_CALL(floor) +IS_VISC_CALL(rsqrt) +IS_VISC_CALL(sqrt) +IS_VISC_CALL(sin) +IS_VISC_CALL(cos) + + +IS_VISC_CALL(init) +IS_VISC_CALL(node) +IS_VISC_CALL(cleanup) +IS_VISC_CALL(wait) +IS_VISC_CALL(trackMemory) +IS_VISC_CALL(untrackMemory) +IS_VISC_CALL(requestMemory) +IS_VISC_CALL(attributes) +IS_VISC_CALL(hint) + +// Tensor Operators +IS_VISC_CALL(tensor_mul) +IS_VISC_CALL(tensor_convolution) +IS_VISC_CALL(tensor_group_convolution) +IS_VISC_CALL(tensor_batchnorm) +IS_VISC_CALL(tensor_add) +IS_VISC_CALL(tensor_pool_max) +IS_VISC_CALL(tensor_pool_min) +IS_VISC_CALL(tensor_pool_mean) +IS_VISC_CALL(tensor_relu) +IS_VISC_CALL(tensor_clipped_relu) +IS_VISC_CALL(tensor_tanh) 
+IS_VISC_CALL(tensor_sigmoid) +IS_VISC_CALL(tensor_softmax) + +IS_VISC_CALL(node_id) + + +// Return the constant integer represented by value V +static unsigned getNumericValue(Value* V) { + assert(isa<ConstantInt>(V) + && "Value indicating the number of arguments should be a constant integer"); + return cast<ConstantInt>(V)->getZExtValue(); +} + + + +// Add <numArgs> to the argument list of Function <F>. The names for these arguments +// should be put in the string array <names>. Ideally the length of <names> +// array should be numArgs. But, even when the length is not numArgs the +// arguments would be added correctly. The names however would not be as +// intuitive. +static Function* addArgs(Function* F, unsigned numArgs, std::string names[]) { + if(numArgs == 0) return F; // Return if no arguments are to be added. + + // Create the argument type list with added argument types + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + // Adding new arguments to the function argument list, would not change the + // function type. 
We need to change the type of this function to reflect the + // added arguments + for(unsigned i = 0; i < numArgs; ++i) { +// ArgTypes.push_back(Type::getInt32Ty(F->getContext())); + ArgTypes.push_back(Type::getInt64Ty(F->getContext())); + } + FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); + + // Change the function type + Function* newF = cloneFunction(F, newFT, false); + + // Add names to the extra arguments to the Function argument list + unsigned numOldArgs = F->getFunctionType()->getNumParams(); + for(Function::arg_iterator ai = newF->arg_begin(), ae = newF->arg_end(); + ai != ae; ++ai) { + if (ai->getArgNo() < numOldArgs) + continue; + ai->setName(names[(ai->getArgNo() - numOldArgs) % names->size()]); + } + + replaceNodeFunctionInIR(*F->getParent(), F, newF); + return newF; +} + + +// Take the __visc__return instruction and generate code for combining the +// values being returned into a struct and returning it. +// The first operand is the number of returned values +static Value* genCodeForReturn(CallInst* CI) { + LLVMContext& Ctx = CI->getContext(); + assert(isVISCCall_return(CI) + && "__visc__return instruction expected!"); + + // Parse the dummy function call here + assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n"); + unsigned numRetVals = getNumericValue(CI->getArgOperand(0)); + + assert(CI->getNumArgOperands()-1 == numRetVals && + "Too few arguments for __visc_return call!\n"); + DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n"); + + std::vector<Type*> ArgTypes; + for(unsigned i=1; i < CI->getNumArgOperands(); i++) { + ArgTypes.push_back(CI->getArgOperand(i)->getType()); + } + Twine outTyName = "struct.out." 
+ CI->getParent()->getParent()->getName(); + StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); + + InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy), + CI->getArgOperand(1), + 0, + "returnStruct", + CI); + DEBUG(errs() << "Code generation for return:\n"); + DEBUG(errs() << *IV << "\n"); + + for(unsigned i=2; i < CI->getNumArgOperands(); i++) { + IV = InsertValueInst::Create(IV, + CI->getArgOperand(i), + i-1, + IV->getName(), + CI); + DEBUG(errs() << *IV << "\n"); + } + + return IV; +} + +// The visc launch intrinsic requires all the input parameters to the kernel +// function be placed in contiguous memory and pointer to that input be passed +// as the second argument to the launch intrinsic. This generates code to bring +// together all the input and dimension arguments in one packed struct +// <InStruct>. First pack the arguments to the kernel function and then add the +// dimension arguments depending on the hierarchy of DFG user wants to generate. +static void marshallArguments(unsigned levels, unsigned numArgs, unsigned argOffset, unsigned numDims, unsigned dimOffset, Value* InStruct, CallInst* CI, Function* KernelF) { + DEBUG(errs() << "Kernel Function = " << KernelF->getName() << "\n"); + + // Get module context and i32 0 constant, as they would be frequently used in + // this function. + LLVMContext& Ctx = CI->getParent()->getContext(); + Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); + + // Find the arguments to be passed to kernel function and pack them in a + // struct. Specifically first generate a GEP instruction to find the correct + // memory location in InStruct and then generate Store instruction to store + // the argument in that location. 
+ Function::arg_iterator ai = KernelF->arg_begin(); + Function::arg_iterator ae = KernelF->arg_end(); + + for(unsigned i = 0; i < numArgs && ai != ae; i++, ai++) { + Value* arg = CI->getArgOperand(i+argOffset); + DEBUG(errs() << "Argument: " << ai->getName() << "\n"); + DEBUG(errs() << "Passing: " << *arg << "\n"); + // Create constant int (i) + Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); + // Get Element pointer instruction + Value* GEPIndices[] = { IntZero, Int_i }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, InStruct, + ArrayRef<Value*>(GEPIndices, 2), + InStruct->getName()+"."+ai->getName(), + CI); + // Store instruction + if(GEP->getType()->getPointerElementType() != arg->getType()) { + // Arguments type might not match with the kernel function definition + // One reason might be because of default argument promotions, where all + // arguments of type float are always promoted to double and types char, + // short int are promoted to int. + // LLVM 4.0 also promotes pointers to i8*. In case both are pointer types, + // we just issue a warning and cast it to appropriate type + if(arg->getType() == Type::getDoubleTy(Ctx)) { + DEBUG(errs() << "Cast from " << *arg->getType() << " To " << + *GEP->getType()->getPointerElementType() << "\n"); + CastInst* CastI = BitCastInst::CreateFPCast(arg, + GEP->getType()->getPointerElementType(), GEP->getName()+".cast", + CI); + new StoreInst(CastI, GEP, CI); + } else if (arg->getType() == Type::getInt32Ty(Ctx)) { + CastInst* CastI = BitCastInst::CreateIntegerCast(arg, + GEP->getType()->getPointerElementType(), false, + GEP->getName()+".cast", CI); + new StoreInst(CastI, GEP, CI); + } else if (arg->getType()->isPointerTy() && GEP->getType()->getPointerElementType()->isPointerTy()) { + errs() << "WARNING: Argument type mismatch between kernel and __visc__node call. 
Forcing cast\n"; + CastInst* CastI = CastInst::CreatePointerCast(arg, + GEP->getType()->getPointerElementType(), GEP->getName()+".cast", + CI); + new StoreInst(CastI, GEP, CI); + } else { + errs() << "Error: Mismatch in argument types\n"; + errs() << "__visc__node call: " << *CI << "\n"; + errs() << "Argument: " << *arg << "\n"; + errs() << "Expected: " << *ai << "\n"; + llvm_unreachable("Mismatch in argument types of kernel function and __visc__node call"); + } + } else { + new StoreInst(arg, GEP, CI); + } + } + + // Based on the hierarchy of the DFG we want, we need to pass the dimension + // for each level. The number of dimensions we need to pass to the launch + // intrinsic is the product of the number of levels and dimesions at each + // level. + // Marshall dim arguments + DEBUG(errs() << *CI << "\n"); + std::string names[] = {"dimX", "dimY", "dimZ"}; + for(unsigned i=0; i< numDims*levels; i++) { + Value* arg = CI->getArgOperand(i+dimOffset); + DEBUG(errs() << "Passing: " << *arg << "\n"); + // Create constant int (i) + Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numArgs); + // Get Element pointer instruction + Value* GEPIndices[] = { IntZero, Int_i }; + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, InStruct, + ArrayRef<Value*>(GEPIndices, 2), + InStruct->getName()+"."+names[i%numDims]+Twine(i/levels), + CI); + // Store instruction + DEBUG(errs() << *arg << " " << *GEP << "\n"); + StoreInst* SI = new StoreInst(arg, GEP, CI); + DEBUG(errs() << *SI << "\n"); + + } +} + +// Returns vector of all wait instructions, waiting on the passed graphID value +static std::vector<CallInst*>* getWaitList(Value* GraphID) { + DEBUG(errs() << "Getting Uses of: " << *GraphID << "\n"); + std::vector<CallInst*>* WaitList = new std::vector<CallInst*>(); + // It must have been loaded from memory somewhere + for(Value::user_iterator ui = GraphID->user_begin(), + ue = GraphID->user_end(); ui!=ue; ++ui) { + if(CallInst* waitI = dyn_cast<CallInst>(*ui)) { 
+ DEBUG(errs() << "Use: " << *waitI << "\n"); + assert(isVISCCall_wait(waitI) + && "GraphID can only be used by __visc__wait call"); + WaitList->push_back(waitI); + } + //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){ + //errs() << "Found PhiNode use of graphID\n"; + //std::vector<CallInst*>* phiWaitList = getWaitList(PN); + //WaitList->insert(WaitList->end(), phiWaitList->begin(), phiWaitList->end()); + //free(phiWaitList); + //} + else { + DEBUG(errs() << *(*ui) << "\n"); + llvm_unreachable("Error: Operation on Graph ID not supported!\n"); + } + } + return WaitList; +} + +// Analyse the attribute call for this function. Add the in and out +// attributes to pointer parameters. +static void handleVISCAttributes(Function* F, CallInst* CI) { + DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n"); + // Parse the dummy function call here + unsigned offset = 0; + // Find number of In pointers + assert(CI->getNumArgOperands() > offset + && "Too few arguments for __visc__attributes call!"); + unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset)); + DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n"); + + for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) { + Value* V = CI->getArgOperand(i); + if(Argument* arg = dyn_cast<Argument>(V)) { + F->addAttribute(1+arg->getArgNo(), Attribute::In); + } + else { + errs() << "Invalid argument to __visc__attribute: " << *V << "\n"; + llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); + } + } + // Find number of Out Pointers + offset += 1 + numInPtrs; + assert(CI->getNumArgOperands() > offset + && "Too few arguments for __visc__attributes call!"); + unsigned numOutPtrs = getNumericValue(CI->getOperand(offset)); + DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n"); + for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) { + Value* V = CI->getArgOperand(i); + if(Argument* arg = dyn_cast<Argument>(V)) { + 
F->addAttribute(1+arg->getArgNo(), Attribute::Out); + } + else { + errs() << "Invalid argument to __visc__attribute: " << *V << "\n"; + llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); + } + } + DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n"); +} + +// Recursively generate internal nodes for all the levels. Node at each level +// will create the appropriate instances of the child node at that level using +// the visc createNode intrinsic, and pass on the remaining dimensions to the +// child node. +static Function* genInternalNode(Function* KernelF, unsigned level, + unsigned numArgs, unsigned numDims, unsigned dimOffset, CallInst* CI) { + // Create new function with the same type + Module* module = KernelF->getParent(); + Function* ChildNodeF; + + // Recursively generate node for lower level + if(level > 1) { + ChildNodeF = genInternalNode(KernelF, level-1, numArgs, numDims, dimOffset, CI); + addHint(ChildNodeF, getPreferredTarget(KernelF)); +// Internal nodes always get a CPU hint. If code geneation for them is not +// needed and can be skipped, this is handled by the accelerator backends +// addHint(ChildNodeF, visc::CPU_TARGET); + } else { + ChildNodeF = KernelF; + } + + // Generate Internal node for current level + Function* InternalF = Function::Create(ChildNodeF->getFunctionType(), + ChildNodeF->getLinkage(), + KernelF->getName()+"Internal_level"+Twine(level), + module); + // Create a basic block in this function + BasicBlock *BB = BasicBlock::Create(InternalF->getContext(), "entry", InternalF); + ReturnInst* RI = ReturnInst::Create(InternalF->getContext(), + UndefValue::get(InternalF->getReturnType()), BB); + // Copy correct attributes + InternalF->setAttributes(ChildNodeF->getAttributes()); + // Loop over the arguments, copying the names of arguments over. 
+ Function::arg_iterator dest_iterator = InternalF->arg_begin(); + for (Function::const_arg_iterator i = ChildNodeF->arg_begin(), e = ChildNodeF->arg_end(); + i != e; ++i, ++dest_iterator) { + DEBUG(errs() << "Copying argument: " << i->getName() << "\n"); + dest_iterator->setName(i->getName()); // Copy the name over... + DEBUG(errs() << "New Argument: " << *dest_iterator << "\n"); + } + + // Add extra dimesnion arguments + std::string dimNames[] = {"dimX", "dimY", "dimZ"}; + DEBUG(errs() << "Adding extra args to function Function:\n" << *InternalF << "\n"); + InternalF = addArgs(InternalF, numDims, dimNames); + // update RI + RI = cast<ReturnInst>(InternalF->getEntryBlock().getTerminator()); + DEBUG(errs() << "After Adding extra args to function Function:\n" << *InternalF << "\n"); + + // Insert createNode intrinsic + // First generate constant expression to bitcast the function pointer to + // internal node to i8* + Value* NodeF = ConstantExpr::getPointerCast(ChildNodeF, Type::getInt8PtrTy(module->getContext())); + + // Use args vectors to get the arguments for visc createNode + // intrinsic + std::vector<Value*> args; + + // Push the i8* pointer to internal node into the args vector + args.push_back(NodeF); + + // Traverse the argument list of internal node function in reverse to get the + // dimesnions to be used to create instances of child node at this level + Function::arg_iterator ai = InternalF->arg_end(); + for(unsigned i=0; i<numDims; i++, ai--); + DEBUG(errs() << "Iterator at: " << *ai << "\n"); + + // ai now points to the first dimension argument to be passed to the + // createNode intrinsic. 
Follow it to push the dim argument into + // the args vector + for(unsigned i=0; i < numDims; i++, ai++) { + args.push_back(&*ai); + } + + // Based on the number of dimensions choose the appropriate visc createNode + // intrinsic + DEBUG(errs() << "Number of dims = " << numDims << "\n"); + Intrinsic::ID createNodeXD; + switch(numDims) { + case 0: + createNodeXD = Intrinsic::visc_createNode; + break; + case 1: + createNodeXD = Intrinsic::visc_createNode1D; + break; + case 2: + createNodeXD = Intrinsic::visc_createNode2D; + break; + case 3: + createNodeXD = Intrinsic::visc_createNode3D; + break; + default: + llvm_unreachable("Invalid number of dimensions!"); + break; + }; + + // Generate the visc createNode intrinsic, using the args vector as parameter + Function* CreateNodeF = Intrinsic::getDeclaration(module, createNodeXD); + DEBUG(errs() << "Function chosen:\n" << *CreateNodeF << "\n"); + CallInst *CreateNodeCall = CallInst::Create(CreateNodeF, args, ChildNodeF->getName()+".node", RI); + DEBUG(errs() << "Generate call: " << *CreateNodeCall << "\n"); + + // Generate Bind intrinsics + Function* bindInputF = Intrinsic::getDeclaration(module, Intrinsic::visc_bind_input); + DEBUG(errs() << "Generating input binding:\n" << *bindInputF << "\n"); + for(unsigned i=0; i < ChildNodeF->getArgumentList().size(); i++) { + std::vector<Value*> bindArgs; + bindArgs.push_back(CreateNodeCall); + bindArgs.push_back(ConstantInt::get(Type::getInt32Ty(module->getContext()), i)); + bindArgs.push_back(ConstantInt::get(Type::getInt32Ty(module->getContext()), i)); + bindArgs.push_back(ConstantInt::getFalse(module->getContext())); + CallInst* bindInputCall = CallInst::Create(bindInputF, bindArgs, "", RI); + DEBUG(errs() << *bindInputCall << "\n"); + } + + // Print the generated internal node for debugging + DEBUG(errs() << "Generated Function:\n" << *InternalF << "\n"); + + return InternalF; +} + +// Change the OpenCL query function calls with visc intrinsics in function F. 
+static void replaceOpenCLCallsWithVISCIntrinsics(Function *F) { + Module* module = F->getParent(); + std::vector<CallInst *> IItoRemove; + + // Get first instruction + inst_iterator i = inst_begin(F); + Instruction *FI = &(*i); + + // Insert getNode intrinsic + Intrinsic::ID getNodeID = Intrinsic::visc_getNode; + Function* GetNodeF = Intrinsic::getDeclaration(module, getNodeID); + std::vector<Value*> args; + CallInst *GetNodeCall = CallInst::Create(GetNodeF, args, F->getName()+".node", FI); + DEBUG(errs() << "Generate getNode intrinsic: " << *GetNodeCall << "\n"); + + // Insert getParentNode intrinsic + Intrinsic::ID getParentNodeID = Intrinsic::visc_getParentNode; + Function* GetParentNodeF = Intrinsic::getDeclaration(module, getParentNodeID); + args.push_back(GetNodeCall); + CallInst *GetParentNodeCall = CallInst::Create(GetParentNodeF, args, F->getName()+".parentNode", FI); + DEBUG(errs() << "Generate getParentNode intrinsic: " << *GetParentNodeCall << "\n"); + + // Iterate through all instructions + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + CallInst *CI; + + // Find OpenCL function calls + if ((CI = dyn_cast<CallInst>(I))) { + if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_global_id")) { + DEBUG(errs() << "Found get_global_id call: " << *CI << "\n"); + CallSite OpenCLCallSite(CI); + Value *arg0 = OpenCLCallSite.getArgument(0); + // Find the intrinsic function to be called + unsigned dim = getNumericValue(arg0); + Intrinsic::ID getNodeInstanceID; + Intrinsic::ID getNumNodeInstancesID; + switch (dim) { + case 0: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_x; + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x; + break; + case 1: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_y; + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y; + break; + case 2: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_z; + getNumNodeInstancesID = 
Intrinsic::visc_getNumNodeInstances_z; + break; + default: + assert(false && "Invalid dimension from valid OpenCL source!"); + break; + } + + + // Creating getNodeInstanceID intrinsic for parent node + ArrayRef<Value *> Args0(GetParentNodeCall); + Function* GetNodeInstanceIDF = Intrinsic::getDeclaration(module, getNodeInstanceID); + CallInst* ParentIDIntrinsic = CallInst::Create(GetNodeInstanceIDF, Args0, "", CI); + + // Creating getNumNodeInstances intrinsic for this node + ArrayRef<Value *> Args1(GetNodeCall); + Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID); + CallInst* InstancesIntrinsic = CallInst::Create(GetNumNodeInstancesF, Args1, "", CI); + // Creating mul instruction + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, + ParentIDIntrinsic, + InstancesIntrinsic, + "", CI); + // Creating getNodeInstanceID intrinsic for this node + CallInst* LocalIDIntrinsic = CallInst::Create(GetNodeInstanceIDF, Args1, "", CI); + // Creating add instruction + BinaryOperator* AddInst = BinaryOperator::Create(Instruction::Add, + MulInst, + LocalIDIntrinsic, + "", CI); + CI->replaceAllUsesWith(AddInst); + IItoRemove.push_back(CI); + } + if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_local_id")) { + DEBUG(errs() << "Found get_local_id call: " << *CI << "\n"); + // Value *arg0 = CI->getOperand(0); + CallSite OpenCLCallSite(CI); + Value *arg0 = OpenCLCallSite.getArgument(0); + + // Argument of the function to be called + ArrayRef<Value *> Args(GetNodeCall); + + // Find the intrinsic function to be called + unsigned dim = getNumericValue(arg0); + Intrinsic::ID getNodeInstanceID; + switch (dim) { + case 0: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_x; + break; + case 1: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_y; + break; + case 2: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_z; + break; + default: + assert(false && "Invalid dimension from valid OpenCL 
source!"); + break; + } + Function* GetNodeInstanceIDF = Intrinsic::getDeclaration(module, getNodeInstanceID); + CallInst* VI = CallInst::Create(GetNodeInstanceIDF, Args, "", CI); + CI->replaceAllUsesWith(VI); + IItoRemove.push_back(CI); + } + if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_group_id")) { + DEBUG(errs() << "Found get_group_id call: " << *CI << "\n"); + // Value *arg0 = CI->getOperand(0); + CallSite OpenCLCallSite(CI); + Value *arg0 = OpenCLCallSite.getArgument(0); + + // Argument of the function to be called + ArrayRef<Value *> Args(GetParentNodeCall); + + // Find the intrinsic function to be called + unsigned dim = getNumericValue(arg0); + Intrinsic::ID getNodeInstanceID; + switch (dim) { + case 0: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_x; + break; + case 1: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_y; + break; + case 2: + getNodeInstanceID = Intrinsic::visc_getNodeInstanceID_z; + break; + default: + assert(false && "Invalid dimension from valid OpenCL source!"); + break; + } + Function* GetNodeInstanceIDF = Intrinsic::getDeclaration(module, getNodeInstanceID); + CallInst* VI = CallInst::Create(GetNodeInstanceIDF, Args, "", CI); + CI->replaceAllUsesWith(VI); + IItoRemove.push_back(CI); + } + if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_global_size")) { + DEBUG(errs() << "Found get_global_size call: " << *CI << "\n"); + CallSite OpenCLCallSite(CI); + Value *arg0 = OpenCLCallSite.getArgument(0); + // Find the intrinsic function to be called + unsigned dim = getNumericValue(arg0); + Intrinsic::ID getNumNodeInstancesID; + switch (dim) { + case 0: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x; + break; + case 1: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y; + break; + case 2: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_z; + break; + default: + assert(false && "Invalid dimension from valid OpenCL source!"); + break; + } + 
+ + // Creating getNumNodeInstances intrinsic for parent node + ArrayRef<Value *> Args0(GetParentNodeCall); + Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID); + CallInst* ParentInstancesIntrinsic = CallInst::Create(GetNumNodeInstancesF, Args0, "", CI); + // Creating getNumNodeInstances intrinsic for this node + ArrayRef<Value *> Args1(GetNodeCall); + CallInst* InstancesIntrinsic = CallInst::Create(GetNumNodeInstancesF, Args1, "", CI); + // Creating mul instruction + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, + ParentInstancesIntrinsic, + InstancesIntrinsic, + "", CI); + CI->replaceAllUsesWith(MulInst); + IItoRemove.push_back(CI); + + } + if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_local_size")) { + DEBUG(errs() << "Found get_local_size call: " << *CI << "\n"); + CallSite OpenCLCallSite(CI); + Value *arg0 = OpenCLCallSite.getArgument(0); + + // Argument of the function to be called + ArrayRef<Value *> Args(GetNodeCall); + + // Find the intrinsic function to be called + unsigned dim = getNumericValue(arg0); + Intrinsic::ID getNumNodeInstancesID; + switch (dim) { + case 0: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x; + break; + case 1: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y; + break; + case 2: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_z; + break; + default: + assert(false && "Invalid dimension from valid OpenCL source!"); + break; + } + Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID); + CallInst* VI = CallInst::Create(GetNumNodeInstancesF, Args, "", CI); + CI->replaceAllUsesWith(VI); + IItoRemove.push_back(CI); + } + if ((CI->getCalledValue()->stripPointerCasts()->getName()).equals("get_num_groups")) { + DEBUG(errs() << "Found get_num_groups call: " << *CI << "\n"); + CallSite OpenCLCallSite(CI); + Value *arg0 = OpenCLCallSite.getArgument(0); + + // Argument of the 
function to be called + ArrayRef<Value *> Args(GetParentNodeCall); + + // Find the intrinsic function to be called + unsigned dim = getNumericValue(arg0); + Intrinsic::ID getNumNodeInstancesID; + switch (dim) { + case 0: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_x; + break; + case 1: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_y; + break; + case 2: + getNumNodeInstancesID = Intrinsic::visc_getNumNodeInstances_z; + break; + default: + assert(false && "Invalid dimension from valid OpenCL source!"); + break; + } + Function* GetNumNodeInstancesF = Intrinsic::getDeclaration(module, getNumNodeInstancesID); + CallInst* VI = CallInst::Create(GetNumNodeInstancesF, Args, "", CI); + CI->replaceAllUsesWith(VI); + IItoRemove.push_back(CI); + } + } + } + + for (std::vector<CallInst *>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) + (*ri)->eraseFromParent(); + +} + + +// Public Functions of GenVISC pass +bool GenVISC::runOnModule(Module &M) { + errs() << "\nGENVISC PASS\n"; + this->M = &M; + + // Load Runtime API Module + SMDiagnostic Err; + + // Insert init context in main + DEBUG(errs() << "Locate __visc__init()\n"); + Function* VI = M.getFunction("__visc__init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); + Instruction* I = cast<Instruction>(*VI->user_begin()); + + // Insert print instruction at visc exit + DEBUG(errs() << "Locate __visc__cleanup()\n"); + Function* VC = M.getFunction("__visc__cleanup"); + assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); + I = cast<Instruction>(*VC->user_begin()); + + DEBUG(errs() << "-------- Searching for launch sites ----------\n"); + + std::vector<Instruction*> toBeErased; + std::vector<Function*> functions; + + for (Module::iterator mi = M.begin(), me = M.end(); mi != me; ++mi) { + Function* f = &*mi; + functions.push_back(f); + } + + // Iterate over all functions in the module + for (unsigned i = 0; i 
< functions.size(); i++) { + Function* f = functions[i]; + DEBUG(errs() << "Function: " << f->getName() << "\n"); + + // List with the required additions in the function's return type + std::vector<Type*> FRetTypes; + + enum mutateTypeCause { + mtc_None, + mtc_BIND, + mtc_RETURN, + mtc_NUM_CAUSES + } bind; + bind = mutateTypeCause::mtc_None; + + // Iterate over all the instructions in this function + for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) { + Instruction* I = &*i; // Grab pointer to Instruction + // If not a call instruction, move to next instruction + if(!isa<CallInst>(I)) + continue; + + CallInst* CI = cast<CallInst>(I); + LLVMContext& Ctx = CI->getContext(); + // If __visc__node call found, generate the test case + + if(isVISCCall_node(I)) { + errs() << "Found visc node call in Function: " << f->getName() << "\n"; + assert(CI->getNumArgOperands() >= 5 + && "__visc__node call should have atleast 5 arguments!"); + generateTest(CI); + // Place this call in the list of instructions to be erased. 
+ toBeErased.push_back(CI); + } + if(isVISCCall_init(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased); + } + if(isVISCCall_cleanup(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased); + } + if(isVISCCall_wait(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased); + } + if(isVISCCall_trackMemory(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased); + } + if(isVISCCall_untrackMemory(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased); + } + if(isVISCCall_requestMemory(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased); + } + if(isVISCCall_hint(I)) { + assert(isa<ConstantInt>(CI->getArgOperand(0)) + && "Argument to hint must be constant integer!"); + ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0)); + + visc::Target t = (visc::Target) hint->getZExtValue(); + addHint(CI->getParent()->getParent(), t); + DEBUG(errs() << "Found visc hint call: " << *CI << "\n"); + toBeErased.push_back(CI); + } + if(isVISCCall_launch(I)) { + Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch); + DEBUG(errs() << *LaunchF << "\n"); + // Get i8* cast to function pointer + Function* graphFunc = cast<Function>(CI->getArgOperand(1)); + graphFunc = transformReturnTypeToStruct(graphFunc); + Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0)); + Value* isStreaming = Op->isZero()? 
ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + + Value* LaunchArgs[] = {F, CI->getArgOperand(2), isStreaming}; + CallInst* LaunchInst = CallInst::Create(LaunchF, + ArrayRef<Value*>(LaunchArgs, 3), + "graphID", CI); + DEBUG(errs() << "Found visc launch call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n"); + CI->replaceAllUsesWith(LaunchInst); + toBeErased.push_back(CI); + } + if(isVISCCall_push(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased); + } + if(isVISCCall_pop(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased); + } + if(isVISCCall_createNodeND(I)) { + assert(CI->getNumArgOperands() > 0 && + "Too few arguments for __visc__createNodeND call"); + unsigned numDims = getNumericValue(CI->getArgOperand(0)); + // We need as meny dimension argments are there are dimensions + assert(CI->getNumArgOperands()-2 == numDims && + "Too few arguments for __visc_createNodeND call!\n"); + + Function* CreateNodeF; + switch (numDims) { + case 0: + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode); + break; + case 1: + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D); + break; + case 2: + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D); + break; + case 3: + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D); + break; + default: + llvm_unreachable("Unsupported number of dimensions\n"); + break; + } + DEBUG(errs() << *CreateNodeF << "\n"); + DEBUG(errs() << *I << "\n"); + DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n"); + + // Get i8* cast to function pointer + Function* graphFunc = cast<Function>(CI->getArgOperand(1)); + graphFunc = transformReturnTypeToStruct(graphFunc); + Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + + CallInst* CreateNodeInst; + switch (numDims) { + case 0: + CreateNodeInst = CallInst::Create(CreateNodeF, + 
ArrayRef<Value*>(F), + graphFunc->getName()+".node", CI); + break; + case 1: + { + assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 2, expected to be i64\n"); + Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)}; + CreateNodeInst = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(CreateNodeArgs, 2), + graphFunc->getName()+".node", CI); + } + break; + case 2: + { + assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 2, expected to be i64\n"); + assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 3, expected to be i64\n"); + Value* CreateNodeArgs[] = {F, + CI->getArgOperand(2), + CI->getArgOperand(3)}; + CreateNodeInst = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(CreateNodeArgs, 3), + graphFunc->getName()+".node", CI); + } + break; + case 3: + { + assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 2, expected to be i64\n"); + assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 3, expected to be i64\n"); + assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) && + "CreateNodeND dimension argument, 4, expected to be i64\n"); + Value* CreateNodeArgs[] = {F, + CI->getArgOperand(2), + CI->getArgOperand(3), + CI->getArgOperand(4)}; + CreateNodeInst = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(CreateNodeArgs, 4), + graphFunc->getName()+".node", CI); + } + break; + default: + llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n"); + break; + } + + DEBUG(errs() << "Found visc createNode call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *CreateNodeInst << "\n"); + CI->replaceAllUsesWith(CreateNodeInst); + toBeErased.push_back(CI); + } + + if(isVISCCall_edge(I)) { + Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); + 
DEBUG(errs() << *EdgeF << "\n"); + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5)); + ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); + Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4), + isStreaming + }; + CallInst* EdgeInst = CallInst::Create(EdgeF, + ArrayRef<Value*>(EdgeArgs, 6), + "output", CI); + DEBUG(errs() << "Found visc edge call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n"); + CI->replaceAllUsesWith(EdgeInst); + toBeErased.push_back(CI); + } + if(isVISCCall_bindIn(I)) { + Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); + DEBUG(errs() << *BindInF << "\n"); + // Check if this is a streaming bind or not + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); + Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming + }; + CallInst* BindInInst = CallInst::Create(BindInF, + ArrayRef<Value*>(BindInArgs, 4), + "", CI); + DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n"); + CI->replaceAllUsesWith(BindInInst); + toBeErased.push_back(CI); + } + if(isVISCCall_bindOut(I)) { + Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); + DEBUG(errs() << *BindOutF << "\n"); + // Check if this is a streaming bind or not + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); + Value* isStreaming = Op->isZero()? 
ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming + }; + CallInst* BindOutInst = CallInst::Create(BindOutF, + ArrayRef<Value*>(BindOutArgs, 4), + "", CI); + DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n"); + DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n"); + + DEBUG(errs() << "Fixing the return type of the function\n"); + // FIXME: What if the child node function has not been visited already. + // i.e., it's return type has not been fixed. + Function* F = I->getParent()->getParent(); + DEBUG(errs() << F->getName() << "\n";); + IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0)); + DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"); + Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); + DEBUG(errs() << ChildF->getName() << "\n";); + int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue(); + int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue(); + StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType()); + + Type* ReturnType = F->getReturnType(); + DEBUG(errs() << *ReturnType << "\n";); + assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) + && "Return type should either be a struct or void type!"); + + FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos)); + assert(((bind == mutateTypeCause::mtc_BIND) || + (bind == mutateTypeCause::mtc_None)) && + "Both bind_out and visc_return detected"); + bind = mutateTypeCause::mtc_BIND; + + CI->replaceAllUsesWith(BindOutInst); + toBeErased.push_back(CI); + } + if(isVISCCall_attributes(I)) { + Function* F = CI->getParent()->getParent(); + handleVISCAttributes(F, CI); + toBeErased.push_back(CI); + } + if (isVISCCall_getNode(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNode, &toBeErased); + } + if (isVISCCall_getParentNode(I)) { + 
ReplaceCallWithIntrinsic(I, Intrinsic::visc_getParentNode, &toBeErased); + } + if (isVISCCall_barrier(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_barrier, &toBeErased); + } + if (isVISCCall_malloc(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased); + } + if (isVISCCall_return(I)) { + DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n"); + // The operands to this call are the values to be returned by the node + Value* ReturnVal = genCodeForReturn(CI); + DEBUG(errs() << *ReturnVal << "\n"); + Type* ReturnType = ReturnVal->getType(); + assert(isa<StructType>(ReturnType) + && "Return type should be a struct type!"); + + assert(((bind == mutateTypeCause::mtc_RETURN) || + (bind == mutateTypeCause::mtc_None)) && + "Both bind_out and visc_return detected"); + + if (bind == mutateTypeCause::mtc_None) { + // If this is None, this is the first __visc__return + // instruction we have come upon. Place the return type of the + // function in the return type vector + bind = mutateTypeCause::mtc_RETURN; + StructType* ReturnStructTy = cast<StructType>(ReturnType); + for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++) + FRetTypes.push_back(ReturnStructTy->getElementType(i)); + } else { // bind == mutateTypeCause::mtc_RETURN + // This is not the first __visc__return + // instruction we have come upon. 
+ // Check that the return types are the same + assert((ReturnType == FRetTypes[0]) + && "Multiple returns with mismatching types"); + } + + ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal); + DEBUG(errs() << "Found visc return call: " << *CI << "\n"); + Instruction* oldReturn = CI->getParent()->getTerminator(); + assert(isa<ReturnInst>(oldReturn) + && "Expecting a return to be the terminator of this BB!"); + DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n"); + DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n"); + //CI->replaceAllUsesWith(RetInst); + toBeErased.push_back(CI); + ReplaceInstWithInst(oldReturn, RetInst); + DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n"); + + } + + if (isVISCCall_getNodeInstanceID_x(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased); + } + if (isVISCCall_getNodeInstanceID_y(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased); + } + if (isVISCCall_getNodeInstanceID_z(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased); + } + if (isVISCCall_getNumNodeInstances_x(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased); + } + if (isVISCCall_getNumNodeInstances_y(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased); + } + if (isVISCCall_getNumNodeInstances_z(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased); + } + if (isVISCCall_atomic_cmpxchg(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg, &toBeErased); + } + if (isVISCCall_atomic_add(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased); + } + if (isVISCCall_atomic_sub(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_sub, &toBeErased); + } + if (isVISCCall_atomic_xchg(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xchg, 
&toBeErased); + } + if (isVISCCall_atomic_inc(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_inc, &toBeErased); + } + if (isVISCCall_atomic_dec(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_dec, &toBeErased); + } + if (isVISCCall_atomic_min(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_min, &toBeErased); + } + if (isVISCCall_atomic_umin(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_umin, &toBeErased); + } + if (isVISCCall_atomic_max(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_max, &toBeErased); + } + if (isVISCCall_atomic_umax(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_umax, &toBeErased); + } + if (isVISCCall_atomic_and(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_and, &toBeErased); + } + if (isVISCCall_atomic_or(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_or, &toBeErased); + } + if (isVISCCall_atomic_xor(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased); + } + if (isVISCCall_floor(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::floor, &toBeErased); + } + if (isVISCCall_rsqrt(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f, &toBeErased); + } + if (isVISCCall_sqrt(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::sqrt, &toBeErased); + } + if (isVISCCall_sin(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased); + } + if (isVISCCall_cos(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased); + } + if (isVISCCall_tensor_convolution(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_convolution, &toBeErased); + } + if (isVISCCall_tensor_group_convolution(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_group_convolution, &toBeErased); + } + if (isVISCCall_tensor_add(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_add, &toBeErased); + } + if (isVISCCall_tensor_batchnorm(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_batchnorm, &toBeErased); + } + if 
(isVISCCall_tensor_mul(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_mul, &toBeErased); + } + if (isVISCCall_tensor_pool_max(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_pool_max, &toBeErased); + } + if (isVISCCall_tensor_pool_min(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_pool_min, &toBeErased); + } + if (isVISCCall_tensor_pool_mean(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_pool_mean, &toBeErased); + } + if (isVISCCall_tensor_relu(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_relu, &toBeErased); + } + if (isVISCCall_tensor_tanh(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_tanh, &toBeErased); + } + if (isVISCCall_tensor_clipped_relu(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_clipped_relu, &toBeErased); + } + if (isVISCCall_tensor_softmax(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_softmax, &toBeErased); + } + + // New Intrinsic to set Node ID + if (isVISCCall_node_id(I)) { + ReplaceCallWithIntrinsic(I, Intrinsic::visc_node_id, &toBeErased); + } + + } + + // Erase the __visc__node calls + DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n"); + for(auto I: toBeErased) { + DEBUG(errs() << *I << "\n"); + } + while(!toBeErased.empty()) { + Instruction* I = toBeErased.back(); + DEBUG(errs() << "\tErasing " << *I << "\n"); + I->eraseFromParent(); + toBeErased.pop_back(); + } + + if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) { + DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); + // Argument type list. 
+ std::vector<Type*> FArgTypes; + for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); + ai != ae; ++ai) { + FArgTypes.push_back(ai->getType()); + } + + // Find new return type of function + Type* NewReturnTy; + if(bind == mutateTypeCause::mtc_BIND) { + + std::vector<Type*> TyList; + for (unsigned i = 0; i < FRetTypes.size(); i++) + TyList.push_back(FRetTypes[i]); + + NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true); + } + else { + NewReturnTy = getReturnTypeFromReturnInst(f); + assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); + } + + FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); + + // Change the function type + Function* newF = cloneFunction(f, FTy, false); + DEBUG(errs() << *newF << "\n"); + + if (bind == mutateTypeCause::mtc_BIND) { + // This is certainly an internal node, and hence just one BB with one + // return terminator instruction. Change return statement + ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator()); + ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy)); + ReplaceInstWithInst(RI, newRI); + } + if (bind == mutateTypeCause::mtc_RETURN) { + // Nothing + } + replaceNodeFunctionInIR(*f->getParent(), f, newF); + DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); + } + + + } + return false; //TODO: What does returning "false" mean? +} + +// Generate Code for declaring a constant string [L x i8] and return a pointer +// to the start of it. 
+Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { + Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true); + Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true, + GlobalValue::InternalLinkage, SConstant, Name); + Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); + Value* GEPArgs[] = {Zero, Zero}; + GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal, + ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); + return SPtr; +} + + + +// Generate the test case using the dummy __visc__node call CI +// First parse the arguments to find the kernel function, num of levels, +// dimensions, arguments, inputs and outputs. Pass this information to genKernel +// and genInternalNode functions to generate the test case. +void GenVISC::generateTest(CallInst* CI) { + // Parse the dummy function call here + LLVMContext& Ctx = CI->getParent()->getContext(); + + unsigned offset = 1; // argument at offset 1 is the number of dimensions + // Find number of arguments + assert(CI->getNumArgOperands() > offset + && "Too few arguments for __visc__node call!"); + unsigned levels = getNumericValue(CI->getArgOperand(offset)); + errs() << "\tNum of levels = " << levels << "\n"; + + // Find number of dimensions + offset += 1; + assert(CI->getNumArgOperands() > offset + && "Too few arguments for __visc__node call!"); + unsigned numDims = getNumericValue(CI->getOperand(offset)); + errs() << "\tNum of dimensions = " << numDims << "\n"; + + + // Find number of arguments + offset += numDims*levels + 1; // skip the dimesnions + assert(CI->getNumArgOperands() > offset + && "Too few arguments for __visc__node call!"); + unsigned numArgs = getNumericValue(CI->getArgOperand(offset)); + errs() << "\tNum of kernel arguments = " << numArgs << "\n"; + + // Find number of outputs + offset += numArgs + 1; // skip the kernel arguments + assert(CI->getNumArgOperands() > offset + && "Too few arguments for 
__visc__node call!"); + unsigned numOutputs = getNumericValue(CI->getArgOperand(offset)); + errs() << "\tNum of kernel outputs = " << numOutputs << "\n"; + + // Find return struct type + assert(numOutputs == 0 && "Not handled case where number of outputs is non-zero!"); + // This is always zero. One should look at the number of struct elements of + // kernel function + StructType* RetTy = StructType::create(Ctx, None, "rtype"); + + Function* KernelF = genKernel(cast<Function>(CI->getArgOperand(0)->stripPointerCasts()), CI, RetTy); + genHost(CI, KernelF, levels, numDims, numArgs, numOutputs, RetTy); +} + + + +// Make all the required changes to the kernel function. This would include +// changing the function signature by adding any extra arguments required. +// Changing the return type. Changing all the OpenCL query intrinsics with the +// visc intrinsics. +Function* GenVISC::genKernel(Function* KernelF, CallInst* CI, StructType* RetTy) { + // Make changes to kernel here + DEBUG(errs() << "Modifying Node Function: " << KernelF->getName() << "\n"); + + // Find dummy __visc__attribute call in this function and add visc attributes + // in/out to pointer arguments + for (inst_iterator i = inst_begin(KernelF), e = inst_end(KernelF); i != e; ++i) { + Instruction *I = &(*i); + if(isVISCCall_attributes(I)) { + handleVISCAttributes(KernelF, cast<CallInst>(I)); + //I->eraseFromParent(); + break; + } + } + + // Change arguments and types + // Create the argument type list with added argument types + //Function::ArgumentListType& argList = KernelF->getArgumentList(); + std::vector<Type*> argTypes; + // Insert an i32 argument after every pointer argument. However adding an + // argument does not change the attribute list of function and so the + // arguments need to be shifted accordingly. 
+ //bool shiftAttr = false; + for(Function::arg_iterator ai = KernelF->arg_begin(), ae = KernelF->arg_end(); + ai != ae; ++ai) { + + argTypes.push_back(ai->getType()); + if(ai->getType()->isPointerTy()) { + // If it is a pointer argument, add an i64 type next + argTypes.push_back(Type::getInt64Ty(KernelF->getContext())); + } + + } + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + FunctionType* newFT = FunctionType::get(RetTy, argTypes, KernelF->isVarArg()); + + // Change the function type + SmallVector<ReturnInst*, 8> Returns; + Function* newKernelF = cloneFunction(KernelF, newFT, true, &Returns); + DEBUG(errs() << *newKernelF << "\n"); + + // Replace ret void instruction with ret %RetTy undef + for(auto RI: Returns) { + DEBUG(errs() << "Found return inst: "<< *RI << "\n"); + ReturnInst* newRI = ReturnInst::Create(KernelF->getContext(), UndefValue::get(RetTy)); + ReplaceInstWithInst(RI, newRI); + } + + replaceNodeFunctionInIR(*KernelF->getParent(), KernelF, newKernelF); + // Replace opencl query intrinsics with visc query intrinsics + replaceOpenCLCallsWithVISCIntrinsics(newKernelF); + return newKernelF; +} + +// Generate the code replacing the dummy __visc__node call with visc launch +// intrinsic and also generate the internal nodes required at each level +// depending on the hierarchy of DFG needed. This would also involve marhsalling +// all the input arguments to the kernel function in memory. Replaceing CI with +// launch intrinsic, and all the dummy __visc__wait calls with the visc wait +// intrinsic. 
+void GenVISC::genHost(CallInst* CI, Function* KernelF, unsigned levels, unsigned numDims, unsigned numArgs, unsigned numOutputs, StructType* RetTy) { + // Make host code changes here + DEBUG(errs() << "Modifying Host code for __visc__node call site: " << *CI << "\n"); + DEBUG(errs() << "Kernel Function: " << KernelF->getName() << "\n"); + LLVMContext& Ctx = CI->getParent()->getContext(); + + // Create a root funtion which has this as internal node + Function* Root = genInternalNode(KernelF, levels, numArgs, numDims, 3, CI); + + // Add hint to compile root for CPU. This is always true. + addHint(Root, visc::CPU_TARGET); + + // Generate argument struct type (All arguments followed by return struct type) + std::vector<Type*> ArgList; + unsigned offset = numDims*levels + 2 + 1 + 1; + for(Function::arg_iterator ai=KernelF->arg_begin(), ae=KernelF->arg_end(); + ai!=ae; ai++) { + Type* Ty = ai->getType(); + ArgList.push_back(Ty); + } + // Add the dimesnions arguments + for(unsigned i=0; i<numDims*levels; i++) { +// ArgList.push_back(Type::getInt32Ty(Ctx)); + ArgList.push_back(Type::getInt64Ty(Ctx)); + } + ArgList.push_back(RetTy); + StructType* ArgStructTy = StructType::create(ArgList, "struct.arg", true); + DEBUG(errs() << *ArgStructTy << "\n"); + + // Insert alloca inst for this argument struct type + AllocaInst* AI = new AllocaInst(ArgStructTy, "in.addr", CI); + + // Marshall all input arguments and dimension arguments into argument struct + // type + marshallArguments(levels, numArgs, offset, numDims, 3, AI, CI, KernelF); + + // Type cast argument struct to i8* + CastInst* BI = BitCastInst::CreatePointerCast(AI, + Type::getInt8PtrTy(Ctx), + "args", + CI); + + // Bitcast Root function to i8* + Constant* Root_i8ptr = ConstantExpr::getPointerCast(Root, Type::getInt8PtrTy(Ctx)); + // Replace CI with launch call to a Root function + Function* LaunchF = Intrinsic::getDeclaration(Root->getParent(), Intrinsic::visc_launch); + DEBUG(errs() << "Intrinsic for launch: " << 
*LaunchF << "\n"); + + Value* LaunchInstArgs[] = {Root_i8ptr, BI, ConstantInt::getFalse(Ctx)}; + CallInst* LaunchInst = CallInst::Create(LaunchF, + ArrayRef<Value*>(LaunchInstArgs,3), + "graph"+Root->getName(), CI); + //ReplaceInstWithInst(LI, LaunchInst); + + DEBUG(errs() << *LaunchInst << "\n"); + // Add wait call + // Replace all wait instructions with visc wait intrinsic instructions + Function* WaitF = Intrinsic::getDeclaration(Root->getParent(), Intrinsic::visc_wait); + std::vector<CallInst*>* WaitList = getWaitList(CI); + for(unsigned i=0; i < WaitList->size(); ++i) { + CallInst* waitCall = WaitList->at(i); + CallInst* waitInst = CallInst::Create(WaitF, + ArrayRef<Value*>(LaunchInst), + "", CI); + DEBUG(errs() << *waitInst << "\n"); + waitCall->eraseFromParent(); + } + + // Get result (optional) +} + +static Function* transformReturnTypeToStruct(Function* F) { + // Currently only works for void return types + DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n"); + + if (isa<StructType>(F->getReturnType())) { + DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n"); + return F; + } + + assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n"); + + // Create the argument type list with added argument types + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + + StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true); + FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); + + SmallVector<ReturnInst*, 8> Returns; + Function* newF = cloneFunction(F, FTy, false, &Returns); + // Replace ret void instruction with ret %RetTy undef + for(auto RI: Returns) { + DEBUG(errs() << "Found return inst: "<< *RI << "\n"); + ReturnInst* newRI = ReturnInst::Create(newF->getContext(), 
UndefValue::get(RetTy)); + ReplaceInstWithInst(RI, newRI); + } + + replaceNodeFunctionInIR(*F->getParent(), F, newF); + return newF; +} + +static Type* getReturnTypeFromReturnInst(Function* F) { + for(BasicBlock &BB: *F) { + if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) { + DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n"); + return RI->getReturnValue()->getType(); + } + } +} + + +char genvisc::GenVISC::ID = 0; +static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false); + +} // End of namespace genvisc + + diff --git a/lib/GenVISC/GenVISC.exports b/lib/GenVISC/GenVISC.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/GenVISC/LLVMBuild.txt b/lib/GenVISC/LLVMBuild.txt new file mode 100644 index 0000000000..9266b2c597 --- /dev/null +++ b/lib/GenVISC/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/GenVISC/LLVMBuild.txt -------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = GenVISC +parent = Transforms diff --git a/lib/InPlaceDFG/CMakeLists.txt b/lib/InPlaceDFG/CMakeLists.txt new file mode 100644 index 0000000000..d034ae4976 --- /dev/null +++ b/lib/InPlaceDFG/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMInPlaceDFGAnalysis + InPlaceDFGAnalysis.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/InPlaceDFG/InPlaceDFGAnalysis.cpp b/lib/InPlaceDFG/InPlaceDFGAnalysis.cpp new file mode 100644 index 0000000000..a45e6e3645 --- /dev/null +++ b/lib/InPlaceDFG/InPlaceDFGAnalysis.cpp @@ -0,0 +1,318 @@ +//===------------------------ InPlaceDFGAnalysis.cpp ----------------------===// +// +// +// +// The LLVM Compiler Infrastructure +// +// +// +// This file is distributed under the University of Illinois Open Source +// +// License. See LICENSE.TXT for details. 
+// +// +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "InPlaceDFGAnalysis" + +#include "llvm/Support/SourceMgr.h" +#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h" +#include "llvm/SupportVISC/DFG2LLVM.h" + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +namespace inplacedfg { + +/*** Classes ***/ + +// Visitor for Code generation traversal (tree traversal for now) +class AT_OCL : public CodeGenTraversal { + +private: + //Member variables + InPlaceDFGAnalysis::InPlaceDFGParameter *IPP; + + //Functions + + // Virtual Functions + void init() {} + void initRuntimeAPI() {} + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + // Constructor + AT_OCL(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP) : + CodeGenTraversal(_M, _DFG), IPP(&_IPP) { + + } +}; + +/*** Helper Functions ***/ + +// Create an entry in InPlaceDFGParameter IPP for node N if it does not exist +void initializeDFNodeIPPVector(DFNode *N, + InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) { + if (IPP.find(N) == IPP.end()) { + // Find the node function + Function *F = N->getFuncPointer(); + // Create a vector initialized to true + IPP[N] = std::vector<bool>(F->getFunctionType()->getNumParams(), true); + // Every scalar parameter is not eligible for an in place operation + for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Argument *Arg = &*ai; + if (!(Arg->getType()->isPointerTy())) { + IPP[N][Arg->getArgNo()] = false; + } + } + } +} + +// Update InPlaceDFGParameter IPP based on the outgoing edges of node N +void checkOutputEdgeSources(DFNode* N, InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) { + // Iterate over all outgoing edges. + for (DFNode::outdfedge_iterator oe_it = N->outdfedge_begin(), + oeEnd = N->outdfedge_end(); oe_it != oeEnd; ++oe_it) { + // For every edge, look through all subsequent edges. 
+ // If, for some edge, have the same source position, then the output is not + // eligible for an in place operation + DFNode::outdfedge_iterator oeNext = oe_it; + + unsigned srcPos = (*oe_it)->getSourcePosition(); + for (++oeNext ; oeNext != oeEnd; ++oeNext) { + DFEdge *E = *oeNext; + // If we find edges with the same source position + if (E->getSourcePosition() == srcPos) { + // Find node and destination positions, and make the respective + // arguments not eligible for in place operations + DFNode *DN = (*oe_it)->getDestDF(); + unsigned dstPos = (*oe_it)->getDestPosition(); + initializeDFNodeIPPVector(DN, IPP); + IPP[DN][dstPos] = false; + + DN = E->getDestDF(); + dstPos = E->getDestPosition(); + initializeDFNodeIPPVector(DN, IPP); + IPP[DN][dstPos] = false; + } + } + } + +} + +// Print InPlaceDFGParameter DFG +void printInPlaceDFGParameter(InPlaceDFGAnalysis::InPlaceDFGParameter &IPP) { + + errs() << "----------------------------\n"; + errs() << "In Place DFG Analysis Result\n"; + for (InPlaceDFGAnalysis::InPlaceDFGParameter::iterator it = IPP.begin(), + ie = IPP.end(); it != ie; ++it) { + DFNode *N = it->first; + if (N->isDummyNode()) { + errs() << "(dummy) "; + } + errs() << "Node: " << N->getFuncPointer()->getName() << "\n\tMap:"; + for (unsigned i = 0; i < it->second.size() ; i++) { + errs() << " " << (it->second[i] ? 
"true " : "false"); + } + errs() << "\n"; + } + errs() << "----------------------------\n"; + +} + +/*** Methods ***/ + +/*** Methods of InPlaceDFGAnalysisWrapper ***/ +const InPlaceDFGAnalysis::InPlaceDFGParameter + &InPlaceDFGAnalysisWrapper::getIPP() { + return IPP; +} + +void InPlaceDFGAnalysisWrapper::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); +} + +bool InPlaceDFGAnalysisWrapper::runOnModule(Module &M) { + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + InPlaceDFGAnalysis IPA; + IPA.run(M, DFG, IPP); + + return false; +} + +/*** Methods of InPlaceDFGAnalysis ***/ +void InPlaceDFGAnalysis::run(Module &M, BuildDFG &DFG, InPlaceDFGParameter &IPP) { + + errs() << "\nIN PLACE ANALYSIS PASS\n"; + + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Visitor for Graph Traversal + AT_OCL *ATVisitor = new AT_OCL(M, DFG, IPP); + + // Iterate over all the DFGs + // Analyse the edges for parameters that are valid to be used in place + for (auto rootNode: Roots) { + // Initiate analysis for root DFNode + IPP[rootNode] = + std::vector<bool>(rootNode->getFuncPointer()->getFunctionType()->getNumParams(), + false); + // All inputs from the host are marked as not in place - the host does not + // expect these values to change unpredictably. + ATVisitor->visit(rootNode); + // The analysis is optimistic, assuming everything is eligible for in place + // unless found otherwise. This happens if two edges have the same source + // node and port. Then the targets of these edges are not eligible for + // in place operations. 
+ + /* TODO: + To enforce that host values are marked as false, we need a second pass over + the graph that does the following: + - push root in a vector: + - while the vector is not empty: + - - pop the last node, N: + - - if internal node: + - - - find its entry dummy node (easily done by isDummyNode() and iterating + over outedges of dummy, exit ummy has not outedges) + - - - for all successors of the dymmy node, + - - - - if the edge carries a false annotated value (if the source position + is marked as false in the N vector), mark as such at the successor + and push successor in the vector + - - if leaf node + - - - return + + For now, this is not required, as there is only one level in the graph. + Thus I simply iterate over outedges of entry dummy ,and mark targets as + false, at the end of codegen for leaf node. + */ + + } + +// printInPlaceDFGParameter(IPP); + + delete ATVisitor; + return; +} + +/*** Methods of AT_OCL ***/ + +/*** Analysis of internal node ***/ +void AT_OCL::codeGen(DFInternalNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); + +// errs() << "Internal node: before initializing this node's vector\n"; +// printInPlaceDFGParameter(*IPP); + // If a vector has not been created for this node, + // create one initialized to true + initializeDFNodeIPPVector(N, *IPP); + +// errs() << "Internal node: after initializing this node's vector, before its check edges\n"; +// printInPlaceDFGParameter(*IPP); + // Check its output edges, for same destination node and port. 
+ + checkOutputEdgeSources(N, *IPP); +// errs() << "Internal node: after this node's check edges\n"; +// printInPlaceDFGParameter(*IPP); +} + +/*** Analysis of leaf node ***/ +void AT_OCL::codeGen(DFLeafNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); + + if(N->isAllocationNode()) { + DEBUG(errs() << "Analysis does not expect allocation node\n"); + assert(false && "Allocation nodes not expected in approxHPVM"); + return; + } + +// errs() << "Leaf node: before initializing this node's vector\n"; +// printInPlaceDFGParameter(*IPP); + // If a vector has not been created for this node, + // create one initialized to true + initializeDFNodeIPPVector(N, *IPP); +// errs() << "Leaf node: after initializing this node's vector\n"; +// printInPlaceDFGParameter(*IPP); + + // Skip internal checks if it is a dummy node + if(!(N->isDummyNode())) { + // Check that all outputs should be results of HPVM tensor intrinsics + if (N->getOutputType()->isEmptyTy()) + return; + + unsigned numOutputs = N->getOutputType()->getNumElements(); + + Function *F = N->getFuncPointer(); + BasicBlock& BB = F->getEntryBlock(); + assert(isa<ReturnInst>(BB.getTerminator()) + && "ApproxHPVM Nodes have a single BB\n"); + ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator()); + // Find the returned struct + Value* rval = RI->getReturnValue(); + + // Look through all outputs to make sure they are insertvalue instructions + std::vector<Value*> OutValues(numOutputs, NULL); + for (unsigned i = 0; i < numOutputs; i++) { + if(InsertValueInst* IV = dyn_cast<InsertValueInst>(rval)) { + DEBUG(errs() << "Value at out edge" << numOutputs-1-i << ": " << *rval << "\n"); + OutValues[numOutputs-1-i] = IV->getOperand(1); + rval = IV->getOperand(0); + } + else { + DEBUG(errs() << "Unexpected value at out edge: " << *rval << "\n"); + llvm_unreachable("Expecting InsertValue instruction. 
Error!"); + } + } + + // Look through all outputs + for (unsigned i = 0; i < numOutputs; i++) { + if (OutValues[i]->getType()->isPointerTy()) { + // All returned pointers should be results of HPVM tensor intrinsics + CallInst *CI = dyn_cast<CallInst>(OutValues[i]); + assert(CI && + "Expected return value to be the result of a call instruction\n"); + assert ((CI->getCalledFunction()->getName()).startswith("llvm.visc.tensor") && + "Node output must be the result of an HPVM tensor intrinsic\n"); + } + } + + } + +// errs() << "Leaf node: before this node's check edges\n"; +// printInPlaceDFGParameter(*IPP); + // Check its output edges, for same destination node and port. + checkOutputEdgeSources(N, *IPP); +// errs() << "Leaf node: after this node's check edges\n"; +// printInPlaceDFGParameter(*IPP); + + // Mark host values as false, explained in run + if((N->isDummyNode())) { + for (DFNode::outdfedge_iterator oe_it = N->outdfedge_begin(), + oeEnd = N->outdfedge_end(); oe_it != oeEnd; ++oe_it) { + DFNode *DN = (*oe_it)->getDestDF(); + unsigned dstPos = (*oe_it)->getDestPosition(); + initializeDFNodeIPPVector(DN, *IPP); + (*IPP)[DN][dstPos] = false; + } + } +// errs() << "Leaf node: after this (dummy) node's update host values\n"; +// printInPlaceDFGParameter(*IPP); + +} + +char InPlaceDFGAnalysisWrapper::ID = 0; +static RegisterPass<InPlaceDFGAnalysisWrapper> X("inplace", + "Pass to identifying candidates for in place operations in HPVM", + false /* does not modify the CFG */, + false /* not transformation, just analysis */); + +} // End of namespace + diff --git a/lib/InPlaceDFG/InPlaceDFGAnalysis.exports b/lib/InPlaceDFG/InPlaceDFGAnalysis.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/InPlaceDFG/LLVMBuild.txt b/lib/InPlaceDFG/LLVMBuild.txt new file mode 100644 index 0000000000..b78912b9c4 --- /dev/null +++ b/lib/InPlaceDFG/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/LocalMem/LLVMBuild.txt ------------------*- Conf -*--===; +; +; 
The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = InPlaceDFGAnalysis +parent = Transforms diff --git a/lib/InlineTensorCalls/CMakeLists.txt b/lib/InlineTensorCalls/CMakeLists.txt new file mode 100644 index 0000000000..51f321884f --- /dev/null +++ b/lib/InlineTensorCalls/CMakeLists.txt @@ -0,0 +1,13 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( InlineTensorCalls + InlineTensorCalls.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) + diff --git a/lib/InlineTensorCalls/InlineTensorCalls.cpp b/lib/InlineTensorCalls/InlineTensorCalls.cpp new file mode 100644 index 0000000000..d31434341c --- /dev/null +++ b/lib/InlineTensorCalls/InlineTensorCalls.cpp @@ -0,0 +1,77 @@ +//=== InlineApproxHPVMCalls.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#define ENABLE_ASSERTS + +#define DEBUG_TYPE "INLINE_APPROXHPVM_CALLS" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#include "llvm/IR/InstIterator.h" + +#include "llvm/Support/raw_ostream.h" +#include "llvm/Analysis/InlineCost.h" + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/CallSite.h" +#include "llvm/ADT/SetVector.h" +#include <sstream> + +using namespace llvm; + + +namespace { + + struct InlineApproxHPVMCalls : public ModulePass { + static char ID; // Pass identification, replacement for typeid + InlineApproxHPVMCalls() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + + InlineFunctionInfo IFI; + SmallSetVector<CallSite, 16> Calls; + bool Changed = false; + SmallVector<Function *, 16> InlinedFunctions; + for (Function &F : M){ + if (!F.isDeclaration() && F.getName().startswith("tensor") ) { + //errs()<<"Function = "<<*&F<<"\n"; + Calls.clear(); + + for (User *U : F.users()) + if (auto CS = CallSite(U)) + if (CS.getCalledFunction() == &F) + Calls.insert(CS); + + for (CallSite CS : Calls) + // FIXME: We really shouldn't be able to fail to inline at this point! + // We should do something to log or check the inline failures here. 
+ Changed |= InlineFunction(CS, IFI); + + } + } + + return true; + } + + }; + + +} // End of namespace + +char InlineApproxHPVMCalls::ID = 0; +static RegisterPass<InlineApproxHPVMCalls> X("inline-tensor-calls", + "Inline ApproxHPVM tensor library function calls (CPU version)", + true /* modifies the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/lib/InlineTensorCalls/InlineTensorCalls.exports b/lib/InlineTensorCalls/InlineTensorCalls.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/InlineTensorCalls/LLVMBuild.txt b/lib/InlineTensorCalls/LLVMBuild.txt new file mode 100644 index 0000000000..8fff7891af --- /dev/null +++ b/lib/InlineTensorCalls/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = InlineTensorCalls +parent = Transforms + diff --git a/lib/InsertApproxInfo/CMakeLists.txt b/lib/InsertApproxInfo/CMakeLists.txt new file mode 100644 index 0000000000..2b6d41bd70 --- /dev/null +++ b/lib/InsertApproxInfo/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( InsertApproxInfo + InsertApproxInfo.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/InsertApproxInfo/InsertApproxInfo.cpp b/lib/InsertApproxInfo/InsertApproxInfo.cpp new file mode 100644 index 0000000000..bde4ef8907 --- /dev/null +++ b/lib/InsertApproxInfo/InsertApproxInfo.cpp @@ -0,0 +1,498 @@ +//===------------------------ InPlaceDFGAnalysis.cpp ----------------------===// +// +// +// +// The LLVM Compiler Infrastructure +// +// +// +// This file is distributed under the University of Illinois Open Source +// +// License. See LICENSE.TXT for details. 
+// +// +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "InsertApproxInfo" + +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/IR/InstrTypes.h" +#include <unordered_map> +#include <dirent.h> +#include <stdio.h> +#include <sstream> +#include <fstream> + + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; +using namespace inplacedfg; + + +namespace { + +static cl::opt<std::string> dir_name("results-dir", cl::desc(" Name of directory with Autotuner results ")); + + +struct ApproxMetrics{ + std::string op_name; + std::string category; + unsigned int rank; // rank given by autotuner + double approx_level; + // Relative L-norm metrics + double relative_l1; + double relative_l2; + double relative_linf; + // Mean L-norm metrics + double mean_l1; + double mean_l2; + double mean_linf; +}; + + + +struct InsertApproxInfoWrapperPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + InsertApproxInfoWrapperPass() : ModulePass(ID) {} + +public: + // Functions + bool runOnModule(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const; +}; + + +// Visitor for Code generation traversal (tree traversal for now) +class InsertApproxInfo : public CodeGenTraversal { + +private: + // Virtual Functions + void init() {} + void initRuntimeAPI() {} + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + void loadTrainedApproxMetrics(std::string dir_path); + void loadMetricsFromFile(std::string dir_path, std::string file_path, std::string category); + void loadMetricsFromDir(std::string dir_path, std::string category); + void readApproxValues(const std::string line, ApproxMetrics* approx_metrics); + void initIntrinsicNames(); + void initGlobalStrings(); + + // private data + std::unordered_map<std::string, 
std::string> intrinsics_map; + std::unordered_map<std::string, std::vector<ApproxMetrics*>> operation_metrics; + GlobalVariable* rank_str; + GlobalVariable* category_str; + GlobalVariable* mean_l1_str; + GlobalVariable* mean_l2_str; + GlobalVariable* mean_linf_str; + GlobalVariable* rel_l1_str; + GlobalVariable* rel_l2_str; + GlobalVariable* rel_linf_str; + + + // Tracks the id of the tensor op processed + unsigned int currentID; + +public: + // Constructor + InsertApproxInfo(Module &_M, BuildDFG &_DFG); + + //void run(Module &M, BuildDFG &DFG); + void run(std::string dir_path); + +}; + + + +void InsertApproxInfo::initIntrinsicNames(){ + + intrinsics_map["llvm.visc.tensor.convolution"] = "tensorConv"; + intrinsics_map["llvm.visc.tensor.mul"] = "tensorGemm"; + intrinsics_map["llvm.visc.tensor.add"] = "tensorAdd"; + intrinsics_map["llvm.visc.tensor.pool.max"] = "tensorPooling"; + intrinsics_map["llvm.visc.tensor.tanh"] = "tensorTanh"; +} + + +void InsertApproxInfo::initGlobalStrings(){ + + /**** Creating global constant strings for each approximation metric type *******/ + + std::string rank_string = "rank"; + Constant* stringConst = ConstantDataArray::getString(M.getContext(), StringRef(rank_string.c_str()), true); + rank_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + + std::string category_string = "category"; + stringConst = ConstantDataArray::getString(M.getContext(), StringRef(category_string.c_str()), true); + category_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + + // Mean l-norm metrics + std::string metric_string = "mean_l1"; + stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true); + mean_l1_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + + metric_string = "mean_l2"; + stringConst = ConstantDataArray::getString(M.getContext(), 
StringRef(metric_string.c_str()), true); + mean_l2_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + + metric_string = "mean_linf"; + stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true); + mean_linf_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + + // Relative l-norm metrics + metric_string = "rel_l1"; + stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true); + rel_l1_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + + metric_string = "rel_l2"; + stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true); + rel_l2_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + + metric_string = "rel_linf"; + stringConst = ConstantDataArray::getString(M.getContext(), StringRef(metric_string.c_str()), true); + rel_linf_str = new GlobalVariable(M, stringConst->getType(), true, + GlobalValue::ExternalLinkage, stringConst, ""); + +} + + +InsertApproxInfo::InsertApproxInfo(Module &_M, BuildDFG &_DFG) : + CodeGenTraversal(_M, _DFG){ + + currentID = 1; + + initIntrinsicNames(); + initGlobalStrings(); +} + + +void InsertApproxInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); +} + + +bool InsertApproxInfoWrapperPass::runOnModule(Module &M) { + + std::string dir_path = dir_name.getValue(); + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + InsertApproxInfo IApprox(M, DFG); + IApprox.run(dir_path); + + return false; +} + + +void InsertApproxInfo::readApproxValues(const std::string line, ApproxMetrics* approx_metrics){ + + std::istringstream in(line); + std::string op_name; + + float approx_level; 
+ + float mean_l1; + float mean_l2; + float mean_linf; + + float relative_l1; + float relative_l2; + float relative_linf; + + in >> op_name; + in >> approx_level; + + in >> mean_l1; + in >> mean_l2; + in >> mean_linf; + + in >> relative_l1; + in >> relative_l2; + in >> relative_linf; + + printf("\n *** op_name = %s \n", op_name.c_str()); + printf("approx_level = %f \n", approx_level); + printf("relative_l1 = %f \n", relative_l1); + printf("relative_l2 = %f \n", relative_l2); + printf("relative_linf = %f \n", relative_linf); + printf("mean_l1 = %f \n", mean_l1); + printf("mean_l2 = %f \n", mean_l2); + printf("mean_linf = %f \n", mean_linf); + + approx_metrics->op_name = op_name; + approx_metrics->approx_level = approx_level; + approx_metrics->mean_l1 = mean_l1; + approx_metrics->mean_l2 = mean_l2; + approx_metrics->mean_linf = mean_linf; + approx_metrics->relative_l1 = relative_l1; + approx_metrics->relative_l2 = relative_l2; + approx_metrics->relative_linf = relative_linf; + +} + + +unsigned int getFileRank(std::string file_path){ + + char file_name[100]; // Assuming no file names greater than 100 chars + strcpy(file_name, file_path.c_str()); + + char* pch = strtok(file_name, "_"); + char* last_pch; + while(pch != NULL){ + last_pch = pch; + pch = strtok(NULL, "_"); + } + + printf("NOTE: ****** last_pch = %s \n", last_pch); + + size_t sz; + int rank = std::stoi(last_pch, &sz); + + return rank + 1; // NOTE: Adding 1 to start ranks with '1' +} + + + +void InsertApproxInfo::loadMetricsFromFile(std::string dir_path, std::string file_path, std::string category){ + + std::string full_path = dir_path + "/" + file_path; + printf("full_path = %s \n", full_path.c_str()); + std::ifstream infile(full_path.c_str()); + std::string line; + + unsigned int it_count = 0; + while(std::getline(infile, line)){ + + // Skip first line with confidence information + if(it_count > 0){ + std::vector<float> approx_values; + ApproxMetrics* approx_metrics = new ApproxMetrics; + 
readApproxValues(line, approx_metrics); + + approx_metrics->category = category; + unsigned int rank = getFileRank(file_path); + approx_metrics->rank = rank; + + std::string unique_op_name = approx_metrics->op_name + std::to_string(it_count); + operation_metrics[unique_op_name].push_back(approx_metrics); + printf("\n ** unique_op_name = %s \n", unique_op_name.c_str()); + } + + it_count++; + } + +} + + + +void InsertApproxInfo::loadMetricsFromDir(std::string dir_path, std::string category){ + + struct dirent* entry; + dir_path = dir_path + category; + + DIR* dir = opendir(dir_path.c_str()); + if(dir == NULL){ + printf("Directory %s not found . Aborting ... \n\n ", dir_path.c_str()); + abort(); + } + + while((entry = readdir(dir)) != NULL){ + printf("f_name = %s \n", entry->d_name); + std::string f_name = entry->d_name; + loadMetricsFromFile(dir_path, f_name, category); + } +} + + + +void InsertApproxInfo::loadTrainedApproxMetrics(std::string dir_path){ + + std::string root_path = dir_path + "/high_confidence/"; + loadMetricsFromDir(root_path, "linear"); + loadMetricsFromDir(root_path, "log"); + loadMetricsFromDir(root_path, "quad"); +} + + +/*** Methods of InPlaceDFGAnalysis ***/ +void InsertApproxInfo::run(std::string dir_path) { + + loadTrainedApproxMetrics(dir_path); + + errs() << "\n NOTE: ApproxInfo INSERTION TRANSFORM \n"; + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Iterate over all the DFGs + // Analyse the edges for parameters that are valid to be used in place + for (auto rootNode: Roots) { + //ATVisitor->visit(rootNode); + this->visit(rootNode); + } + + //delete ATVisitor; + return; +} + +/*** Analysis of internal node ***/ +void InsertApproxInfo::codeGen(DFInternalNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); +} + +/*** Analysis of leaf node ***/ +void InsertApproxInfo::codeGen(DFLeafNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); + + // Skip code 
generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Abort code generation if it is an allocation node + if(N->isAllocationNode()) { + assert(false && "Allocation Node not expected in ApproxHPVM"); + return; + } + + Function *F = N->getFuncPointer(); + Module* M = F->getParent(); + std::vector<IntrinsicInst *> IItoRemove; + + + /**** Adding operand bundles for each tensor operation in the HPVM DFG Leaf Node ****/ + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + errs()<<*I<<"\n"; + + + if (BuildDFG::isViscIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") + && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + + std::string intrinsic_id = std::string(II->getCalledFunction()->getName().data()); + std::string runtime_func_name = intrinsics_map[intrinsic_id]; + std::string unique_name = runtime_func_name + std::to_string(currentID); + printf("\n ---- unique_name = %s \n ", unique_name.c_str()); + std::vector<ApproxMetrics*> approx_metrics; + if(operation_metrics.find(unique_name) != operation_metrics.end()){ + approx_metrics = operation_metrics[unique_name]; + } + else{ + errs()<<"Intrinsic Name NOT found in the map - Unexpected Error. Aborting ... 
\n\n"; + abort(); + } + + + unsigned int num_configs = approx_metrics.size(); + std::vector<OperandBundleDef> conf_bundles; + for(unsigned int i = 0; i < num_configs; i++){ + std::vector<Value*> norm_vals; + + norm_vals.push_back(category_str); + Constant* categoryConst = ConstantDataArray::getString(M->getContext(), StringRef(approx_metrics[i]->category.c_str()), true); + GlobalVariable* category_value = new GlobalVariable(*M, categoryConst->getType(), true, + GlobalValue::ExternalLinkage, categoryConst, ""); + norm_vals.push_back(category_value); + + norm_vals.push_back(rank_str); + Constant* constIntVal = ConstantInt::get(Type::getInt32Ty(M->getContext()), approx_metrics[i]->rank); + norm_vals.push_back(constIntVal); + + // Adding mean l-norm metrics + norm_vals.push_back(mean_l1_str); + Constant* constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->mean_l1); + norm_vals.push_back(constFPVal); + + norm_vals.push_back(mean_l2_str); + constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->mean_l2); + norm_vals.push_back(constFPVal); + + norm_vals.push_back(mean_linf_str); + constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->mean_linf); + norm_vals.push_back(constFPVal); + + // Relative l-norm Metrics + norm_vals.push_back(rel_l1_str); + constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->relative_l1); + norm_vals.push_back(constFPVal); + + norm_vals.push_back(rel_l2_str); + constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->relative_l2); + norm_vals.push_back(constFPVal); + + norm_vals.push_back(rel_linf_str); + constFPVal = ConstantFP::get(Type::getDoubleTy(M->getContext()), approx_metrics[i]->relative_linf); + norm_vals.push_back(constFPVal); + + + std::string config_name = "config_" + std::to_string(i+1); + OperandBundleDef norm_bundle(config_name, norm_vals); + + conf_bundles.push_back(norm_bundle); + } + 
+ ArrayRef<OperandBundleDef> bundle_arr(conf_bundles); + + /*** Creating new Intrinsic call with Operand Bundles attached **/ + Function* calledFunction = II->getCalledFunction(); + unsigned num_args = II->getNumArgOperands(); + std::vector<Value*> args; + for(unsigned i = 0; i < num_args; i++){ + Value* argValue = II->getArgOperand(i); + args.push_back(argValue); + } + + CallInst* CI = CallInst::Create(calledFunction, + args, bundle_arr, "", II); + + errs()<<"NOTE: New CallInst = "<<*CI<<"\n"; + + II->replaceAllUsesWith(CI); + // Mark to remove at the end + IItoRemove.push_back(II); + + // Increment counter of op processed + currentID++; + } + } + + + for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + errs() << "Erasing: " << **ri << "\n"; + (*ri)->eraseFromParent(); + } + + +} + +char InsertApproxInfoWrapperPass::ID = 0; +static RegisterPass<InsertApproxInfoWrapperPass> X("insert-approxinfo", + "Pass to add approximation information (l-norm metrics) in the ApproxHPVM DFG", + false /* does not modify the CFG */, + false /* not transformation, just analysis */); + + + + + +} // End of namespace + diff --git a/lib/InsertApproxInfo/LLVMBuild.txt b/lib/InsertApproxInfo/LLVMBuild.txt new file mode 100644 index 0000000000..e9cf5afd4a --- /dev/null +++ b/lib/InsertApproxInfo/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/LocalMem/LLVMBuild.txt ------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = InsertApproxInfo +parent = Transforms diff --git a/lib/LocalMem/CMakeLists.txt b/lib/LocalMem/CMakeLists.txt new file mode 100644 index 0000000000..fa91332594 --- /dev/null +++ b/lib/LocalMem/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMLocalMem + LocalMem.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/LocalMem/LLVMBuild.txt b/lib/LocalMem/LLVMBuild.txt new file mode 100644 index 0000000000..629f9caaa9 --- /dev/null +++ b/lib/LocalMem/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/LocalMem/LLVMBuild.txt ------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = LocalMem +parent = Transforms diff --git a/lib/LocalMem/LocalMem.cpp b/lib/LocalMem/LocalMem.cpp new file mode 100644 index 0000000000..896c3f382a --- /dev/null +++ b/lib/LocalMem/LocalMem.cpp @@ -0,0 +1,224 @@ +//===-------------------------- LocalMem.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "LocalMem" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Constant.h" +#include "llvm/SupportVISC/DFG2LLVM.h" + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +namespace { +// Helper Functions + +static AllocationNodeProperty* isAllocationNode(DFLeafNode* N); + +// LocalMem - The first implementation. +struct LocalMem : public ModulePass { + static char ID; // Pass identification, replacement for typeid + LocalMem() : ModulePass(ID) {} + +private: + // Member variables + + // Functions + +public: + bool runOnModule(Module &M); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addPreserved<BuildDFG>(); + } +}; + +// Visitor for Code generation traversal (tree traversal for now) +class AT_OCL : public CodeGenTraversal { + +private: + //Member variables + + //Functions + + // Virtual Functions + void init() {} + void initRuntimeAPI() {} + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + // Constructor + AT_OCL(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) { + //init(); + //initRuntimeAPI(); + } + +}; + +bool LocalMem::runOnModule(Module &M) { + errs() << "\nLOCALMEM PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + //DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // 
BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + + // Visitor for Code Generation Graph Traversal + AT_OCL *ATVisitor = new AT_OCL(M, DFG); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + ATVisitor->visit(rootNode); + // Go ahead and replace the launch intrinsic with pthread call, otherwise return now. + // TODO: Later on, we might like to do this in a separate pass, which would + // allow us the flexibility to switch between complete static code generation + // for DFG or having a customized runtime+scheduler + } + + delete ATVisitor; + return true; +} + +void AT_OCL::codeGen(DFInternalNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); +} + +// Code generation for leaf nodes +void AT_OCL::codeGen(DFLeafNode* N) { + DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n"); + // Skip code generation if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Check and mark as allocation node + AllocationNodeProperty* ANP = isAllocationNode(N); + if(ANP != NULL) { + // set Properties of the allocation node + N->setProperty(DFNode::Allocation, ANP); + AllocationNodeProperty* anp = (AllocationNodeProperty*) N->getProperty(DFNode::Allocation); + AllocationNodeProperty::AllocationListType AL = anp->getAllocationList(); + DEBUG(errs() << "Total allocations = " << AL.size() << "\n"); + for(auto P: AL) { + DEBUG(errs() << " EdgePort: " << P.first->getDestPosition()); + DEBUG(errs() << " Size: " << *P.second << "\n"); + } + + } +} + +// Return pointer to property if this leaf node matches the conditions for being an allocation +// node. +// Conditions +// 1. No incoming memory pointer. No in/out attribute on a pointer argument +// 2. Uses visc malloc intrinsic to allocate memory +// 3. Sends it out +// 2. 
(TODO:) Whether the allocated pointer escapes the parent node +AllocationNodeProperty* isAllocationNode(DFLeafNode* N) { + // Allocation node must be free from side-effects + if(N->hasSideEffects()) + return NULL; + + // Allocation node must have some outgoing edges + if(N->getOutputType()->isEmptyTy()) + return NULL; + + Function* F = N->getFuncPointer(); + + // Allocation node must use visc malloc intrinsic + bool usesVISCMalloc = false; + for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) { + Instruction* I = &*i; + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) { + if(II->getIntrinsicID() == Intrinsic::visc_malloc) { + usesVISCMalloc = true; + break; + } + } + } + if(!usesVISCMalloc) + return NULL; + + // TODO: Check if allocated pointer leaves parent node + + // This is an allocation node + AllocationNodeProperty* ANP = new AllocationNodeProperty(); + // Find the return statement. + // FIXME: For now, assuming their is just one BB. Terminator instruction of + // this BB is a return statement. The value returned is what we need + BasicBlock& BB = F->getEntryBlock(); + assert(isa<ReturnInst>(BB.getTerminator()) + && "Currently we do not handle the case where Allocation Node has multiple BB"); + ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator()); + // Find the returned struct + Value* val = RI->getReturnValue(); + std::vector<Value*> OutValues(6, NULL); + unsigned numOutputs = N->getOutputType()->getNumElements(); + for(unsigned i = 0; i < numOutputs; i++) { + if(InsertValueInst* IV = dyn_cast<InsertValueInst>(val)) { + DEBUG(errs() << "Value at out edge" << numOutputs-1-i << ": " << *val << "\n"); + OutValues[numOutputs-1-i] = IV->getOperand(1); + val = IV->getOperand(0); + } + else { + DEBUG(errs() << "Unexpected value at out edge: " << *val << "\n"); + llvm_unreachable("Expecting InsertValue instruction. 
Error!"); + } + } + // OutValues vector contains all the values that will go out + // Assume that the Allocation node only sends the pointers and their sizes + // forward + unsigned i=0; + while(i < numOutputs) { + assert(OutValues[i]->getType()->isPointerTy() + && "Expected outgoing edge to be of pointer type"); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(OutValues[i])) { + if(II->getIntrinsicID() == Intrinsic::visc_malloc) { + // Sanity check: Size passed to malloc intrinsic is same as the value + // going into the next outgoing edge + DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n"); + DEBUG(errs() << "Out edge value: " << *OutValues[i+1] << "\n"); + assert(II->getArgOperand(0) == OutValues[i+1] + && "Sanity Check Failed: VISC Malloc size argument != next outgoing edge"); + ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0)); + i = i+2; + continue; + } + } + llvm_unreachable("Expecting visc malloc intrinsic instruction!"); + } + return ANP; +} + +} // End of namespace + +char LocalMem::ID = 0; +static RegisterPass<LocalMem> X("localmem", + "Pass to identifying nodes amenable to local memory allocation", + false /* does not modify the CFG */, + true /* transformation, not just analysis */); + diff --git a/lib/LocalMem/LocalMem.exports b/lib/LocalMem/LocalMem.exports new file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/MergeDFN/CMakeLists.txt b/lib/MergeDFN/CMakeLists.txt new file mode 100644 index 0000000000..30e7330d0c --- /dev/null +++ b/lib/MergeDFN/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( LLVMMergeDFN + MergeDFN.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) diff --git a/lib/MergeDFN/LLVMBuild.txt b/lib/MergeDFN/LLVMBuild.txt new file mode 100644 index 0000000000..099486e6c3 --- /dev/null +++ b/lib/MergeDFN/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Transforms/MergeDFN/LLVMBuild.txt ------------------*- 
Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
; http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;

[component_0]
type = Library
name = MergeDFN
parent = Transforms
diff --git a/lib/MergeDFN/MergeDFN.cpp b/lib/MergeDFN/MergeDFN.cpp
new file mode 100644
index 0000000000..35e70e35ce
--- /dev/null
+++ b/lib/MergeDFN/MergeDFN.cpp
@@ -0,0 +1,2338 @@
//=== MergeDFN.cpp ===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#define ENABLE_ASSERTS

#define DEBUG_TYPE "MergeDFN"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/IR/Attributes.h"
#include "llvm/SupportVISC/VISCTimer.h"
#include "llvm/SupportVISC/DFG2LLVM.h"

#include <sstream>

using namespace llvm;
using namespace builddfg;
using namespace dfg2llvm;

// Names of the two DFG nodes to merge, selected on the command line via
// -mc1/-mc2 (matched against each node's function name by testNodeName).
static cl::opt<std::string> Node1Name("mc1",
                                      cl::init(""),
                                      cl::Hidden,
                                      cl::desc("First node candidate for merge"));
static cl::opt<std::string> Node2Name("mc2",
                                      cl::init(""),
                                      cl::Hidden,
                                      cl::desc("Second node candidate for merge"));

namespace {
// Helper class declarations

// Helper function declarations

// MergeDFN
// Legacy-PM module pass driving the merge of two named dataflow nodes.
struct MergeDFN : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  MergeDFN() : ModulePass(ID) {}

private:
  // Member variables

  // Functions

public:
  // Functions
  bool runOnModule(Module &M);

  void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<BuildDFG>();
    AU.addPreserved<BuildDFG>(); //TODO: Check
  }
};

// Visitor for Code generation traversal (tree traversal for now)
// Walks the DFG bottom-up looking for the two nodes named by -mc1/-mc2;
// once both are found, isValidMergeChoise()/mergeDFN() perform the merge.
class MergeTraversal : public DFNodeVisitor {

private:
  //Member variables
  Module &M;
  BuildDFG &DFG;
  DFNode *n1; // node matching Node1Name, NULL until found
  DFNode *n2; // node matching Node2Name, NULL until found
  DFNode *m;  // resulting merged node

  //Functions
  void testNodeName(DFNode* N);

public:
  // Constructor
  MergeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {
    n1 = NULL;
    n2 = NULL;
    m = NULL;
  }

  virtual void visit(DFInternalNode* N) {
    // Follows a bottom-up approach to find the nodes.
    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
        e = N->getChildGraph()->end(); i != e; ++i) {
      DFNode* child = *i;
      child->applyDFNodeVisitor(*this);
    }

    DEBUG(errs() << "Testing Node (I) - " << N->getFuncPointer()->getName() << "\n");
    testNodeName(N);
    DEBUG(errs() << "\tDONE - " << "\n");
  }

  virtual void visit(DFLeafNode* N) {
    DEBUG(errs() << "Testing Node (L) - " << N->getFuncPointer()->getName() << "\n");
    testNodeName(N);
    DEBUG(errs() << "DONE" << "\n");
  }

  bool isValidMergeChoise();

  void mergeDFN();

};

//===--------------------- Helper Function Declarations --------------===//
// NOTE(review): declarations only; the definitions live later in this file
// (outside this excerpt) and must match these signatures exactly.
IntrinsicInst* createIdenticalCreateNodeWithDifferentFunction(Function* F,
                                                              IntrinsicInst* II);
IntrinsicInst* createNewCreateNodeBasedOn(Function* F, IntrinsicInst* II,
                                          Function* Fargs);
IntrinsicInst* createIdenticalCreateEdgeWithDifferentPort(IntrinsicInst* II,
                                                          unsigned port, bool srcport);
IntrinsicInst* createIdenticalCreateEdgeWithDifferentNode(IntrinsicInst* II,
                                                          IntrinsicInst* IInode, bool srcnode);
IntrinsicInst* createIdenticalBindInputWithDifferentNode(IntrinsicInst* II,
                                                         IntrinsicInst* IInode);
IntrinsicInst* createIdenticalBindInputWithDifferentPort(IntrinsicInst* II,
                                                         unsigned port,
                                                         bool srcport);
IntrinsicInst* createIdenticalBindOutputWithDifferentNode(IntrinsicInst* II,
                                                          IntrinsicInst* IInode);
IntrinsicInst* createIdenticalBindOutputWithDifferentPort(IntrinsicInst* II,
                                                          unsigned port,
                                                          bool srcport);
void updateUsesOfCreateNodeInParent(IntrinsicInst* II1,
                                    IntrinsicInst* II2,
                                    IntrinsicInst* IInew,
                                    std::map<unsigned, unsigned> InMap,
                                    std::map<unsigned, unsigned> OutMap,
                                    std::vector<DFEdge*> &DFEdgestoRemove,
                                    BuildDFG &DFG);
bool isIncomingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn);
bool isOutgoingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn);
bool hasSuccessor(DFNode* N1, DFNode* N2);
bool hasImmediateSuccesssor(DFNode* N1, DFNode* N2);
bool checkEdgesType(DFNode* N1, DFNode* N2);
static void createArgTypes(DFNode* N1, DFNode* N2, std::vector<Type*> &ArgTypes);
void getChildNodeSplit(DFInternalNode* N,
                       std::vector<DFNode*> &AllocationNodes,
                       std::vector<DFNode*> &ComputeNodes);
void buildInputAndOutputMaps(DFNode* N1, DFNode* N2,
                             std::map<unsigned, unsigned> &N1InMap,
                             std::map<unsigned, unsigned> &N1OutMap,
                             std::map<unsigned, unsigned> &N2InMap,
                             std::map<unsigned, unsigned> &N2OutMap);
void buildInAndOutEdgeMaps(DFNode* N1, DFNode* N2,
                           std::map<unsigned, unsigned> &N1InMap,
                           std::map<unsigned, unsigned> &N1OutMap,
                           std::map<unsigned, unsigned> &N2InMap,
                           std::map<unsigned, unsigned> &N2OutMap);
static StructType* createReturnType(DFNode* N1, DFNode* N2);
static void copyAttrList(DFNode* N1, DFNode* N2, Function* F);
static void copyArgumentNames(DFNode* N1, DFNode* N2, Function* F);
void createShiftMap(Function* F, unsigned fromPos, unsigned num,
                    unsigned shift, std::vector<unsigned> &ShiftMap);
void shiftArgs(Function* F, unsigned fromPos, unsigned num,
               unsigned shift, std::vector<unsigned> &ShiftMap);
static Function* createEmptyDFNodeFunction(DFNode* N1, DFNode* N2, Module &M);
static Function* createLeafDFNodeFunction(DFNode* N1, DFNode* N2, Module &M,
                                          unsigned numOfN1AllocArgs,
                                          unsigned posOfN1AllocArgs,
                                          unsigned numOfN2AllocArgs);
static Function* createInternalDFNodeFunction(DFNode* N1, DFNode* N1an,
                                              DFNode* N1cn, DFNode* N2,
                                              DFNode* N2an, DFNode* N2cn,
                                              Function* Fa, Function* Fc,
                                              Module &M,
                                              unsigned numOfN1AllocArgs,
                                              unsigned posOfN1AllocArgs,
                                              unsigned numOfN2AllocArgs);
void createNewInternalNodeIntrinsics(DFNode* N1,
                                     DFNode* N2,
                                     DFNode* N1a,
                                     DFNode* N1c,
                                     DFNode* N2a,
                                     DFNode* N2c,
                                     IntrinsicInst* IInewa,
                                     IntrinsicInst* IInewc,
                                     Function* Fa, //FIXME: Unused
                                     Function* Fc,
                                     std::vector<IntrinsicInst*> &IntrinsicInstructionsToAdd,
                                     std::vector<IntrinsicInst*> &IntermediateInstructions);
Argument*
getFunctionArgumentAt(Function* F, unsigned i); +void removeUnnecessaryInputEdges(DFNode* N, DFNode* N1, + unsigned numOfN1AllocArgs, + unsigned numOfN2AllocArgs); +void deleteInternalNodeFunction(DFNode* N, BuildDFG &DFG); +static visc::Target getPreferredTarget(Function* F); +static void addHint(Function* F, visc::Target T); +static void removeHint(Function* F, visc::Target T); +std::string getTestModuleName(Module &M); + + +//===--------------------- MergeDFN Outlined Functions --------------===// +void MergeTraversal::testNodeName(DFNode* N) { + + if (N->getFuncPointer()->getName() == Node1Name) { + //if (N->getFuncPointer()->getName() == "WrapperDilate_cloned") { + //if (N->getFuncPointer()->getName() == "WrapperDilate_cloned_WrapperErode_cloned") { + //if (N->getFuncPointer()->getName() == "WrapperHorizontal_cloned") { + //if (N->getFuncPointer()->getName() == "WrapperHorizontal_cloned_WrapperVertical_cloned") { + n1 = N; + } + else if (N->getFuncPointer()->getName() == Node2Name) { + //else if (N->getFuncPointer()->getName() == "WrapperErode_cloned") { + //else if (N->getFuncPointer()->getName() == "WrapperLincomb_cloned") { + //else if (N->getFuncPointer()->getName() == "WrapperVertical_cloned") { + //else if (N->getFuncPointer()->getName() == "WrapperSquareRoot_cloned") { + n2 = N; + } +} + +//TODO: use the topological sort to find merge candidates +bool MergeTraversal::isValidMergeChoise() { + if (!n1 || !n2) + return false; + + // Check that n1 and n2 have the same + // - parent + // - hint + // - number and size of dimensions of dynamic instances + bool valid = (n1->getParent() == n2->getParent()) && + (getPreferredTarget(n1->getFuncPointer()) == + getPreferredTarget(n2->getFuncPointer())) && + (n1->getNumOfDim() == n2->getNumOfDim()); + + std::vector<Value*> n1dim = n1->getDimLimits(); + std::vector<Value*> n2dim = n2->getDimLimits(); + for (unsigned i = 0; (i < n1dim.size()) && valid ; i++) + valid = valid && (n1dim[i] == n2dim[i]); + + // n1 should 
not be a successor of n2 + valid = valid && !hasSuccessor(n2, n1); + // n2 should not be a successor of n1, other than an immediate successor + valid = valid && (!hasSuccessor(n1, n2) || hasImmediateSuccesssor(n1, n2)); + + if (!valid) + return false; + + // Now, check specifically for one or two level cases + if (dyn_cast<DFLeafNode>(n1) && dyn_cast<DFLeafNode>(n1)) { + // For now, only allow one to one edges between them + return checkEdgesType(n1, n2); + } + + //At this point, at least one of them is internal node + + DFInternalNode* n1cast = dyn_cast<DFInternalNode>(n1); + DFInternalNode* n2cast = dyn_cast<DFInternalNode>(n2); + + // If not both of them are internal nodes, it is not a valid merging + if (!n1cast || !n2cast) + return false; + + // At this point, they are both internal nodes + // For internal nodes, we only allow one-to-one edges + valid = valid && checkEdgesType(n1->getParent(), n2->getParent()); // FIXME: n1 and n2? + + // We need to check that they have the appropriate internal structure + std::vector<DFNode*> AllocNodes1, ComputeNodes1, AllocNodes2, ComputeNodes2; + getChildNodeSplit(n1cast, AllocNodes1, ComputeNodes1); + getChildNodeSplit(n2cast, AllocNodes2, ComputeNodes2); + + // There must be at most a single allocation node within each one of them + // There must be exactly one compute node within each one of them + valid = valid && + (AllocNodes1.size() <= 1) && + (AllocNodes2.size() <= 1) && + (ComputeNodes1.size() == 1) && + (ComputeNodes2.size() == 1); + + // The compute nodes must be leaf nodes with the same number and size of + // dimensions of dynamic instances + DFLeafNode* n1cn = dyn_cast<DFLeafNode>(ComputeNodes1[0]); + DFLeafNode* n2cn = dyn_cast<DFLeafNode>(ComputeNodes2[0]); + if (!n1cn || !n2cn) + return false; + + errs() << "Checking if the sizes are same for internal nodes\n"; + + valid = valid && (n1cn->getNumOfDim() == n2cn->getNumOfDim()); + std::vector<Value*> n1cndim = n1cn->getDimLimits(); + std::vector<Value*> 
n2cndim = n2cn->getDimLimits(); + + for (unsigned i = 0; (i < n1cndim.size()) && valid ; i++) { + // These cannot fail, these values have been passed as arguments + Argument* n1arg = cast<Argument>(n1cndim[i]); + Argument* n2arg = cast<Argument>(n2cndim[i]); + unsigned n1argPos = n1arg->getArgNo(); + unsigned n2argPos = n2arg->getArgNo(); + // These values are coming from bind intrinsics, thus from the parent node + // The position of the argument is the same as the inPort of the incoming + // edge of their parent, n1 and n2. + DFEdge* n1argEdge = n1->getInDFEdgeAt(n1argPos); + DFEdge* n2argEdge = n2->getInDFEdgeAt(n2argPos); + // Get source position and node of these edges + unsigned n1SrcPos = n1argEdge->getSourcePosition(); + DFNode* n1SrcNode = n1argEdge->getSourceDF(); + unsigned n2SrcPos = n2argEdge->getSourcePosition(); + DFNode* n2SrcNode = n2argEdge->getSourceDF(); + valid = valid && (n1SrcPos == n2SrcPos) && (n1SrcNode == n2SrcNode); + } + + // We must also make sure that any edge that is incoming to the allocation + // node of n2 is not from n1 + if (AllocNodes2.size() == 1) { + DFNode* n2an = AllocNodes2[0]; + unsigned inPort = 0; + for (DFNode::const_indfedge_iterator ei = n2an->indfedge_begin(), + ee = n2an->indfedge_end(); (ei != ee) && valid ; ei++, inPort++) + if (n2an->getExtendedInDFEdgeAt(inPort)->getSourceDF() == ComputeNodes1[0]) // NOTE(review): assumes ComputeNodes1 is non-empty here — verify upstream guard + return false; + } + + return valid; +} + +void MergeTraversal::mergeDFN() { + + Function* Fm; // the merged node function + + if (dyn_cast<DFLeafNode>(n1)) { // One level node merging, + // n1 and n2 are leaf nodes + // Simply create the merged leaf function (with the calls) + Fm = createLeafDFNodeFunction(n1, n2, M, 0, 0, 0); + addHint(Fm, getPreferredTarget(n1->getFuncPointer())); + removeHint(n1->getFuncPointer(), getPreferredTarget(n1->getFuncPointer())); + removeHint(n2->getFuncPointer(), getPreferredTarget(n2->getFuncPointer())); + } else { // Two level node merging, n1 and n2 are internal nodes + // Correct form of internal nodes has
been verified in isValidMerge + // Both n1 and n2 have at most two children: + // a compute node and maybe an allocation node + std::vector<DFNode*> AllocationNodes; + std::vector<DFNode*> ComputeNodes; + + getChildNodeSplit(cast<DFInternalNode>(n1), AllocationNodes, ComputeNodes); + DFLeafNode* N1ComputeNode = cast<DFLeafNode>(ComputeNodes[0]); + DFLeafNode* N1AllocationNode = + (AllocationNodes.size() == 1) ? cast<DFLeafNode>(AllocationNodes[0]): NULL; + AllocationNodes.clear(); + ComputeNodes.clear(); + getChildNodeSplit(cast<DFInternalNode>(n2), AllocationNodes, ComputeNodes); + DFLeafNode* N2ComputeNode = cast<DFLeafNode>(ComputeNodes[0]); + DFLeafNode* N2AllocationNode = + (AllocationNodes.size() == 1) ? cast<DFLeafNode>(AllocationNodes[0]): NULL; + + Function* Falloc = NULL; + if (N1AllocationNode && N2AllocationNode) + Falloc = createLeafDFNodeFunction(N1AllocationNode, + N2AllocationNode, + M, 0, 0, 0); + else if (N1AllocationNode) + Falloc = N1AllocationNode->getFuncPointer(); + else if (N2AllocationNode) + Falloc = N2AllocationNode->getFuncPointer(); + + unsigned numOfN1AllocArgs = 0; + unsigned posOfN1AllocArgs = 0; + unsigned numOfN2AllocArgs = 0; + if (N1AllocationNode) { + StructType* F1RetTy = + cast<StructType>(N1AllocationNode->getFuncPointer()->getReturnType()); + numOfN1AllocArgs = F1RetTy->getNumElements(); + // The position where the allocation node's arguments of n1 alloc go in + // the merged function's parameter list is the same as it was in n1 + // compute function, because all the incoming edges to n1 do not change.
+ // We need this information to shift the allocation parameters to the + // end of the merged function's parameter list + posOfN1AllocArgs = + N1AllocationNode->getOutDFEdgeAt(0)->getDestPosition(); + } + if (N2AllocationNode) { + StructType* F2RetTy = + cast<StructType>(N2AllocationNode->getFuncPointer()->getReturnType()); + numOfN2AllocArgs = F2RetTy->getNumElements(); + } + + errs () << "Working on leaf functions ...\n"; + Function* Fcompute = + createLeafDFNodeFunction(N1ComputeNode, + N2ComputeNode, + M, numOfN1AllocArgs, + posOfN1AllocArgs, numOfN2AllocArgs); + addHint(Fcompute, getPreferredTarget(N1ComputeNode->getFuncPointer())); + removeHint(N1ComputeNode->getFuncPointer(), + getPreferredTarget(N1ComputeNode->getFuncPointer())); + removeHint(N2ComputeNode->getFuncPointer(), + getPreferredTarget(N2ComputeNode->getFuncPointer())); + + errs () << "Leaf functions merged ...\n"; + Fm = createInternalDFNodeFunction(n1, N1AllocationNode, N1ComputeNode, + n2, N2AllocationNode, N2ComputeNode, + Falloc, Fcompute, + M, numOfN1AllocArgs, + posOfN1AllocArgs, numOfN2AllocArgs); + addHint(Fm, getPreferredTarget(n1->getFuncPointer())); + removeHint(n1->getFuncPointer(), getPreferredTarget(n1->getFuncPointer())); + removeHint(n2->getFuncPointer(), getPreferredTarget(n2->getFuncPointer())); + } + errs () << "Leaf functions merged and Internal Function merged ...\n"; + // This is before any code generation passes -> no genfunc + + // FIX PARENT DFNode'S FUNCTION + DFInternalNode* ParentNode = n1->getParent(); + + // Find createNode intrinsics for initial nodes + IntrinsicInst* II1 = n1->getInstruction(); + IntrinsicInst* II2 = n2->getInstruction(); + + // Generate createNode Intrinsic for new node and insert it + IntrinsicInst* CreateNodeII = + createIdenticalCreateNodeWithDifferentFunction(Fm, II1); + + // It needs to be inserted before either of the two.
+ // Find which one is first and add the new intrinsic before it + IntrinsicInst* IIfirst = NULL; + for (inst_iterator ib = inst_begin(ParentNode->getFuncPointer()), + ie = inst_end(ParentNode->getFuncPointer()); + (ib != ie) && !IIfirst ; ++ib) { + Instruction* I = &*ib; // Grab pointer to Instruction + if ((I == II1) || (I == II2)) { + IIfirst = cast<IntrinsicInst>(I); + } + } + CreateNodeII->insertBefore(IIfirst); // NOTE(review): assumes II1 or II2 is found in the parent function; IIfirst is NULL otherwise + +/* The following is an alternative to using the BuildDFG interface. It only * + * creates this single node, not continuing with the graph contained, thus * + * will not build the graph of the node if it is internal node. Instead, I * + * use the call DFG.handleCreateNode */ + +/* +// -------------------------------------------------------------------------- // +// Updating the graph directly + // Create the new node and add it to the graph + DFLeafNode* mergeDFNode = DFLeafNode::Create(CreateNodeII, Fm, + n1->getTargetHint(), + ParentNode, + n1->getNumOfDim(), + n1->getDimLimits()); + //Done Later: fix rank of mergeDFNode and successors, after edges are fixed + // mergeDFNode->setRank((n1->getRank() > n2->getRank()) ?
+ // (n1->getRank()) : (n2->getRank()) ); + + ParentNode->addChildToDFGraph(mergeDFNode); +// -------------------------------------------------------------------------- // +*/ + +// -------------------------------------------------------------------------- // +// Updating the BuildDFG result +// remove the two nodes from mapping, add the new one + errs () << "Updating intrinsics\n"; + DFG.removeElementFromHandleToDFNodeMap(II1); + DFG.removeElementFromHandleToDFNodeMap(II2); +// DFG.addElementToHandleToDFNodeMap(CreateNodeII, mergeDFNode); + DFG.handleCreateNode(ParentNode, CreateNodeII); + DFNode* mergeDFNode = DFG.getHandleToDFNodeMap()[CreateNodeII]; + +// -------------------------------------------------------------------------- // + + // Need to update every use of the createNode in the parent node function + // -- that would be in create edge and bind + std::map<unsigned, unsigned> N1InMap; + std::map<unsigned, unsigned> N1OutMap; + std::map<unsigned, unsigned> N2InMap; + std::map<unsigned, unsigned> N2OutMap; + // These maps map the old location of an argument/output (to its function's + // parameter list/out struct) to the new, after edges removed and functions + // merged + buildInputAndOutputMaps(n1, n2, N1InMap, N1OutMap, N2InMap, N2OutMap); + + // Edges from n1 to n2 need to be deleted. + // They are placed here for deletion at the end. + std::vector<DFEdge*> DFEdgestoRemove; + + // Update uses of createNode - that would be createEdge and bind intrinsics - + // to use the new createNode intrinsic + updateUsesOfCreateNodeInParent(II1, II2, CreateNodeII, N1InMap, N1OutMap, + DFEdgestoRemove, DFG); + updateUsesOfCreateNodeInParent(II2, II1, CreateNodeII, N2InMap, N2OutMap, + DFEdgestoRemove, DFG); + + // Both II1 and II2 have no uses left. It is safe to remove them.
+ errs() << "Erasing: " << *II1 << "\n"; + II1->eraseFromParent(); + errs() << "Erasing: " << *II2 << "\n"; + II2->eraseFromParent(); + +// -------------------------------------------------------------------------- // +// Updating the graph directly + + // Update + // - dataflow edges + // - successor lists + // - incoming and outgoing edge lists + // The edges are updated directly, therefore in the DFGraph DFEdgeList as well + + // For n1 + for (DFNode::indfedge_iterator indfedgeI = n1->indfedge_begin(), + indfedgeE = n1->indfedge_end(); indfedgeI != indfedgeE; indfedgeI++) { + DFEdge* E = *indfedgeI; + // Incoming edges are retargeted to new node in graph + E->setDestDF(mergeDFNode); + // Incoming edges are added to the incoming edge list + // ( no need to add them in the outgoing edge list of source nodes, + // they are already there ) + mergeDFNode->addInDFEdge(E); + // Merge node is added to the successor list of the sources of the edges + E->getSourceDF()->addSuccessor(mergeDFNode); + } + + for (DFNode::outdfedge_iterator outdfedgeI = n1->outdfedge_begin(), + outdfedgeE = n1->outdfedge_end(); outdfedgeI != outdfedgeE; outdfedgeI++) { + DFEdge* E = *outdfedgeI; + // Outgoing edges to n2 are deleted + if (E->getDestDF() == n2) { + ParentNode->getChildGraph()->deleteEdge(E); + continue; + } + + // Outgoing edges are retargeted to start from the new node in graph + E->setSourceDF(mergeDFNode); + // Outgoing edges' source port is updated + E->setSourcePosition(N1OutMap[E->getSourcePosition()]); + // Outgoing edges are added to the outgoing edge list + // ( no need to add them in the incoming edge list of destination nodes, + // they are already there ) + mergeDFNode->addOutDFEdge(E); + // The destination node is added to the successor list of merge node + mergeDFNode->addSuccessor(E->getDestDF()); + } + + // For n2 + for (DFNode::indfedge_iterator indfedgeI = n2->indfedge_begin(), + indfedgeE = n2->indfedge_end(); indfedgeI != indfedgeE; indfedgeI++) { + DFEdge* E
= *indfedgeI; + // Incoming edges from n1 have already been removed from the graph - ignore + if (E->getSourceDF() == n1) { + DEBUG(errs() << "Edges between n1-n2 have already been removed from graph\n"); + } + + // Incoming edges are retargeted to new node in graph + E->setDestDF(mergeDFNode); + // Incoming edges' destination port is updated + E->setDestPosition(N2InMap[E->getDestPosition()]); + // Incoming edges are added to the incoming edge list + // ( no need to add them in the outgoing edge list of source nodes, + // they are already there ) + mergeDFNode->addInDFEdge(E); + // Merge node is added to the successor list of the sources of the edges + E->getSourceDF()->addSuccessor(mergeDFNode); + } + + for (DFNode::outdfedge_iterator outdfedgeI = n2->outdfedge_begin(), + outdfedgeE = n2->outdfedge_end(); outdfedgeI != outdfedgeE; outdfedgeI++) { + DFEdge* E = *outdfedgeI; + // Outgoing edges are retargeted to start from the new node in graph + E->setSourceDF(mergeDFNode); + // Outgoing edges' source port is updated + E->setSourcePosition(N2OutMap[E->getSourcePosition()]); + // Outgoing edges are added to the outgoing edge list + // ( no need to add them in the incoming edge list of destination nodes, + // they are already there ) + mergeDFNode->addOutDFEdge(E); + // The destination node is added to the successor list of merge node + mergeDFNode->addSuccessor(E->getDestDF()); + } + + +// -------------------------------------------------------------------------- // + + +// -------------------------------------------------------------------------- // +// Updating the graph directly + + // Compute rank of mergeDFNode and update rank of successors + mergeDFNode->setRank((n1->getRank() > n2->getRank()) ? 
+ (n1->getRank()) : (n2->getRank()) ); + + // Clear their incoming and outgoing edges vectors, and the successors list + n1->clearGraphElements(); + n2->clearGraphElements(); + + // Clear them from the parent graph + ParentNode->removeChildFromDFGraph(n1); + ParentNode->removeChildFromDFGraph(n2); + + /* + delete n1; + delete n2; + for (unsigned i = 0 ; i < DFEdgestoRemove.size(); i++) + delete DFEdgestoRemove[i]; +*/ + +// -------------------------------------------------------------------------- // + errs() << "Removing similar arguments\n"; + if (dyn_cast<DFLeafNode>(n1)) { + removeUnnecessaryInputEdges(mergeDFNode, n1, 0, 0); + // Erase old functions from module + n1->getFuncPointer()->replaceAllUsesWith(UndefValue::get(n1->getFuncPointer()->getType())); + n1->getFuncPointer()->eraseFromParent(); + n2->getFuncPointer()->replaceAllUsesWith(UndefValue::get(n2->getFuncPointer()->getType())); + n2->getFuncPointer()->eraseFromParent(); + + } else { + std::vector<DFNode*> AllocationNodes; + std::vector<DFNode*> ComputeNodes; + + // Get components of n1 + getChildNodeSplit(cast<DFInternalNode>(n1), AllocationNodes, ComputeNodes); + DFLeafNode* N1ComputeNode = cast<DFLeafNode>(ComputeNodes[0]); + DFLeafNode* N1AllocationNode = + (AllocationNodes.size() == 1) ? cast<DFLeafNode>(AllocationNodes[0]): NULL; + + AllocationNodes.clear(); + ComputeNodes.clear(); + + // Get components of n2 + getChildNodeSplit(cast<DFInternalNode>(n2), AllocationNodes, ComputeNodes); + DFLeafNode* N2AllocationNode = + (AllocationNodes.size() == 1) ?
cast<DFLeafNode>(AllocationNodes[0]): NULL; + DFLeafNode* N2ComputeNode = cast<DFLeafNode>(ComputeNodes[0]); + + AllocationNodes.clear(); + ComputeNodes.clear(); + + // Get components of mergeDFNode + getChildNodeSplit(cast<DFInternalNode>(mergeDFNode), AllocationNodes, + ComputeNodes); + DFLeafNode* ComputeNode = cast<DFLeafNode>(ComputeNodes[0]); + + unsigned numOfN1AllocArgs = 0; + unsigned numOfN2AllocArgs = 0; + if (N1AllocationNode) { + StructType* F1RetTy = + cast<StructType>(N1AllocationNode->getFuncPointer()->getReturnType()); + numOfN1AllocArgs = F1RetTy->getNumElements(); + } + if (N2AllocationNode) { + StructType* F2RetTy = + cast<StructType>(N2AllocationNode->getFuncPointer()->getReturnType()); + numOfN2AllocArgs = F2RetTy->getNumElements(); + } + + errs() << "Removing unnecessary input arguments\n"; + removeUnnecessaryInputEdges(ComputeNode, N1ComputeNode, numOfN1AllocArgs, + numOfN2AllocArgs); + + N1ComputeNode->getFuncPointer()->replaceAllUsesWith(UndefValue::get(N1ComputeNode->getFuncPointer()->getType())); + N1ComputeNode->getFuncPointer()->eraseFromParent(); + N2ComputeNode->getFuncPointer()->replaceAllUsesWith(UndefValue::get(N2ComputeNode->getFuncPointer()->getType())); + N2ComputeNode->getFuncPointer()->eraseFromParent(); + } + + errs() << "Deleting internal nodes\n"; + + deleteInternalNodeFunction(n1, DFG); + deleteInternalNodeFunction(n2, DFG); + + errs() << "Returning\n"; + return; +} + +bool MergeDFN::runOnModule(Module &M) { + errs() << "\nMergeDFN PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* handles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Visitor for Code Generation Graph Traversal + MergeTraversal *MergeLookup = new MergeTraversal(M, DFG); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code
generation for root DFNode + MergeLookup->visit(rootNode); + } + + if (MergeLookup->isValidMergeChoise()) { + errs() << "Valid Merge Choise. Begin merging..\n"; + DEBUG(errs() << "Valid Merge Choise. Begin merging..\n"); + MergeLookup->mergeDFN(); + } else { + errs() << "Not Valid Merge Choise. Abort merging.\n"; + DEBUG(errs() << "Not Valid Merge Choise. Abort merging.\n"); + } + + delete MergeLookup; + + return true; // conservatively report the module as modified +} + +/****************************************************************************** + * Helper functions * + ******************************************************************************/ + +// Creates a new createNode intrinsic, similar to II but with different +// associated function F instead +IntrinsicInst* createIdenticalCreateNodeWithDifferentFunction(Function* F, + IntrinsicInst* II) { + Module* M = F->getParent(); + + // Find which createNode intrinsic we need to create + Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID()); + Constant* Fp = ConstantExpr::getPointerCast(F, + Type::getInt8PtrTy(II->getContext())); + + std::vector<Value*> CreateNodeArgs; // BUGFIX: owns its storage (an ArrayRef would dangle once the case-local arrays go out of scope) + switch (II->getIntrinsicID()) { + case Intrinsic::visc_createNode: + { + CreateNodeArgs.push_back(Fp); + break; + } + case Intrinsic::visc_createNode1D: + { + CreateNodeArgs.push_back(Fp); + CreateNodeArgs.push_back(II->getArgOperand(1)); + break; + } + case Intrinsic::visc_createNode2D: + { + CreateNodeArgs.push_back(Fp); + CreateNodeArgs.push_back(II->getArgOperand(1)); + CreateNodeArgs.push_back(II->getArgOperand(2)); + break; + } + case Intrinsic::visc_createNode3D: + { + CreateNodeArgs.push_back(Fp); + CreateNodeArgs.push_back(II->getArgOperand(1)); + CreateNodeArgs.push_back(II->getArgOperand(2)); + CreateNodeArgs.push_back(II->getArgOperand(3)); + break; + } + default : + assert(false && "Unknown createNode intrinsic"); + break; + } + + CallInst* CI = CallInst::Create(CreateNodeF, + CreateNodeArgs, + F->getName()+".node"); +
IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI); + return CreateNodeII; +} + +// Creates a new createNode intrinsic based on II. +// The new intrinsic has different associated function F instead. II is used to +// determine the location (in the parameter list of function Fargs) where the +// arguments of the new intrinsic can be found. +IntrinsicInst* createNewCreateNodeBasedOn(Function* F, IntrinsicInst* II, + Function* Fargs) { + Module* M = F->getParent(); + + // Find which createNode intrinsic we need to create + Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID()); + Constant* Fp = ConstantExpr::getPointerCast(F, + Type::getInt8PtrTy(II->getContext())); + + std::vector<Argument*> FArgList; // arguments of Fargs, by position + for (auto& arg: Fargs->getArgumentList()) { + FArgList.push_back(&arg); + } + + std::vector<Value*> CreateNodeArgs; // BUGFIX: owns its storage (an ArrayRef would dangle once the case-local arrays go out of scope) + switch (II->getIntrinsicID()) { + case Intrinsic::visc_createNode: + { + CreateNodeArgs.push_back(Fp); + break; + } + case Intrinsic::visc_createNode1D: + { + CreateNodeArgs.push_back(Fp); + CreateNodeArgs.push_back( + FArgList[cast<Argument>(II->getArgOperand(1))->getArgNo()]); + break; + } + case Intrinsic::visc_createNode2D: + { + CreateNodeArgs.push_back(Fp); + CreateNodeArgs.push_back( + FArgList[cast<Argument>(II->getArgOperand(1))->getArgNo()]); + CreateNodeArgs.push_back(FArgList[cast<Argument>(II->getArgOperand(2))->getArgNo()]); + break; + } + case Intrinsic::visc_createNode3D: + { + CreateNodeArgs.push_back(Fp); + CreateNodeArgs.push_back(FArgList[cast<Argument>(II->getArgOperand(1))->getArgNo()]); + CreateNodeArgs.push_back(FArgList[cast<Argument>(II->getArgOperand(2))->getArgNo()]); + CreateNodeArgs.push_back( + FArgList[cast<Argument>(II->getArgOperand(3))->getArgNo()]); + break; + } + default : + assert(false && "Unknown createNode intrinsic"); + break; + } + + CallInst* CI = CallInst::Create(CreateNodeF, + CreateNodeArgs, + F->getName()+".node"); + IntrinsicInst* CreateNodeII =
cast<IntrinsicInst>(CI); + return CreateNodeII; +} + + +// create an identical createEdge with different src (true) or dst (false) node +IntrinsicInst* createIdenticalCreateEdgeWithDifferentNode(IntrinsicInst* II, +IntrinsicInst* IInode, bool srcnode) { + // Argument of the function to be called + Value* SrcNode = (srcnode) ? IInode: II->getArgOperand(0); + Value* DstNode = (srcnode) ? II->getArgOperand(1): IInode; + + Value* EdgeArgs[] = {SrcNode, DstNode, + II->getArgOperand(2), + II->getArgOperand(3), + II->getArgOperand(4), + II->getArgOperand(5) + }; + +// Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); + Function* EdgeF = II->getCalledFunction(); + CallInst* EdgeInst = CallInst::Create(EdgeF, + ArrayRef<Value*>(EdgeArgs, 6), + II->getName()+".repl"); + IntrinsicInst* newII = dyn_cast<IntrinsicInst>(EdgeInst); + assert(newII && "Cannot cast createEdge to IntrinsicInst"); + + return newII; +} + +// create an identical createEdge with different src (true) or dst (false) port +IntrinsicInst* createIdenticalCreateEdgeWithDifferentPort(IntrinsicInst* II, +unsigned port, bool srcport) { + // Argument of the function to be called + ConstantInt* PortConstant = + ConstantInt::get(Type::getInt32Ty(II->getContext()), port); + Value* SrcPort = (srcport) ? PortConstant: II->getArgOperand(3); + Value* DstPort = (srcport) ?
II->getArgOperand(4): PortConstant; + + Value* EdgeArgs[] = {II->getArgOperand(0), + II->getArgOperand(1), + II->getArgOperand(2), + SrcPort, DstPort, + II->getArgOperand(5) + }; + +// Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); + Function* EdgeF = II->getCalledFunction(); + CallInst* EdgeInst = CallInst::Create(EdgeF, + ArrayRef<Value*>(EdgeArgs, 6), + II->getName()+".repl"); + IntrinsicInst* newII = dyn_cast<IntrinsicInst>(EdgeInst); + assert(newII && "Cannot cast createEdge to IntrinsicInst"); + + return newII; +} + +// create an identical bindInput with different destination node +IntrinsicInst* createIdenticalBindInputWithDifferentNode(IntrinsicInst* II, + IntrinsicInst* IInode) { + Value* BindArgs[] = {IInode, + II->getArgOperand(1), + II->getArgOperand(2), + II->getArgOperand(3) + }; +// Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); + Function* BindF = II->getCalledFunction(); + CallInst* BindInst = CallInst::Create(BindF, + ArrayRef<Value*>(BindArgs, 4), + ""); + IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst); + assert(newII && "Cannot cast bind_input to IntrinsicInst"); // BUGFIX: message previously said bind_output + + return newII; +} + +// create an identical bindInput with different src (true) or dst (false) port +IntrinsicInst* createIdenticalBindInputWithDifferentPort(IntrinsicInst* II, + unsigned port, + bool srcport) { + // Argument of the function to be called + ConstantInt* PortConstant = + ConstantInt::get(Type::getInt32Ty(II->getContext()), port); + Value* SrcPort = (srcport) ? PortConstant: II->getArgOperand(1); + Value* DstPort = (srcport) ?
II->getArgOperand(2): PortConstant; + + Value* BindArgs[] = {II->getArgOperand(0), + SrcPort, + DstPort, + II->getArgOperand(3) + }; +// Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); + Function* BindF = II->getCalledFunction(); + CallInst* BindInst = CallInst::Create(BindF, + ArrayRef<Value*>(BindArgs, 4), + ""); + IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst); + assert(newII && "Cannot cast bind_input to IntrinsicInst"); // BUGFIX: message previously said bind_output + + return newII; +} + +// create an identical bindOutput with different source node +IntrinsicInst* createIdenticalBindOutputWithDifferentNode(IntrinsicInst* II, + IntrinsicInst* IInode) { + Value* BindArgs[] = {IInode, + II->getArgOperand(1), + II->getArgOperand(2), + II->getArgOperand(3) + }; +// Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); + Function* BindF = II->getCalledFunction(); + CallInst* BindInst = CallInst::Create(BindF, + ArrayRef<Value*>(BindArgs, 4), + ""); + IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst); + assert(newII && "Cannot cast bind_output to IntrinsicInst"); + + return newII; +} + +// create an identical bindOutput with different src (true) or dst (false) port +IntrinsicInst* createIdenticalBindOutputWithDifferentPort(IntrinsicInst* II, + unsigned port, + bool srcport) { + // Argument of the function to be called + ConstantInt* PortConstant = + ConstantInt::get(Type::getInt32Ty(II->getContext()), port); + Value* SrcPort = (srcport) ? PortConstant: II->getArgOperand(1); + Value* DstPort = (srcport) ?
II->getArgOperand(2): PortConstant; + + Value* BindArgs[] = {II->getArgOperand(0), + SrcPort, + DstPort, + II->getArgOperand(3) + }; +// Function* BindF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); + Function* BindF = II->getCalledFunction(); + CallInst* BindInst = CallInst::Create(BindF, + ArrayRef<Value*>(BindArgs, 4), + ""); + IntrinsicInst* newII = dyn_cast<IntrinsicInst>(BindInst); + assert(newII && "Cannot cast bind_output to IntrinsicInst"); + + return newII; +} + +// Function to find each use of a createNode intrinsic for a node existing +// before merging and properly replace it with a use of the createNode for +// node created after node merging +// II1 is the createNode for the node that got merged, whose uses we want to replace +// II2 is the createNode for the other node that got merged +// (we need this to determine if en edge should be updated or deleted) +// InMap and Outmap maps map the old location of an argument/output to the new +// one, after edges removed and functions merged +// CreateEdge for edges from n1 to n2 need to be deleted and associated +// intrinsics to be removed. They are placed in the two vectors. 
+void updateUsesOfCreateNodeInParent(IntrinsicInst* II1, + IntrinsicInst* II2, + IntrinsicInst* IInew, + std::map<unsigned, unsigned> InMap, + std::map<unsigned, unsigned> OutMap, + std::vector<DFEdge*> &DFEdgestoRemove, + BuildDFG &DFG) { + std::vector<IntrinsicInst*> IItoRemove; + + for (Value::user_iterator i = II1->user_begin(), ie = II1->user_end(); + i != ie; ++i) { + Instruction *VI = dyn_cast<Instruction>(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI); + assert(II && "Use of a node handle outside of a visc intrinsic"); + + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createEdge: + { + if (isOutgoingEdgeIntrinsic(II,II1)) { // check for outgoing edges + if (isIncomingEdgeIntrinsic(II,II2)) { + // edge is between merged nodes + // createEdge is marked for deletion, if not already there + if (std::find(IItoRemove.begin(),IItoRemove.end(),II) == IItoRemove.end()) { + IItoRemove.push_back(II); + // ------------------------------------------------------------ // + // Updating the BuildDFG result + // remove handle for non-existing edge in mapping + DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II]; + DFG.removeElementFromHandleToDFEdgeMap(II); + DFEdgestoRemove.push_back(EdgeInMapping); + // ------------------------------------------------------------ // + } + } else { // Edge is outgoing, but to another node in the graph + // We need to change Src and SrcPort + // create an identical createEdge with different srcport + unsigned srcPos = cast<ConstantInt>(II->getOperand(3))->getZExtValue(); + IntrinsicInst* newII = + createIdenticalCreateEdgeWithDifferentPort(II, + OutMap[srcPos], + true); + // and insert it before the current create edge + newII->insertBefore(II); + // change of operand II1 will happen at the end with replaceAllUsesWith + // mark this createEdge for deletion + IItoRemove.push_back(II); + // -------------------------------------------------------------- // + // Updating the BuildDFG result + // replace handle for edge in 
mapping + DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II]; + DFG.removeElementFromHandleToDFEdgeMap(II); + DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping); + // -------------------------------------------------------------- // + } + } else { // isIncomingEdgeIntrinsic(II,II1) : check for incoming edges + if (isOutgoingEdgeIntrinsic(II,II2)) { + // edge is between merged nodes + // createEdge is marked for deletion, if not already there + if (std::find(IItoRemove.begin(),IItoRemove.end(),II) == IItoRemove.end()) { + IItoRemove.push_back(II); + // ------------------------------------------------------------ // + // Updating the BuildDFG result + // remove handle for non-existing edge in mapping + DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II]; + DFG.removeElementFromHandleToDFEdgeMap(II); + DFEdgestoRemove.push_back(EdgeInMapping); + // ------------------------------------------------------------ // + } + } else { // Edge is incoming, but from another node + // We need to change Dst node and DstPort + // create an identical createEdge with different dstport + unsigned dstPos = cast<ConstantInt>(II->getOperand(4))->getZExtValue(); + IntrinsicInst* newII = + createIdenticalCreateEdgeWithDifferentPort(II, + InMap[dstPos], + false); + // and insert it before the current create edge + newII->insertBefore(II); + // change of operand II1 will happen at the end with replaceAllUsesWith + // mark this createEdge for deletion + IItoRemove.push_back(II); + // -------------------------------------------------------------- // + // Updating the BuildDFG result + // replace handle for edge in mapping + DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II]; + DFG.removeElementFromHandleToDFEdgeMap(II); + DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping); + // -------------------------------------------------------------- // + } + } + } + break; + case Intrinsic::visc_bind_input: + { + // incoming bind from parent node + // We need to change Dst node and 
DstPort + // create an identical bindInput with different dstport + unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + IntrinsicInst* newII = + createIdenticalBindInputWithDifferentPort(II, InMap[dstPos], false); + // and insert it before the current bind + newII->insertBefore(II); + // change of operand II1 will happen at the end with replaceAllUsesWith + // mark this bind for deletion + IItoRemove.push_back(II); + // ------------------------------------------------------------------ // + // Updating the BuildDFG result + // replace handle for edge in mapping + DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II]; + DFG.removeElementFromHandleToDFEdgeMap(II); + DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping); + // ------------------------------------------------------------------ // + } + break; + case Intrinsic::visc_bind_output: + { + // outgoing bind to parent node + // We need to change Src node and SrcPort + // create an identical bindOutput with different srcport + unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + IntrinsicInst* newII = + createIdenticalBindOutputWithDifferentPort(II, OutMap[srcPos], true); + // and insert it before the current bind + newII->insertBefore(II); + // change of operand II1 will happen at the end with replaceAllUsesWith + // mark this bind for deletion + IItoRemove.push_back(II); + // ------------------------------------------------------------------ // + // Updating the BuildDFG result + // replace handle for edge in mapping + DFEdge* EdgeInMapping = DFG.getHandleToDFEdgeMap()[II]; + DFG.removeElementFromHandleToDFEdgeMap(II); + DFG.addElementToHandleToDFEdgeMap(newII, EdgeInMapping); + // ------------------------------------------------------------------ // + } + break; + default : + assert(false && "Unknown use of node handle"); + break; + } + } + + // Delete gathered instructions + for (std::vector<IntrinsicInst *>::iterator ib = IItoRemove.begin(), + ie = 
IItoRemove.end(); ib != ie; ++ib) { + DEBUG(errs() << "Erasing: " << **ib << "\n"); + (*ib)->eraseFromParent(); + } + + // Change all remaining edge-bind intrinsics containing n1 to the new node + II1->replaceAllUsesWith(IInew); + +} + +// Query the king of edge described by a createEdge intrinsic +// with respect to node handle IIn +bool isIncomingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) { + Value* Src = IIe->getArgOperand(1); + IntrinsicInst* ArgII = cast<IntrinsicInst>(Src); +// IntrinsicInst* ArgII = cast<IntrinsicInst>(Src->stripPointerCasts()); + assert(ArgII && "First argument of createEdge is not an intrinsic"); + return (ArgII == IIn); +} + +bool isOutgoingEdgeIntrinsic(IntrinsicInst* IIe, IntrinsicInst* IIn) { + Value* Src = IIe->getArgOperand(0); + IntrinsicInst* ArgII = cast<IntrinsicInst>(Src); +// IntrinsicInst* ArgII = cast<IntrinsicInst>(Src->stripPointerCasts()); + assert(ArgII && "First argument of createEdge is not an intrinsic"); + return (ArgII == IIn); +} + +/* + * Return true if n2 is a successor of n1 + */ +bool hasSuccessor(DFNode* N1, DFNode* N2) { + for (DFNode::const_successor_iterator i = N1->successors_begin(), + e = N1->successors_end(); + i != e; i++) { + DFNode* N = *i; + if ((N == N2) || (hasSuccessor(N,N1))) return true; + } + return false; +} + +/* + * Return true if n2 is an immediate successor of n1 + */ +bool hasImmediateSuccesssor(DFNode* N1, DFNode* N2) { + for (DFNode::const_successor_iterator i = N1->successors_begin(), + e = N1->successors_end(); + i != e; i++) { + DFNode* N = *i; + if (N == N2) return true; + } + return false; +} + +/* + * Return true if all edges between n1 and n2 are one-to-one + */ +bool checkEdgesType(DFNode* N1, DFNode* N2) { + for (DFNode::const_outdfedge_iterator i = N1->outdfedge_begin(), + e = N1->outdfedge_end(); + i != e; i++) { + DFEdge* E = *i; + if ((E->getDestDF() == N2) && (E->getEdgeType())) return false; + } + return true; +} + +// Construct argument list +// Assuming that 
N2 cannot be an ansestor of N1 +static void createArgTypes(DFNode* N1, DFNode* N2, std::vector<Type*> &ArgTypes) { + Function* F1 = N1->getFuncPointer(); + Function* F2 = N2->getFuncPointer(); + + for(auto& arg: F1->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + ArgTypes.push_back(arg.getType()); + } + + unsigned inport = 0; + for(auto& arg: F2->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + if (N2->getExtendedInDFEdgeAt(inport)->getSourceDF() != N1) + ArgTypes.push_back(arg.getType()); + inport++; + } + +} + +// Returns the allocation nodes and the compute nodes of a parent dataflow node +void getChildNodeSplit(DFInternalNode* N, + std::vector<DFNode*> &AllocationNodes, + std::vector<DFNode*> &ComputeNodes) { + DFGraph::const_children_iterator ci = N->getChildGraph()->begin(); + DFGraph::const_children_iterator ce = N->getChildGraph()->end(); + + for ( ; ci != ce; ci++ ) { + DFNode* child = *ci; + if (child->isAllocationNode()) + AllocationNodes.push_back(child); + else if (!child->isDummyNode()) + ComputeNodes.push_back(child); + } + +} + +// Creates a map between the old locations of parameters and outputs in the +// functions before merging, and the new one after merge. Those that correspond +// to edges that no longer exist (between the merged nodes) are not in the maps. 
+void buildInputAndOutputMaps(DFNode* N1, DFNode* N2, + std::map<unsigned, unsigned> &N1InMap, + std::map<unsigned, unsigned> &N1OutMap, + std::map<unsigned, unsigned> &N2InMap, + std::map<unsigned, unsigned> &N2OutMap) { + unsigned n1NumInputs = 0; + for (unsigned i = 0; i < N1->getFuncPointer()->getArgumentList().size(); + i++, n1NumInputs++) { + N1InMap[i] = i; + } + for (unsigned i = 0, inpos = 0; + i < N2->getFuncPointer()->getArgumentList().size(); i++) { + if (N2->getExtendedInDFEdgeAt(i)->getSourceDF() != N1) { + N2InMap[i] = inpos+n1NumInputs; + inpos++; + } + } + + unsigned n1NumOutputs = 0; + StructType* F1RetTy = cast<StructType>(N1->getFuncPointer()->getReturnType()); + for (unsigned i = 0; i < F1RetTy->getNumElements(); i++) { + if (N1->getExtendedOutDFEdgeAt(i)->getDestDF() != N2) { + N1OutMap[i] = n1NumOutputs; + n1NumOutputs++; + } + } + + StructType* F2RetTy = cast<StructType>(N2->getFuncPointer()->getReturnType()); + for (unsigned i = 0; i < F2RetTy->getNumElements(); i++) { + N2OutMap[i] = i+n1NumOutputs; + } + + return; +} + +// Creates a map between the old edge ports in the +// nodes before merging, and the new one after merge. Those that correspond +// to edges that no longer exist (between the merged nodes) are not in the maps. 
+void buildInAndOutEdgeMaps(DFNode* N1, DFNode* N2, + std::map<unsigned, unsigned> &N1InMap, + std::map<unsigned, unsigned> &N1OutMap, + std::map<unsigned, unsigned> &N2InMap, + std::map<unsigned, unsigned> &N2OutMap) { + + unsigned n1NumInEdges = N1->getFuncPointer()->getArgumentList().size(); + for (unsigned i = 0; i < n1NumInEdges; i++) { + N1InMap[i] = i; + } + + unsigned n1NumOutEdges = 0; + StructType* F1RetTy = cast<StructType>(N1->getFuncPointer()->getReturnType()); + for (unsigned i = 0; i < F1RetTy->getNumElements(); i++) { + if (N1->getExtendedOutDFEdgeAt(i)->getDestDF() != N2) { + N1OutMap[i] = n1NumOutEdges; + n1NumOutEdges++; + } + } + + unsigned n2NumInEdges = N2->getFuncPointer()->getArgumentList().size(); + for (unsigned i = 0, inpos = 0; i < n2NumInEdges; i++) { + if (N2->getExtendedInDFEdgeAt(i)->getSourceDF() != N1) { + N2InMap[i] = inpos+n1NumInEdges; + inpos++; + } + } + + StructType* F2RetTy = cast<StructType>(N2->getFuncPointer()->getReturnType()); + for (unsigned i = 0; i < F2RetTy->getNumElements(); i++) { + N2OutMap[i] = i+n1NumOutEdges; + } + + return; +} + +// Construct return type +// Assuming that N2 cannot be an ansestor of N1 +static StructType* createReturnType(DFNode* N1, DFNode* N2) { + Function* F1 = N1->getFuncPointer(); + Function* F2 = N2->getFuncPointer(); + + StructType* F1RetTy = dyn_cast<StructType>(F1->getReturnType()); + assert(F1RetTy && "Return Type must always be a struct"); + StructType* F2RetTy = dyn_cast<StructType>(F2->getReturnType()); + assert(F2RetTy && "Return Type must always be a struct"); + + std::vector<Type*> ReturnTypeElements; + unsigned outPos1 = 0, outPos2 = 0, outPosM = 0; + for (StructType::element_iterator i = F1RetTy->element_begin(), + e = F1RetTy->element_end(); + (i != e) && (outPos1 < F1RetTy->getNumElements()); i++, outPos1++) { + if (N1->getExtendedOutDFEdgeAt(outPos1)->getDestDF() == N2) + continue; + ReturnTypeElements.push_back(*i); + outPosM++; + } + + for (StructType::element_iterator 
i = F2RetTy->element_begin(), + e = F2RetTy->element_end(); + i != e && outPos2 < F2RetTy->getNumElements(); i++, outPos2++) { + ReturnTypeElements.push_back(*i); + outPosM++; + } + + errs() << "Return elements = " << ReturnTypeElements.size() << "\n"; + StructType* FRetTy = StructType::create(F1->getContext(), + ArrayRef<Type*>(ReturnTypeElements), + (F1->getName()+"."+F2->getName()+".ty").str(), true); + + errs() << "Struct type created\n"; + return FRetTy; +} + +// Copy attributes +// Assuming that N2 cannot be an ansestor of N1 +static void copyAttrList(DFNode* N1, DFNode* N2, Function* F) { + Function* F1 = N1->getFuncPointer(); + Function* F2 = N2->getFuncPointer(); + + Function::arg_iterator f1_ai = F1->arg_begin(), f1_ae = F1->arg_end(); + Function::arg_iterator f2_ai = F2->arg_begin(), f2_ae = F2->arg_end(); + Function::arg_iterator f_ai = F->arg_begin(), f_ae = F->arg_end(); + + unsigned inPos1 = 0, inPos2 = 0, inPosM = 0; + for(; f1_ai != f1_ae && f_ai != f_ae; ++f1_ai, ++f_ai, inPos1++, inPosM++) { + AttributeSet AS = F1->getAttributes(); + DEBUG(errs() << "Copying attributes from " << F1->getName() << " at " << f1_ai->getArgNo() << "\n"); + AttrBuilder AB(AS, f1_ai->getArgNo()+1); + AttributeSet argAS = AttributeSet::get(F1->getContext(), f_ai->getArgNo()+1, AB); + F->addAttributes(f_ai->getArgNo()+1, argAS); + } + for(; f2_ai != f2_ae && f_ai != f_ae; ++f2_ai, inPos2++) { + if (N2->getExtendedInDFEdgeAt(inPos2)->getSourceDF() == N1) + continue; + + AttributeSet AS = F2->getAttributes(); + DEBUG(errs() << "Copying attributes from " << F2->getName() << " at " << f2_ai->getArgNo() << "\n"); + AttrBuilder AB(AS, f2_ai->getArgNo()+1); + AttributeSet argAS = AttributeSet::get(F2->getContext(), f_ai->getArgNo()+1, AB); + F->addAttributes(f_ai->getArgNo()+1, argAS); + ++f_ai; + inPosM++; + } +} + +// Copy argument names +static void copyArgumentNames(DFNode* N1, DFNode* N2, Function* F) { + Function* F1 = N1->getFuncPointer(); + Function* F2 = 
N2->getFuncPointer(); + + Function::arg_iterator dest_it = F->arg_begin(); + + for(auto& arg: F1->getArgumentList()) { + dest_it->setName("n1_" + arg.getName()); + dest_it++; + } + + unsigned inport = 0; + for(auto& arg: F2->getArgumentList()) { + if (N2->getExtendedInDFEdgeAt(inport)->getSourceDF() != N1) { + dest_it->setName("n2_" + arg.getName()); + dest_it++; + } + inport++; + } +} + +// Creates shift map, which maps old position to new, after shifting num +// arguments starting from fromPos by shift positions to the right. +void createShiftMap(Function* F, unsigned fromPos, unsigned num, + unsigned shift, std::vector<unsigned> &ShiftMap) { + + for (unsigned i = 0; i < F->getArgumentList().size(); i++) + ShiftMap.push_back(i); + + for (unsigned i = fromPos; i < fromPos + num; i++) + ShiftMap[i] += shift; + + for (unsigned i = fromPos + num; i < fromPos + num + shift; i++) + ShiftMap[i] -= num; + +} + +// Shifts num arguments starting from fromPos by shift positions to the right, +// replacing with the arguments at those positions. +// Updates shift map, which maps old position to new. 
+void shiftArgs(Function* F, unsigned fromPos, unsigned num, + unsigned shift, std::vector<unsigned> &ShiftMap) { + Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + Function::arg_iterator from = ai; + + unsigned cnt; + for (cnt = 0; from != ae && cnt < fromPos; from++, cnt++) { + } + assert((cnt == fromPos) && "Invalid start position for argument shifting"); + + Function::arg_iterator af = from; + std::vector<Type*> ArgTypes; + std::vector<StringRef> ArgNames; + unsigned argNo = 0; + + //TODO: check if this copies attributes as well + ValueToValueMapTy VMap; + Function* F_copy = CloneFunction(F, VMap); + F_copy->removeFromParent(); + + // Arguments up until before from + for ( ; ai != from && ai != ae; ai++, argNo++) { + ArgTypes.push_back(ai->getType()); + ArgNames.push_back(ai->getName()); + } + + // Arguments to be shifted (num arguments) are skipped for now + for (unsigned i = 0; (i < num) && (ai != ae); i++, ai++, argNo++) { + ShiftMap[argNo] += shift; + } + + // Later arguments (#shift arguments) are pushed until we fill shift positions + for (unsigned i = 0; (ai != ae) && (i < shift); i++, ai++, argNo++) { + ArgTypes.push_back(ai->getType()); + ArgNames.push_back(ai->getName()); + ShiftMap[argNo] -= num; + } + + // Arguments that were to be shifted (num arguments) are now pushed + for (unsigned i = 0; (i < num) && (af != ae); i++, af++, argNo++) { + ArgTypes.push_back(af->getType()); + ArgNames.push_back(af->getName()); + } + + // Remaining arguments are pushed + for (; ai != ae; ai++) { + ArgTypes.push_back(ai->getType()); + ArgNames.push_back(ai->getName()); + } + + // Change function type + FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); + PointerType* PTy = FTy->getPointerTo(); + F->mutateType(PTy); + + // Shift argument names + ai = F->arg_begin(); + for (unsigned i = 0; ai != ae; ai++, i++) { + (*ai).setName(ArgNames[i]); + } + + // Shift attributes by deleting them from F and copying them from F_copy 
+ + //Initialize required iterators for shift elements: from -> from_copy+shift + af = from; + Function::arg_iterator af_copy; + for (unsigned i = 0; i < af->getArgNo() + shift; i++, af_copy++) { + } + for (unsigned i = 0; i < num; i++, af++, af_copy++) { + AttributeSet ASf = F->getAttributes(); + AttributeSet ASfc = F_copy->getAttributes(); + + AttrBuilder ABfc(ASfc, af_copy->getArgNo()+1); + AttributeSet argASfc = AttributeSet::get(F_copy->getContext(), af->getArgNo()+1, ABfc); + F->removeAttributes(af->getArgNo()+1,ASf.getParamAttributes(af->getArgNo()+1)); + F->addAttributes(af->getArgNo()+1, argASfc); + } + //Initialize required iterators for num elements: to -> to_copy-num + af_copy = from; + for (unsigned i = 0; i < shift; i++, af++, af_copy++) { + AttributeSet ASf = F->getAttributes(); + AttributeSet ASfc = F_copy->getAttributes(); + + AttrBuilder ABfc(ASfc, af_copy->getArgNo()+1); + AttributeSet argASfc = AttributeSet::get(F_copy->getContext(), af->getArgNo()+1, ABfc); + F->removeAttributes(af->getArgNo()+1,ASf.getParamAttributes(af->getArgNo()+1)); + F->addAttributes(af->getArgNo()+1, argASfc); + } +} + +/* + * Create type of merged function + * - input arguments type + * - struct return type + * Get Attributes from original functions + * Get parameter names from original functions + * Insert an empty function of this type in the module + */ +static Function* createEmptyDFNodeFunction(DFNode* N1, DFNode* N2, Module &M) { + Function* F1 = N1->getFuncPointer(); + Function* F2 = N2->getFuncPointer(); + + errs () << "Constructing argument list\n"; + // Construct argument list + std::vector<Type*> ArgTypes; + createArgTypes(N1, N2, ArgTypes); + + errs () << "Constructing return type\n"; + // Construct return type + StructType* FRetTy = createReturnType(N1, N2); + + FunctionType* FTy = FunctionType::get(FRetTy, ArgTypes, false); + // Create a function with the new type + Function* F = Function::Create(FTy, F1->getLinkage(), + F1->getName()+"_"+F2->getName(), 
&M); + + errs () << "Copying argument names\n"; + // Copy argument names from original functions + copyArgumentNames(N1, N2, F); + // Copy argument attributes from original functions + copyAttrList(N1, N2, F); + + return F; +} + +/* + * Create function of leaf node after merging + * - create type + * - Create the call instructions + * - Create intermediate assignments + * - Create assignments to output struct + */ +static Function* createLeafDFNodeFunction(DFNode* N1, DFNode* N2, Module &M, + unsigned numOfN1AllocArgs, unsigned posOfN1AllocArgs, + unsigned numOfN2AllocArgs) { + + errs () << "Creating function signature\n"; + /* + * Create empty node function of the correct type + */ + Function* F = createEmptyDFNodeFunction(N1, N2, M); + + // Get return type, needed for building the assignmens to the return struct + StructType* FRetTy = cast<StructType>(F->getReturnType()); + + Function* F1 = N1->getFuncPointer(); + Function* F2 = N2->getFuncPointer(); + + errs () << "Creating function body\n"; + // This maps i: position in F argument list, to new position in F argument + // list (after shifting arguments maybe). Initially, no shift. + std::vector<unsigned> FArgsShiftMap(F->getArgumentList().size()); + for (unsigned i = 0; i < FArgsShiftMap.size(); i++) + FArgsShiftMap[i] = i; + + if (numOfN1AllocArgs) { + // Number of remaining f2 parameters is initial parameter number of f2 + // minus the number of edges between n1 and n2. We can also find this by + // getting the number of parameters of the new function F and subtract the + // number of parameters of F1, since this did not change. 
+ unsigned shiftOfN1AllocArgs = F->getArgumentList().size() - + F1->getArgumentList().size() - + numOfN2AllocArgs; + shiftArgs(F, posOfN1AllocArgs, numOfN1AllocArgs, shiftOfN1AllocArgs, + FArgsShiftMap); + } + + + // Add a basic block to the new, empty function + BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F); + ReturnInst* RI = ReturnInst::Create(M.getContext(), + UndefValue::get(FRetTy), BB); + + + errs () << "Creating function call\n"; + // Get Argument list of new function into a vector (for easier indexing) + std::vector<Value*> FArgs; + for (auto& arg: F->getArgumentList()) { + FArgs.push_back(&arg); + } + + // Create call instruction for first node + std::vector<Value*> Args; + for (unsigned i = 0; i < F1->getArgumentList().size(); i++) { + Args.push_back(FArgs[FArgsShiftMap[i]]); + } + CallInst* CI1 = CallInst::Create(F1, + ArrayRef<Value*>(Args), + "merged."+F1->getName(), + RI); + Args.clear(); + + errs () << "Creating function call for second node\n"; + // Create call instruction for second node + for(unsigned fargNo = 0, i = 0; + i < F2->getArgumentList().size(); i++) { + Value* Arg; + if (N2->getExtendedInDFEdgeAt(i)->getSourceDF() == N1) { + ExtractValueInst *EI = + ExtractValueInst::Create(CI1, + N2->getExtendedInDFEdgeAt(i)->getSourcePosition(), + "", + RI); + Arg = EI; + } else { + Arg = FArgs[FArgsShiftMap[F1->getArgumentList().size() + fargNo++]]; + } + Args.push_back(Arg); + } + + CallInst* CI2 = CallInst::Create(F2, + ArrayRef<Value*>(Args), + "merged."+F2->getName(), + RI); + + errs () << "Creating extract element instructions\n"; + // Create extract element instructions for elements of output struct + std::vector<ExtractValueInst *> ExtractValueInstVec; + + // First, from node n1: exclude those that go to n2 + StructType *F1RetTy = dyn_cast<StructType>(F1->getReturnType()); + for (unsigned i = 0; i < F1RetTy->getNumElements(); i++) { + if (N1->getExtendedOutDFEdgeAt(i)->getDestDF() != N2) { + ExtractValueInst *EI = 
ExtractValueInst::Create(CI1, i, "", RI); + ExtractValueInstVec.push_back(EI); + } + } + // Then, from node n2 + StructType *F2RetTy = dyn_cast<StructType>(F2->getReturnType()); + for (unsigned i = 0; i < F2RetTy->getNumElements(); i++) { + ExtractValueInst *EI = ExtractValueInst::Create(CI2, i, "", RI); + ExtractValueInstVec.push_back(EI); + } + + errs () << "Creating output struct\n"; + // Create output struct of type FRetTy + assert(FRetTy->getNumElements() == ExtractValueInstVec.size() && + "Size of output struct does not match expected number of EE instructions"); + Value* retVal = UndefValue::get(F->getReturnType()); + + for (unsigned i = 0; i < ExtractValueInstVec.size(); i++) { + InsertValueInst *IVI = + InsertValueInst::Create(retVal, ExtractValueInstVec[i], i, "", RI); + retVal = IVI; + } + ReturnInst* newRI = ReturnInst::Create(M.getContext(), retVal); + ReplaceInstWithInst(RI, newRI); + + // Inline the two calls + InlineFunctionInfo IFI1, IFI2; + InlineFunction(CI1, IFI1, nullptr, false); + InlineFunction(CI2, IFI2, nullptr, false); + + return F; +} + +static Function* createInternalDFNodeFunction(DFNode* N1, DFNode* N1an, + DFNode* N1cn, DFNode* N2, DFNode* N2an, DFNode* N2cn, Function* Fa, + Function* Fc, Module &M, unsigned numOfN1AllocArgs, unsigned posOfN1AllocArgs, + unsigned numOfN2AllocArgs) { + + /* + * Create empty node function of the correct type + */ + Function* F = createEmptyDFNodeFunction(N1, N2, M); + + // Get return type, needed for building the assignmens to the return struct + StructType* FRetTy = cast<StructType>(F->getReturnType()); + +// Function* F1 = N1->getFuncPointer(); +// Function* F2 = N2->getFuncPointer(); + + // Add a basic block to the new, empty function + BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F); + ReturnInst* RI = ReturnInst::Create(M.getContext(), + UndefValue::get(FRetTy), BB); + + // Get Argument list of new function into a vector (for easier indexing) + std::vector<Value*> FArgs; + for 
(auto& arg: F->getArgumentList()) { + FArgs.push_back(&arg); + } + + // Get pointers to functions inthe original graph +// Function* F1a = (N1an) ? N1an->getFuncPointer() : NULL; +// Function* F2a = (N2an) ? N2an->getFuncPointer() : NULL; +// Function* F1c = N1cn->getFuncPointer(); +// Function* F2c = N2cn->getFuncPointer(); + + // Create the required createNode intrinsics + IntrinsicInst* AllocII = NULL; + if (N1an) + AllocII = createIdenticalCreateNodeWithDifferentFunction(Fa, + N1an->getInstruction()); + else if (N2an) + AllocII = createIdenticalCreateNodeWithDifferentFunction(Fa, + N2an->getInstruction()); + if (AllocII) + AllocII->insertBefore(RI); + + // The position in F (new node function) of the node dimensions parameters is + // the same as it was in n1 internal node function, because n1 is the first + // one to be added to the resulting merged node. + IntrinsicInst* ComputeII = + createNewCreateNodeBasedOn(Fc, N1cn->getInstruction(), F); + ComputeII->insertBefore(RI); + + // Vector to be populated with instructions to be added to internal node + std::vector<IntrinsicInst*> IntrinsicInstructionsToAdd; + std::vector<IntrinsicInst*> IntermediateInstructions; + + createNewInternalNodeIntrinsics(N1, N2, N1an, N1cn, N2an, N2cn, + AllocII, ComputeII, + Fa /* FIXME: Unused */, Fc, + IntrinsicInstructionsToAdd, + IntermediateInstructions); + + // Insert generated intrinsics at new internal function + for (auto& Inst: IntrinsicInstructionsToAdd) { + Inst->insertBefore(RI); + } + + // Insert generated intrinsics at new internal function and erase + for (auto& Inst: IntermediateInstructions) { + Inst->insertBefore(RI); + Inst->eraseFromParent(); + } + + return F; +} + +void createNewInternalNodeIntrinsics(DFNode* N1, + DFNode* N2, + DFNode* N1a, + DFNode* N1c, + DFNode* N2a, + DFNode* N2c, + IntrinsicInst* IInewa, + IntrinsicInst* IInewc, + Function* Fa, //FIXME: Unused + Function* Fc, + std::vector<IntrinsicInst*>& IntrinsicInstructionsToAdd, + 
std::vector<IntrinsicInst*>& IntermediateInstructions) { + IntrinsicInst* II1a = (N1a) ? N1a->getInstruction() : NULL; + IntrinsicInst* II1c = N1c->getInstruction(); + IntrinsicInst* II2a = (N2a) ? N2a->getInstruction() : NULL; + IntrinsicInst* II2c = N2c->getInstruction(); + + Function* F1a = (N1a) ? N1a->getFuncPointer() : NULL; + Function* F1c = N1c->getFuncPointer(); + Function* F2a = (N2a) ? N1a->getFuncPointer() : NULL; + + unsigned n1aNumOfInputs = 0; + unsigned n1aNumOfOutputs = 0; + unsigned n1aPosOfOutputs = 0; + if (N1a) { + n1aNumOfInputs = F1a->getArgumentList().size(); + n1aNumOfOutputs = cast<StructType>(F1a->getReturnType())->getNumElements(); + n1aPosOfOutputs = N1a->getOutDFEdgeAt(0)->getDestPosition(); + } + unsigned n2aNumOfOutputs = 0; + if (N2a) { + n2aNumOfOutputs = cast<StructType>(F2a->getReturnType())->getNumElements(); + } + + unsigned shiftOfN1AllocOutputs = Fc->getArgumentList().size() - + F1c->getArgumentList().size() - + n2aNumOfOutputs; + + std::map<unsigned, unsigned> N1cInMap; + std::map<unsigned, unsigned> N1cOutMap; + std::map<unsigned, unsigned> N2cInMap; + std::map<unsigned, unsigned> N2cOutMap; + // These maps map the old location of an argument/output (to its function's + // parameter list/out struct) to the new, after edges removed and functions + // merged + + // This accounts for argument shifting, due to allocation node n1 + std::vector<unsigned> FcShiftMap; + + buildInputAndOutputMaps(N1c, N2c, N1cInMap, N1cOutMap, N2cInMap, N2cOutMap); + createShiftMap(Fc, n1aPosOfOutputs, n1aNumOfOutputs, shiftOfN1AllocOutputs, + FcShiftMap); + + + std::map<unsigned, unsigned> N1InDFEdgeMap; + std::map<unsigned, unsigned> N1OutDFEdgeMap; + std::map<unsigned, unsigned> N2InDFEdgeMap; + std::map<unsigned, unsigned> N2OutDFEdgeMap; + buildInAndOutEdgeMaps(N1, N2, N1InDFEdgeMap, N1OutDFEdgeMap, N2InDFEdgeMap, + N2OutDFEdgeMap); + + + // Start with the intrinsics for allocation nodes n1a and n2a + + // TODO: This is only for testing, not 
needed for functionality + std::map<IntrinsicInst*, IntrinsicInst*> CreateEdgeAndBindMap; + + if (N1a) { // If there is an allocation node for the first node + for (Value::user_iterator i = II1a->user_begin(), ie = II1a->user_end(); + i != ie; ++i) { + Value *v = *i; + Instruction *VI = dyn_cast<Instruction>(v); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI); + assert(II && "Use of a node handle outside of a visc intrinsic"); + + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createEdge: + // This is between allocation and compute node of n1. + { + // Change source to new allocation node + IntrinsicInst* IItemp1 = + createIdenticalCreateEdgeWithDifferentNode(II, IInewa, true); + // Do not change source port + // Change destination node to new compute node + IntrinsicInst* IItemp2 = + createIdenticalCreateEdgeWithDifferentNode(IItemp1, IInewc, false); + // Change destination port to new port, after inmap and shift + unsigned dstPos = cast<ConstantInt>(II->getOperand(4))->getZExtValue(); + IntrinsicInst* EI = + createIdenticalCreateEdgeWithDifferentPort(IItemp2, + FcShiftMap[N1cInMap[dstPos]], false); + IntrinsicInstructionsToAdd.push_back(EI); + IntermediateInstructions.push_back(IItemp1); + IntermediateInstructions.push_back(IItemp2); + CreateEdgeAndBindMap[II] = EI; + } + break; + case Intrinsic::visc_bind_input: + // These are the inputs from the parent node. 
+ { + // The destination ports will not change, only the destination will + // be changed to point to the new allocation node + IntrinsicInst* BI = + createIdenticalBindInputWithDifferentNode(II, IInewa); + IntrinsicInstructionsToAdd.push_back(BI); + CreateEdgeAndBindMap[II] = BI; + } + break; + case Intrinsic::visc_bind_output: + assert(false && "Allocation node handle found in visc_bind_output"); + break; + default: + assert(false && "Unknown use of node handle"); + break; + } + } + } + + if (N2a) { // If there is an allocation node fot the second node + for (Value::user_iterator i = II2a->user_begin(), ie = II2a->user_end(); + i != ie; ++i) { + Value *v = *i; + Instruction *VI = dyn_cast<Instruction>(v); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI); + assert(II && "Use of a node handle outside of a visc intrinsic"); + + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createEdge: + // This is between allocation and compute node of n2. + { + // Change source to new allocation node + IntrinsicInst* IItemp1 = + createIdenticalCreateEdgeWithDifferentNode(II, IInewa, true); + // Change source port to after all outputs of n1a + unsigned srcPos = cast<ConstantInt>(II->getOperand(3))->getZExtValue(); + IntrinsicInst* IItemp2 = + createIdenticalCreateEdgeWithDifferentPort(IItemp1, + srcPos + n1aNumOfOutputs, true); + // Change destination node to new compute node + IntrinsicInst* IItemp3 = + createIdenticalCreateEdgeWithDifferentNode(IItemp2, IInewc, false); + // Change destination port to new port, after inmap and shift + // Use of FcShiftMap is not required here - allocation outputs of + // n2a will not get shifted, but it is OK to use (1-1 at this point) + unsigned dstPos = cast<ConstantInt>(II->getOperand(4))->getZExtValue(); + IntrinsicInst* EI = + createIdenticalCreateEdgeWithDifferentPort(IItemp3, + FcShiftMap[N2cInMap[dstPos]], false); + IntrinsicInstructionsToAdd.push_back(EI); + IntermediateInstructions.push_back(IItemp1); + 
IntermediateInstructions.push_back(IItemp2); + IntermediateInstructions.push_back(IItemp3); + CreateEdgeAndBindMap[II] = EI; + } + break; + case Intrinsic::visc_bind_input: + // These are the inputs from the parent node. + { + // Change destination node to new allocation node + IntrinsicInst* IItemp1 = + createIdenticalBindInputWithDifferentNode(II, IInewa); + // Change source port to new port, after edgeinmap + unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + IntrinsicInst* IItemp2 = + createIdenticalBindInputWithDifferentPort(IItemp1, + N2InDFEdgeMap[srcPos], true); + // Change destination port to new port, after inmap and shift + unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + IntrinsicInst* BI = + createIdenticalBindInputWithDifferentPort(IItemp2, + dstPos + n1aNumOfInputs, false); + IntrinsicInstructionsToAdd.push_back(BI); + IntermediateInstructions.push_back(IItemp1); + IntermediateInstructions.push_back(IItemp2); + CreateEdgeAndBindMap[II] = BI; + } + break; + case Intrinsic::visc_bind_output: + assert(false && "Allocation node handle found in visc_bind_output"); + break; + default: + assert(false && "Unknown use of node handle"); + break; + } + } + } + + // Continue with the intrinsics for compute nodes n1c and n2c + + for (Value::user_iterator i = II1c->user_begin(), ie = II1c->user_end(); + i != ie; ++i) { // Handle inputs and outputs of n1 compute node + Value *v = *i; + Instruction *VI = dyn_cast<Instruction>(v); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI); + assert(II && "Use of a node handle outside of a visc intrinsic"); + + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createEdge: + // This is between allocation and compute node of n1. 
+ { + // These edges should have been handled when dealing with the + // allocation nodes + assert(CreateEdgeAndBindMap.find(II) != CreateEdgeAndBindMap.end() && + "Edge between A-C node should have been handled while processing A"); + } + break; + case Intrinsic::visc_bind_input: + // These are the inputs from the parent node. + { + // The destination ports will not change, only the destination will + // be changed to point to the new compute node + IntrinsicInst* BI = + createIdenticalBindInputWithDifferentNode(II, IInewc); + IntrinsicInstructionsToAdd.push_back(BI); + CreateEdgeAndBindMap[II] = BI; + } + break; + case Intrinsic::visc_bind_output: + // These are the outputs to the parent node. + { + // If this goes to n2, ignore edge completely + unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + if (N1c->getExtendedOutDFEdgeAt(srcPos)->getDestDF() != N2c) { + // this bind creates an edge that ends up to another node in the graph + // Change source to new compute node + IntrinsicInst* IItemp1 = + createIdenticalBindOutputWithDifferentNode(II, IInewc); + // Change source port to new port after outmap + IntrinsicInst* IItemp2 = + createIdenticalBindOutputWithDifferentPort(IItemp1, + N1cOutMap[srcPos], true); + // Change destination port to new port after edgeoutmap + unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + IntrinsicInst* BI = + createIdenticalBindOutputWithDifferentPort(IItemp2, + N1OutDFEdgeMap[dstPos], false); + IntrinsicInstructionsToAdd.push_back(BI); + IntermediateInstructions.push_back(IItemp1); + IntermediateInstructions.push_back(IItemp2); + CreateEdgeAndBindMap[II] = BI; + } + } + break; + default: + errs() << "Unknown use: " << *II << "\n"; + assert(false && "Unknown use of node handle"); + break; + } + } + + for (Value::user_iterator i = II2c->user_begin(), ie = II2c->user_end(); + i != ie; ++i) { // Handle inputs and outputs of n2 compute node + Value *v = *i; + Instruction *VI = 
dyn_cast<Instruction>(v); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(VI); + assert(II && "Use of a node handle outside of a visc intrinsic"); + + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createEdge: + // This is between allocation and compute node of n2. + { + // These edges should have been handled when dealing with the + // allocation nodes + assert(CreateEdgeAndBindMap.find(II) != CreateEdgeAndBindMap.end() && + "Edge between A-C node should have been handled while processing A"); + } + break; + case Intrinsic::visc_bind_input: + // These are the inputs from the parent node. + { + // If this is incoming from n1 compute node, ignore completely + unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + if (N2c->getExtendedInDFEdgeAt(dstPos)->getSourceDF() != N1c) { + // this bind creates an edge that comes from another node in the graph + // Change destination to new compute node + IntrinsicInst* IItemp1 = + createIdenticalBindInputWithDifferentNode(II, IInewc); + // Change source port to new port after edgeinmap + unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + IntrinsicInst* IItemp2 = + createIdenticalBindInputWithDifferentPort(IItemp1, + N2InDFEdgeMap[srcPos], true); + // Change destination port to new port after inmap and shift + unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + IntrinsicInst* BI = + createIdenticalBindInputWithDifferentPort(IItemp2, + FcShiftMap[N2cInMap[dstPos]], false); + IntrinsicInstructionsToAdd.push_back(BI); + IntermediateInstructions.push_back(IItemp1); + IntermediateInstructions.push_back(IItemp2); + CreateEdgeAndBindMap[II] = BI; + } + } + break; + case Intrinsic::visc_bind_output: + // These are the outputs to the parent node. 
+ { + // this bind creates an edge that ends up to another node in the graph + // Change source to new compute node + IntrinsicInst* IItemp1 = + createIdenticalBindOutputWithDifferentNode(II, IInewc); + // Change source port to new port after outmap + unsigned srcPos = cast<ConstantInt>(II->getOperand(1))->getZExtValue(); + IntrinsicInst* IItemp2 = + createIdenticalBindOutputWithDifferentPort(IItemp1, + N2cOutMap[srcPos], true); + // Change destination port to new port after edgeoutmap + unsigned dstPos = cast<ConstantInt>(II->getOperand(2))->getZExtValue(); + IntrinsicInst* BI = + createIdenticalBindOutputWithDifferentPort(IItemp2, + N2OutDFEdgeMap[dstPos], false); + IntrinsicInstructionsToAdd.push_back(BI); + IntermediateInstructions.push_back(IItemp1); + IntermediateInstructions.push_back(IItemp2); + CreateEdgeAndBindMap[II] = BI; + } + break; + default: + assert(false && "Unknown use of node handle"); + break; + } + } + +} + +void deleteInternalNodeFunction(DFNode* N, BuildDFG &DFG) { + + if (dyn_cast<DFLeafNode>(N)) + return; + + for (inst_iterator i = inst_begin(N->getFuncPointer()), + e = inst_end(N->getFuncPointer()); i != e ; ++i) { + Instruction* I = &*i; // Grab pointer to Instruction + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) { + switch(II->getIntrinsicID()) { + case Intrinsic::visc_createNode: + case Intrinsic::visc_createNode1D: + case Intrinsic::visc_createNode2D: + case Intrinsic::visc_createNode3D: + // ---------------------------------------------------------------- // + // Updating the BuildDFG result + // remove the node from mapping + DFG.removeElementFromHandleToDFNodeMap(II); + // ---------------------------------------------------------------- // + break; + case Intrinsic::visc_createEdge: + case Intrinsic::visc_bind_input: + case Intrinsic::visc_bind_output: + // ---------------------------------------------------------------- // + // Updating the BuildDFG result + // remove the edge from mapping + 
DFG.removeElementFromHandleToDFEdgeMap(II); + // ---------------------------------------------------------------- // + break; + default: + errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" << *II << "\n"; + break; + } + } + } + + // Erase Functions associated with node N + Function* F = N->getFuncPointer(); + +errs() << "Removing " << F->getName() << "\n"; + F->replaceAllUsesWith(UndefValue::get(F->getType())); + F->eraseFromParent(); + +} + +/* +void shiftAttrsToLeftBy(Function* F, unsigned shift, unsigned argNo) { + // Source attr location : i+shift (+1), dst : i (+1) + for (unsigned i = argno; i + shift < F->getArgumentList().size(); i++) { + AttributeSet AS = F->getAttributes(); + AttrBuilder AB(AS, i+shift+1); + AttributeSet argAS = AttributeSet::get(F->getContext(), i+1, AB); + F->removeAttributes(i+1,AS.getParamAttributes(i+1)); + F->addAttributes(i+1, argAS); + } + +} + +void shiftArgumentNamesToLeftBy(Function* F, unsigned shift, unsigned argNo) { + // Source attr location : i+shift (+1), dst : i (+1) + + // Skip arguments up until argNo + Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(), as = F->arg_begin(); + for ( ; (ai != ae) && (ai->getArgNo() < argNo); ++ai, ++as) { } + + // Find source of name + for ( unsigned i = 0; (i < shift) && (as != ae); i++) { + ++as; + } + + for ( ; (ai != ae) && (as != ae); ++ai, ++as) { + ai->setName(as->getName()); + } + +} + +void removeFunctionArgument(Function* F, Argument *ArgToRemove) { + + // Shift attributes one to the left + shiftAttrsToLeftBy(F, 1, ArgToRemove->getArgNo()); + // Shift argument names one to the left + shiftArgumentNamesToLeftBy(F, 1, ArgToRemove->getArgNo()); + // Update the type of F + std::vector<Type*> ArgTypes; + for(auto& arg: F->getArgumentList()) { + DEBUG(errs() << arg << "\n"); + if (&arg != ArgToRemove) + ArgTypes.push_back(arg.getType()); + } + FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg()); + PointerType* PTy = 
FTy->getPointerTo(); + F->mutateType(PTy); + +} +*/ + +Argument* getFunctionArgumentAt(Function* F, unsigned i) { + assert((i < F->getArgumentList().size()) && "Requesting argument in invalid position"); + for (auto& arg: F->getArgumentList()) { + if (arg.getArgNo() == i) + return &arg; + } + return NULL; +} + +// TODO +void removeUnnecessaryInputEdges(DFNode* N, DFNode* N1, + unsigned numOfN1AllocArgs, + unsigned numOfN2AllocArgs) { + Function* F = N->getFuncPointer(); + Function* F1 = N1->getFuncPointer(); + // Compute these once - they may change while in the loop + unsigned f1ArgListSize = F1->getArgumentList().size(); + unsigned fArgListSize = F->getArgumentList().size(); + // Iterate over input parameters of F1 without allocation arguments + for (unsigned i = 0; i < f1ArgListSize - numOfN1AllocArgs; i++) { + DFEdge* N1InEdge = N->getInDFEdgeAt(i); + unsigned n1SrcPos = N1InEdge->getSourcePosition(); + for (unsigned j = f1ArgListSize - numOfN1AllocArgs, + pos = f1ArgListSize - numOfN1AllocArgs; + j < fArgListSize - numOfN2AllocArgs; j++, pos++) { + DFEdge* N2InEdge = N->getInDFEdgeAt(pos); + unsigned n2SrcPos = N2InEdge->getSourcePosition(); + Argument* n1arg = getFunctionArgumentAt(F, i); + Argument* n2arg = getFunctionArgumentAt(F, j); + DEBUG(errs() << "Comparing " << *n1arg << " with " << *n2arg << "\n"); + // If the edges are coming from the same position of the same source node + // If the arguments are not pointer arguments, or if they are pointer + // arguments without the out attribute (they are only used as inputs) + if ((N1InEdge->getSourceDF() == N2InEdge->getSourceDF()) && + (n1SrcPos == n2SrcPos) && + ((!(n1arg->getType()->isPointerTy()) && + !(n2arg->getType()->isPointerTy())) || + (!(hasAttribute(F, i, Attribute::Out)) && + !(hasAttribute(F, pos, Attribute::Out))) ) ) { + DEBUG(errs() << "Replacing " << *n1arg << " with " << *n2arg << "\n"); + // It is safe to remove the second argument and replace its uses with + // the first one + 
n2arg->replaceAllUsesWith(n1arg); +// removeFunctionArgument(F, n2arg); TODO +// removeInputEdgeAt(F, pos); TODO + } else { + // It is not safe to remove the second argument. Update position +// pos++; TODO increase here instead of loop increment + } + } + } +} + +// This function checks the metadata in visc code for a function's target hint +static visc::Target getPreferredTarget(Function* F) { + DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n"); + Module* M = F->getParent(); + // checking for GPU hint + NamedMDNode* HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::GPU_TARGET; + } + + // checking for SPIR hint + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::SPIR_TARGET; + } + + // checking for CPU hint + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + for(unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* N = HintNode->getOperand(i); + Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue(); + if(F == FHint) + return visc::CPU_TARGET; + } + return visc::None; +} + +// This function adds the hint as metadata in visc code +static void addHint(Function* F, visc::Target T) { + // Get Module + Module* M = F->getParent(); + DEBUG(errs() << "Set preferred target for " << F->getName() << ": " << T << "\n"); + + // Based on the hint, get the hint metadata + NamedMDNode* HintNode; + switch (T) { + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::SPIR_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); 
+ break; + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + default: + llvm_unreachable("Unsupported Target Hint!"); + break; + } + + // Create a node for the function and add it to the hint node + MDNode* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F))); + HintNode->addOperand(N); +} + +// This function removes the hint as metadata in visc code +static void removeHint(Function* F, visc::Target T) { + // Get Module + Module* M = F->getParent(); + DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T << "\n"); + + // Based on the hint, get the hint metadata + NamedMDNode* HintNode; + switch (T) { + case visc::GPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu"); + break; + case visc::SPIR_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_spir"); + break; + case visc::CPU_TARGET: + HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu"); + break; + default: + llvm_unreachable("Unsupported Target Hint!"); + break; + } + + // Gather metadata nodes, and keep those not associated with this function + MDNode* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F))); + std::vector<MDNode*> MDNodes; + + for (unsigned i = 0; i < HintNode->getNumOperands(); i++) { + MDNode* MDN = HintNode->getOperand(i); + if (MDN == N) { + continue; + } + MDNodes.push_back(MDN); + } + + HintNode->dropAllReferences(); + + for (unsigned i = 0; i < MDNodes.size(); i++) { + HintNode->addOperand(MDNodes[i]); + } + +} + +std::string getTestModuleName(Module &M) { + std::string mid = M.getModuleIdentifier(); + return mid.append(".original.ll"); +} + +} // End of namespace mergedfn + +char MergeDFN::ID = 0; +static RegisterPass<MergeDFN> X("mergedfn", + "Dataflow node merging optimization", + true /* modifies the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/lib/MergeDFN/MergeDFN.exports b/lib/MergeDFN/MergeDFN.exports new 
file mode 100644 index 0000000000..e69de29bb2 diff --git a/lib/ReplaceIntrinsics/CMakeLists.txt b/lib/ReplaceIntrinsics/CMakeLists.txt new file mode 100644 index 0000000000..0bfb2bf221 --- /dev/null +++ b/lib/ReplaceIntrinsics/CMakeLists.txt @@ -0,0 +1,13 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( ReplaceIntrinsics + ReplaceIntrinsics.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) + diff --git a/lib/ReplaceIntrinsics/LLVMBuild.txt b/lib/ReplaceIntrinsics/LLVMBuild.txt new file mode 100644 index 0000000000..6450fa1714 --- /dev/null +++ b/lib/ReplaceIntrinsics/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = ReplaceIntrinsics +parent = Transforms + diff --git a/lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp b/lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp new file mode 100644 index 0000000000..ef649d8e17 --- /dev/null +++ b/lib/ReplaceIntrinsics/ReplaceIntrinsics.cpp @@ -0,0 +1,516 @@ +//=== ReplaceApproxHPVMIntrinsicsWithFCalls.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#define ENABLE_ASSERTS + +#define DEBUG_TYPE "REPLACE_APPROXHPVM_INTRINSICS_WITH_FCALLS" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h" +#include <sstream> + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +// TODO: We still need in place analysis, if calls have the same interface +using namespace inplacedfg; + +namespace { +// Helper class declarations + +// Replace ApproxHPVM intrinsics with LLVM function calls. +// aiming to go through the CPU backend code generation. 
+ +struct DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls : public DFG2LLVM { + static char ID; // Pass identification, replacement for typeid + DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls() : DFG2LLVM(ID) {} +private: + +public: + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<BuildDFG>(); + AU.addRequired<InPlaceDFGAnalysisWrapper>(); + AU.addPreserved<BuildDFG>(); + AU.addPreserved<InPlaceDFGAnalysisWrapper>(); + } + + bool runOnModule(Module &M); +}; + +// Visitor for Code generation traversal (tree traversal for now) +class CGT_ReplaceApproxHPVMIntrinsicsWithFCalls : public CodeGenTraversal { + +private: + //Member variables + InPlaceDFGAnalysis::InPlaceDFGParameter *IPP; + + // VISC Runtime API and Tensor runtime API + + /* TODO: I believe that TensorRt is not needed, since we will have llvm + implementations linked in, so init and cleanup calls can be removed and + relevant code also, but I leave in in for now until verified. */ + Constant* llvm_hpvm_initTensorRt; + Constant* llvm_hpvm_cleanupTensorRt; +// Constant* hpvm_request_tensor; DONE: request tensor will not be used + + // Functions + bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N); + + // Virtual Functions + void init(); + void initRuntimeAPI(); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); + +public: + + // Constructor + CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP) + : CodeGenTraversal(_M, _DFG), IPP(&_IPP) { + initRuntimeAPI(); + } + +}; + +bool CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::isValidOperandForInPlaceOperation(Value *Op, + Function *Fgen, + DFNode *N) { + // We only expect the if branch to be taken + if (Argument *Arg = dyn_cast<Argument>(Op)) { + DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n"); + assert((Arg->getParent() == Fgen) && + "Extra Parameter in body of Function\n"); + // Candidae parameter is a function argument + // 
In this case, consult the result of in place analysis + // Find position in arg list + unsigned pos = Arg->getArgNo(); + // If this parameter cannot be used for in place operation + // code gen cannot continue + if (IPP->at(N)[pos]) { + DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n"); + return true; + } else { + DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n"); + return false; + } + } + else { + // If it is not an argument, then it needs to be the result of + // another intrinsic. These are new objects that are allocated, + // and consumed by next intrinsic. Alternatively, the intrinsic + // could have been replaced by a call to an LLVM function. + // We do not expect a merge pass to have run before the replacement pass, + // therefore we do not expect to go in the else branch. + DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n"); + if (dyn_cast<IntrinsicInst>(Op)) { + DEBUG(errs() << *Arg << "\t: local, suitable for in place\n"); + return true; + } else if (CallInst *CI = dyn_cast<CallInst>(Op)) { + if ((CI->getCalledFunction()->getName()).startswith("tensor")) + return true; + else + return false; + } + else { + DEBUG(errs() << *Arg << "\t: local, not suitable for in place\n"); + return false; + } + } +} + + +void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::init() { +} + +// Initialize the VISC runtime API. 
This makes it easier to insert these calls +void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n"); + + // FIXME: set correct path + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_cpu_runtime.ll"; + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + if(runtimeModule == nullptr) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n"); + + // Get or insert Global declarations for + // - initialization + // - cleanup + // - request a tensor + DECLARE(llvm_hpvm_initTensorRt); + DECLARE(llvm_hpvm_cleanupTensorRt); +// DECLARE(hpvm_request_tensor); + + // Find visc.init and visc.cleanup calls, and add placeholder methods + // for initialization and cleanup of the hpvm tensor runtime + + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n"); + InitCall = cast<Instruction>(*VI->user_begin()); + CallInst::Create(llvm_hpvm_initTensorRt, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)), + "", InitCall); + + Function* VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n"); + CleanupCall = cast<Instruction>(*VC->user_begin()); + CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall); + +} + +void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFInternalNode* N) { + errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"; + errs () << "Skipping internal node\n"; +} + + +void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFLeafNode* N) { + + // Skip if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } 
+ + // Abort if it is an allocation node + if(N->isAllocationNode()) { + assert(false && "Allocation Node not expected in ApproxHPVM"); + return; + } + + // Search for intrinsic only if it has the right hint + if (!checkPreferredTarget(N, visc::CPU_TARGET)) { + errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + return; + } + + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); + errs()<<"function name = "<< F->getName()<<"\n"; + + std::vector<IntrinsicInst *> IItoRemove; + + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + if (BuildDFG::isViscIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") + && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + /********************* Handle VISC Tensor intrinsics ********************/ + // We replace them with calls to functions with implementations at the LLVM level + switch (II->getIntrinsicID()) { + + case Intrinsic::visc_tensor_convolution: + { /* llvm.hpvm.tensor.convolution */ + DEBUG(errs() << F->getName() << "\t: Handling tensor convolution \n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + Args.push_back(II->getOperand(2)); + Args.push_back(II->getOperand(3)); + Args.push_back(II->getOperand(4)); + Args.push_back(II->getOperand(5)); + + Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1); + Constant* conv_precision = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + + Args.push_back(conv_mode); + Args.push_back(conv_precision); + + // Create function call + Constant* tensorConvolutionCPU; + DECLARE(tensorConvolutionCPU); + + CallInst* CI = CallInst::Create(tensorConvolutionCPU, + Args, "", II); + // We can replace the call to hpvm.tensor.mul with the LLVM call + 
II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_tensor_mul: + { /* llvm.hpvm.tensor.mul */ + DEBUG(errs() << F->getName() << "\t: Handling tensor mul\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + + // Create function call + Constant* tensorGemmCPU; + DECLARE(tensorGemmCPU); + + CallInst* CI = CallInst::Create(tensorGemmCPU, + Args, "", II); + // We can replace the call to hpvm.tensor.mul with the LLVM call + II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_tensor_add: + { /* llvm.hpvm.tensor.add */ + DEBUG(errs() << F->getName() << "\t: Handling tensor add\n"); + // Tensor add(a,b) is in place for argument a. + Value *Op = II->getOperand(0); + + // Test the intrinsic operand for in place operation. + bool inplace = isValidOperandForInPlaceOperation(Op, F, N); + // Code generation cannot continue if this is false, because the target + // only provides an in place operation + + // FIXME: remove this comment - must check for in-place + //assert(inplace && + // "Operand not valid for in place operation. 
Code gen aborted.\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + + // Create function call + Constant* tensorAddCPU; + DECLARE(tensorAddCPU); + CallInst::Create(tensorAddCPU, Args, "", II); + // We can replace the call to hpvm.tensor.add with the 1st argument + // that, due to in place operation, now contains the result + II->replaceAllUsesWith(II->getOperand(0)); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_tensor_pool_max: + case Intrinsic::visc_tensor_pool_mean: + { /* llvm.visc.tensor.relu */ + DEBUG(errs() << F->getName() << "\t: Handling tensor_pool_max\n"); + // Tensor relu(a) is in place for argument a. + Value *Op = II->getOperand(0); + + // Test the intrinsic operand for in place operation. + bool inplace = isValidOperandForInPlaceOperation(Op, F, N); + // Code generation cannot continue if this is false, because the target + // only provides an in place operation + assert(inplace && + "Operand not valid for in place operation. Code gen aborted.\n"); + + // Argument list - tensorPooling(input, poolFunction, window_height, window_width, vertical_pad, horizontal_pad, + // vertical_stride, horizontal_stride); + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + + int pool_type = 0; + if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_max){ + pool_type = 0; + } + if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean){ + pool_type = 1; + } + + Constant* constPoolType = ConstantInt::get(Type::getInt32Ty(M.getContext()), pool_type); + Args.push_back(constPoolType); // ID for max pool. 
Min/Avg have different IDs (non-zero) + Args.push_back(II->getOperand(1)); + Args.push_back(II->getOperand(2)); + Args.push_back(II->getOperand(3)); + Args.push_back(II->getOperand(4)); + Args.push_back(II->getOperand(5)); + Args.push_back(II->getOperand(6)); + + // Create function call + Constant* tensorPoolingCPU; + DECLARE(tensorPoolingCPU); + CallInst* CI = CallInst::Create(tensorPoolingCPU, Args, "", II); + + // Replacing intrinsic result uses with the result of the LLVM call + II->replaceAllUsesWith(CI); + + // Mark to remove at the end + IItoRemove.push_back(II); + }break; + + case Intrinsic::visc_tensor_relu: + case Intrinsic::visc_tensor_clipped_relu: + case Intrinsic::visc_tensor_tanh: + { /* llvm.visc.tensor.relu */ + DEBUG(errs() << F->getName() << "\t: Handling tensor activation functions \n"); + // Tensor relu(a) is in place for argument a. + Value *Op = II->getOperand(0); + + // Test the intrinsic operand for in place operation. + bool inplace = isValidOperandForInPlaceOperation(Op, F, N); + // Code generation cannot continue if this is false, because the target + // only provides an in place operation + assert(inplace && + "Operand not valid for in place operation. 
Code gen aborted.\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + + if (II->getIntrinsicID() == Intrinsic::visc_tensor_relu){ + // Create function call + Constant* tensorReluCPU; + DECLARE(tensorReluCPU); + CallInst::Create(tensorReluCPU, Args, "", II); + } + else if (II->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu){ + // Create function call + //-- Constant* tensorClippedRelu; + Constant* tensorRelu2CPU; + DECLARE(tensorRelu2CPU); + CallInst::Create(tensorRelu2CPU, Args, "", II); + } + else if (II->getIntrinsicID() == Intrinsic::visc_tensor_tanh){ + // Create function call + Constant* tensorTanhCPU; + errs()<<"tensorTanh Call = \n\n"; + DECLARE(tensorTanhCPU); + //errs()<<"tensorTanh Call = "<<*tensorTanh<<"\l"; + CallInst::Create(tensorTanhCPU, Args, "", II); + } + + // We can replace the call to hpvm.tensor.relu with the 1st argument + // that, due to in place operation, now contains the result + II->replaceAllUsesWith(II->getOperand(0)); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + case Intrinsic::visc_tensor_softmax: + { /* llvm.visc.tensor.softmax */ + DEBUG(errs() << F->getName() << "\t: Handling tensor softmax\n"); + // Tensor relu(a) is in place for argument a. + Value *Op = II->getOperand(0); + + // Test the intrinsic operand for in place operation. + bool inplace = isValidOperandForInPlaceOperation(Op, F, N); + // Code generation cannot continue if this is false, because the target + // only provides an in place operation + assert(inplace && + "Operand not valid for in place operation. 
Code gen aborted.\n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + + // Create function call + Constant* tensorSoftmaxCPU; + DECLARE(tensorSoftmaxCPU); + CallInst::Create(tensorSoftmaxCPU, Args, "", II); + // We can replace the call to hpvm.tensor.softmax with the 1st argument + // that, due to in place operation, now contains the result + II->replaceAllUsesWith(II->getOperand(0)); + + // Mark to remove at the end + IItoRemove.push_back(II); + } + break; + + default: + llvm_unreachable("Unknown VISC Intrinsic!"); + break; + + } + + } + + } + + // We need to do this explicitly: DCE pass may not remove them. + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around. + for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + errs() << "Erasing: " << **ri << "\n"; + (*ri)->eraseFromParent(); + } + + return; +} + +bool DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // Get the In Place Analysis Results + InPlaceDFGAnalysis::InPlaceDFGParameter IPP = + (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP(); + // Print results + printInPlaceDFGParameter(IPP); + + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Visitor for Code Generation Graph Traversal + CGT_ReplaceApproxHPVMIntrinsicsWithFCalls *CGTVisitor = + new CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(M, DFG, IPP); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } + + //TODO: Edit module epilogue to remove the VISC intrinsic 
declarations + delete CGTVisitor; + + return true; +} + + +/****************************************************************************** + * Helper functions * + ******************************************************************************/ + + +} // End of namespace + +char DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::ID = 0; +static RegisterPass<DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls> X("replace-intrinsics", + "Replace ApproxHPVM intrinsics with LLVM calls", + false /* does not modify the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/lib/ReplaceIntrinsics/ReplaceIntrinsics.exports b/lib/ReplaceIntrinsics/ReplaceIntrinsics.exports new file mode 100644 index 0000000000..e69de29bb2 -- GitLab