diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt
index a189075fff07ece58c2da01909fde4f77cf8954d..9dd826ef4371298bb880fad5de9094335fb86779 100644
--- a/llvm/lib/Transforms/CMakeLists.txt
+++ b/llvm/lib/Transforms/CMakeLists.txt
@@ -21,3 +21,6 @@ add_subdirectory(ApproxScheduler)
 add_subdirectory(GenVISC)
 add_subdirectory(MergeDFN)
 add_subdirectory(FuseHPVMTensorNodes)
+add_subdirectory(ReplaceIntrinsics)
+add_subdirectory(DFG2LLVM_X86_dsoc)
+add_subdirectory(InlineTensorCalls)
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/CMakeLists.txt b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..75569adddae7232ff10988d89f5a7f98626a12c9
--- /dev/null
+++ b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( DFG2LLVM_X86_dsoc
+  DFG2LLVM_X86_dsoc.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbe5e4f6bd836a31550b784d8a88730a6984a7be
--- /dev/null
+++ b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp
@@ -0,0 +1,2128 @@
+//===---------------------- DFG2LLVM_X86_dsoc.cpp ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "DFG2LLVM_X86"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/SupportVISC/DFG2LLVM.h"
+
+using namespace llvm;
+using namespace builddfg;
+using namespace dfg2llvm;
+
+// VISC command line option to enable timers
+static cl::opt<bool>
+VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
+// Command line option to enable device abstraction
+static cl::opt<bool>
+DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
+                  cl::desc("Enable visc device abstraction"));
+
+
+namespace {
+
+// Helper Functions
+static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
+  if (!isa<CallInst>(I))
+    return false;
+  CallInst *CI = cast<CallInst>(I);
+  return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion");
+}
+
+CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
+  for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) {
+    Instruction *I = &*ib;
+    if (isVISCCall_llvm_visc_policy_getVersion(I))
+      return cast<CallInst>(I);
+  }
+  return NULL;
+}
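+// Illustration only (not code emitted by this pass): the helpers above match
+// direct calls to the VISC policy runtime, looking through pointer casts of
+// the callee. Assuming a node function that queries the scheduling policy,
+// the IR being matched would look roughly like
+//
+//   %version = call i32 @llvm_visc_policy_getVersion(i8* %node_id, i64 %iter)
+//
+// The exact signature here is an assumption for illustration; the matcher
+// relies only on the callee name after stripPointerCasts().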
+// DFG2LLVM_X86 - The first implementation.
+struct DFG2LLVM_X86 : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_X86() : DFG2LLVM(ID) {}
+
+private:
+  // Member variables
+
+  // Functions
+
+public:
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_X86 : public CodeGenTraversal {
+
+private:
+  // Member variables
+
+  Constant* malloc;
+  // VISC Runtime API
+  Constant* llvm_visc_x86_launch;
+  Constant* llvm_visc_x86_wait;
+  Constant* llvm_visc_x86_argument_ptr;
+
+  Constant* llvm_visc_streamLaunch;
+  Constant* llvm_visc_streamPush;
+  Constant* llvm_visc_streamPop;
+  Constant* llvm_visc_streamWait;
+  Constant* llvm_visc_createBindInBuffer;
+  Constant* llvm_visc_createBindOutBuffer;
+  Constant* llvm_visc_createEdgeBuffer;
+  Constant* llvm_visc_createLastInputBuffer;
+  Constant* llvm_visc_createThread;
+  //Constant* llvm_visc_freeThreads;
+  Constant* llvm_visc_bufferPush;
+  Constant* llvm_visc_bufferPop;
+  Constant* llvm_visc_x86_dstack_push;
+  Constant* llvm_visc_x86_dstack_pop;
+  Constant* llvm_visc_x86_getDimLimit;
+  Constant* llvm_visc_x86_getDimInstance;
+
+  // Functions
+  std::vector<IntrinsicInst*>* getUseList(Value* LI);
+  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
+  void addDoWhileLoop(Instruction*, Instruction*, Value*);
+  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
+  Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
+  Argument* getArgumentFromEnd(Function* F, unsigned offset);
+  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                      Instruction* InsertBefore);
+  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
+                       Instruction* InsertBefore);
+  StructType* getArgumentListStructTy(DFNode*);
+  Function* createFunctionFilter(DFNode* C);
+  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
+                       Value*, Value*, Instruction*);
+  Function* createLaunchFunction(DFInternalNode*);
+  Function* createPushFunction(DFInternalNode*);
+  Function* createPopFunction(DFInternalNode*);
+  Function* createWaitFunction(DFInternalNode*);
+
+  // Virtual Functions
+  void init() {
+    VISCTimer = VISCTimer_X86;
+    TargetName = "X86";
+  }
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+  Function* codeGenStreamPush(DFInternalNode* N);
+  Function* codeGenStreamPop(DFInternalNode* N);
+
+public:
+  // Constructor
+  CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
+    init();
+    initRuntimeAPI();
+  }
+
+  void codeGenLaunch(DFInternalNode* Root);
+  void codeGenLaunchStreaming(DFInternalNode* Root);
+};
+
+bool DFG2LLVM_X86::runOnModule(Module &M) {
+  errs() << "\nDFG2LLVM_X86 PASS\n";
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  //DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
+  // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode : Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+    // Go ahead and replace the launch intrinsic with a pthread call; otherwise return now.
+    // TODO: Later on, we might like to do this in a separate pass, which would
+    // allow us the flexibility to switch between complete static code generation
+    // for the DFG or having a customized runtime+scheduler
+
+    // Do streaming code generation if the root node is streaming; do the usual
+    // code generation otherwise
+    if (rootNode->isChildGraphStreaming())
+      CGTVisitor->codeGenLaunchStreaming(rootNode);
+    else
+      CGTVisitor->codeGenLaunch(rootNode);
+  }
+
+  delete CGTVisitor;
+  return true;
+}
+
+// Initialize the VISC runtime API. This makes it easier to insert these calls
+void CGT_X86::initRuntimeAPI() {
+
+  // Load Runtime API Module
+  SMDiagnostic Err;
+
+  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
+
+  // FIXME: hardcoded path to 'build_dsoc' - should probably be an environment variable
+  Twine llvmSrcRoot = LLVM_SRC_ROOT;
+  Twine runtimeAPI = llvmSrcRoot+"/../build_dsoc/projects/visc-rt/visc-rt.ll";
+
+  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+
+  if(runtimeModule == NULL)
+    DEBUG(errs() << Err.getMessage());
+  else
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+  // Get or insert the global declarations for launch/wait functions
+  DECLARE(llvm_visc_x86_launch);
+  DECLARE(malloc);
+  DECLARE(llvm_visc_x86_wait);
+  DECLARE(llvm_visc_x86_argument_ptr);
+  DECLARE(llvm_visc_streamLaunch);
+  DECLARE(llvm_visc_streamPush);
+  DECLARE(llvm_visc_streamPop);
+  DECLARE(llvm_visc_streamWait);
+  DECLARE(llvm_visc_createBindInBuffer);
+  DECLARE(llvm_visc_createBindOutBuffer);
+  DECLARE(llvm_visc_createEdgeBuffer);
+  DECLARE(llvm_visc_createLastInputBuffer);
+  DECLARE(llvm_visc_createThread);
+  //DECLARE(llvm_visc_freeThreads);
+  DECLARE(llvm_visc_bufferPush);
+  DECLARE(llvm_visc_bufferPop);
+  DECLARE(llvm_visc_x86_dstack_push);
+  DECLARE(llvm_visc_x86_dstack_pop);
+  DECLARE(llvm_visc_x86_getDimLimit);
+  DECLARE(llvm_visc_x86_getDimInstance);
+
+  // Get or insert timerAPI functions as well if you plan to use timers
+  initTimerAPI();
+
+  // Insert init context in main
+  Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
+  DEBUG(errs() << "Inserting x86 timer initialization\n");
+  Instruction* I = cast<Instruction>(*VI->user_begin());
+  initializeTimerSet(I);
+  switchToTimer(visc_TimerID_NONE, I);
+  // Insert code for initializing the scheduling policy
+  Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init",
+      runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType()));
+  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  DEBUG(errs() << *IPCallInst << "\n");
+
+  // If device abstraction is enabled, we add a runtime call to start the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+        cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start",
+            runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+  // Insert print instruction at visc exit
+  Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
+
+  // Insert code for clearing the scheduling policy
+  I = cast<Instruction>(*VC->user_begin());
+  IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear",
+      runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType()));
+  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  DEBUG(errs() << *IPCallInst << "\n");
+
+  DEBUG(errs() << "Inserting x86 timer print\n");
+  printTimerSet(I);
+
+  // If device abstraction is enabled, we add a runtime call to end the
+  // device status simulation
+  if (DeviceAbstraction) {
+    Function *ID =
+        cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end",
+            runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType()));
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    DEBUG(errs() << *IDCallInst << "\n");
+  }
+
+}
+
+/* Returns a vector of all the intrinsic uses (wait, push, pop) of a graph ID
+ */
+std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
+  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
+  // It must have been loaded from memory somewhere
+  for(Value::user_iterator ui = GraphID->user_begin(),
+      ue = GraphID->user_end(); ui != ue; ++ui) {
+    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
+      UseList->push_back(waitI);
+    }
+    //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){
+      //errs() << "Found PhiNode use of graphID\n";
+      //std::vector<IntrinsicInst*>* phiUseList = getUseList(PN);
+      //UseList->insert(UseList->end(), phiUseList->begin(), phiUseList->end());
+      //free(phiUseList);
+    //}
+    else {
+      llvm_unreachable("Error: Operation on Graph ID not supported!\n");
+    }
+  }
+  return UseList;
+}
+
+/* Traverse the function argument list in reverse order to get the argument at a
+ * distance offset from the end of the argument list of function F
+ */
+Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
+         && "Invalid offset to access arguments!");
+  Function::arg_iterator e = F->arg_end();
+  // arg_end() is past-the-end; step back to the last real argument
+  e--;
+  Argument* arg;
+  for( ; offset != 0; e--) {
+    offset--;
+    arg = &*e;
+  }
+  return arg;
+}
+/* Add a while loop around the instructions from BodyStart to BodyEnd
+ * Algorithm:
+ * (1) Split the basic block into three parts: entry, condition block, and
+ *     loop body
+ * (2) Replace the terminator of the condition block with a conditional branch
+ *     that exits to while.end when the termination condition is true and
+ *     enters while.body otherwise
+ * (3) Make the body jump back to the condition block
+ */
+void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
+                           Instruction* BodyEnd, Value* TerminationCond) {
+  BasicBlock* Entry = CondBlockStart->getParent();
+  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
+  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
+  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+
+  // Replace the terminator instruction of the condition block with a new
+  // conditional branch which goes to while.end if the termination condition
+  // is true and branches to while.body otherwise
+  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
+  ReplaceInstWithInst(CondBlock->getTerminator(), BI);
+
+  // While Body should jump to condition block
+  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
+  ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
+
+}
+
+Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+                                          BasicBlock *Body) {
+  Module *M = Entry->getParent()->getParent();
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+  // Insert a PHI instruction at the beginning of the condition block
+  Instruction *IB = Cond->getFirstNonPHI();
+  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
+
+  ConstantInt *IConst =
+      ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+  Instruction *CounterIncr =
+      BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                "cnt_incr", Body->getTerminator());
+
+  // Set incoming values for the Phi node
+  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
+  CounterPhi->addIncoming(IConst, Entry);
+  CounterPhi->addIncoming(CounterIncr, Body);
+
+  // Return the created PHI node (the loop counter)
+  return CounterPhi;
+}
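+// Illustration only (hypothetical IR shape, not emitted verbatim): after
+// addWhileLoop(CondStart, BodyStart, BodyEnd, Cond) plus addWhileLoopCounter,
+// the original basic block has been split into:
+//
+//   entry:
+//     ...                               ; br label %condition
+//   condition:
+//     %cnt = phi i64 [ 0, %entry ], [ %cnt_incr, %while.body ]
+//     ...                               ; br i1 %cond, label %while.end, label %while.body
+//   while.body:
+//     ...
+//     %cnt_incr = add nsw i64 %cnt, 1   ; br label %condition
+//   while.end:
+//     ...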
+/* Add a do-while loop around the instructions from From to To
+ * Algorithm:
+ * (1) Split the basic block at From to get the loop body
+ * (2) Split the body at To to get the loop exit
+ * (3) Replace the terminator of the body with a conditional branch which
+ *     branches to for.end when the termination condition is true and loops
+ *     over the body otherwise
+ */
+void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) {
+  BasicBlock* Entry = From->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body");
+
+  // The To instruction must belong to the same basic block as From, as the
+  // split basic block will have a terminator instruction
+  assert(To->getParent() == ForBody
+         && "To Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end");
+
+  // Replace the terminator instruction of for.body with a new conditional
+  // branch which branches to for.end when the termination condition is true
+  // and loops over the body otherwise
+  BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+}
+
+/* Add a loop around the instruction I
+ * Algorithm:
+ * (1) Split the basic block of instruction I into three parts, where the
+ *     middle block/body would contain instruction I.
+ * (2) Add a phi node before instruction I. Add an incoming edge to the phi
+ *     node from the predecessor
+ * (3) Add increment and compare instructions on the index variable
+ * (4) Replace the terminator/branch instruction of the body with a conditional
+ *     branch which loops over the body if true and goes to the end if false
+ * (5) Update the phi node of the body
+ */
+Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
+  BasicBlock* Entry = I->getParent();
+  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
+
+  BasicBlock::iterator i(I);
+  ++i;
+  Instruction* NextI = &*i;
+  // The next instruction must belong to the same basic block, as the split
+  // basic block will have a terminator instruction
+  assert(NextI->getParent() == ForBody
+         && "Next Instruction should also belong to the same basic block!");
+  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
+
+
+  // Add a Phi Node for the index variable
+  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
+                                      2, "index."+indexName, I);
+
+  // Add incoming edge to the phi
+  IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
+                        Entry);
+  // Increment the index variable
+  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
+      IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+      "index."+indexName+".inc", ForBody->getTerminator());
+
+  // Compare the index variable with the limit
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
+                                  limit, "cond."+indexName, ForBody->getTerminator());
+
+  // Replace the terminator instruction of for.body with a new conditional
+  // branch which loops over the body if true and branches to for.end otherwise
+  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  ReplaceInstWithInst(ForBody->getTerminator(), BI);
+
+  // Add incoming edge to the phi node in the body
+  IndexPhi->addIncoming(IndexInc, ForBody);
+  return IndexPhi;
+}
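+// Illustration only (hypothetical IR shape): addLoop(I, %limit, "x") wraps the
+// block containing I into:
+//
+//   entry:
+//     ...                                   ; br label %for.body
+//   for.body:
+//     %index.x = phi i64 [ 0, %entry ], [ %index.x.inc, %for.body ]
+//     <instruction I>
+//     %index.x.inc = add i64 %index.x, 1
+//     %cond.x = icmp ult i64 %index.x.inc, %limit
+//     br i1 %cond.x, label %for.body, label %for.end
+//   for.end:
+//     ...
+//
+// The returned value is %index.x, which invokeChild_X86 wires into the
+// generated call as the loop index for that dimension.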
+// Returns a packed struct type. The struct type is created by packing the
+// input types, output types and the isLastInput buffer type. All the streaming
+// inputs/outputs are converted to i8*, since this is the type of buffer
+// handles.
+StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
+  std::vector<Type*> TyList;
+  // Input types
+  Function* CF = C->getFuncPointer();
+  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
+      ai != ae; ++ai) {
+    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
+      TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+    else
+      TyList.push_back(ai->getType());
+  }
+  // Output Types
+  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
+    // All outputs of a node are streaming edges
+    assert(C->getOutDFEdgeAt(i)->isStreamingEdge()
+           && "All output edges of child node have to be streaming");
+    TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+  }
+  // isLastInput buffer element
+  TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
+
+  StructType* STy = StructType::create(CF->getContext(), TyList,
+                        Twine("struct.thread."+CF->getName()).str(), true);
+  return STy;
+
+}
+
+void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
+                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
+                              Instruction* IB) {
+  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create a filter/pipeline function for the child node
+  Function* C_Pipeline = createFunctionFilter(C);
+  Function* CF = C->getFuncPointer();
+
+  // Get the module context and the i32 0 constant, as they are frequently used
+  // in this function.
+  LLVMContext& Ctx = IB->getParent()->getContext();
+  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  // Marshall arguments
+  // Create a packed struct type with the inputs of C followed by the outputs
+  // and then another i8* to indicate the isLastInput buffer. Streaming inputs
+  // are replaced by i8*
+  StructType* STy = getArgumentListStructTy(C);
+  // Allocate the struct on the heap *NOT* the stack and bitcast i8* to STy*
+  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
+                                  C->getFuncPointer()->getName()+".inputs", IB);
+  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
+  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
+  // Insert elements in the struct
+  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Marshall Inputs
+  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
+    // Create constant int (i)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                                 ArrayRef<Value*>(GEPIndices, 2),
+                                 Struct->getName()+".arg_"+Twine(i),
+                                 IB);
+    DFEdge* E = C->getInDFEdgeAt(i);
+    if (E->getSourceDF()->isEntryNode()) {
+      // This is a Bind Input Edge
+      if(E->isStreamingEdge()) {
+        // Streaming Bind Input edge. Get the buffer corresponding to it
+        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
+        new StoreInst(EdgeBufferMap[E], GEP, IB);
+      }
+      else {
+        // Non-streaming Bind edge
+        new StoreInst(Args[i], GEP, IB);
+      }
+    }
+    else {
+      // This is an edge between siblings. It must be a streaming edge, as we
+      // assume that all edges between two nodes in a DFG are streaming.
+      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
+      new StoreInst(EdgeBufferMap[E], GEP, IB);
+    }
+  }
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
+  // Marshall Outputs
+  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  for(unsigned i = 0; i < numOutputs; i++ ) {
+    // Create constant int (i+numInputs)
+    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
+    // Get Element pointer instruction
+    Value* GEPIndices[] = { IntZero, Int_i };
+    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                                 ArrayRef<Value*>(GEPIndices, 2),
+                                 Struct->getName()+".out_"+Twine(i),
+                                 IB);
+    DFEdge* E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() && "All output edges of a node must be streaming");
+    assert(EdgeBufferMap.count(E) && "No mapping buffer for an Out Streaming DFEdge!");
+    new StoreInst(EdgeBufferMap[E], GEP, IB);
+  }
+  // Marshall the last argument: the isLastInput buffer
+  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
+  // Create constant int (numInputs+numOutputs)
+  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
+  // Get Element pointer instruction
+  Value* GEPIndices[] = { IntZero, Int_index };
+  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
+                               ArrayRef<Value*>(GEPIndices, 2),
+                               Struct->getName()+".isLastInput", IB);
+  new StoreInst(isLastInputBuffer, GEP, IB);
+
+  // Struct now points to heap memory with all the arguments packed.
+  // Call the runtime to create the thread with these arguments
+  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << *llvm_visc_createThread << "\n");
+  DEBUG(errs() << *graphID->getType() << "\n");
+  DEBUG(errs() << *C_Pipeline->getType() << "\n");
+  DEBUG(errs() << *Struct->getType() << "\n");
+  // Bitcast the struct pointer to i8*
+  CastInst* BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
+  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
+  CallInst* CreateThread = CallInst::Create(llvm_visc_createThread,
+                                            ArrayRef<Value*>(CreateThreadArgs, 3),
+                                            "",
+                                            IB);
+
+}
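+// Illustration only (hypothetical layout): for a child node C with inputs
+// (i32 %a, float* %b) where %b arrives on a streaming edge, and one output,
+// getArgumentListStructTy/startNodeThread marshal a packed struct like:
+//
+//   %struct.thread.C = type <{ i32,        ; non-streaming input %a
+//                              i8*,        ; buffer handle for streaming %b
+//                              i8*,        ; buffer handle for output 0
+//                              i8* }>      ; isLastInput buffer handle
+//
+// The struct is malloc'ed, filled in with StoreInsts, cast to i8*, and handed
+// to llvm_visc_createThread together with the graph ID and the pipeline
+// function created by createFunctionFilter.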
+Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Streaming Launch Function\n");
+  // Get the Function associated with Node N
+  Function* NF = N->getFuncPointer();
+
+  // Map from Streaming edge to buffer
+  DenseMap<DFEdge*, Value*> EdgeBufferMap;
+
+  /* We now have all the global declarations necessary to generate the Launch
+   * function, a pointer to which can be passed to the pthread utils to execute
+   * the DFG. The Launch function has just one data input: i8* data.addr.
+   * This is the address of all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
+   * (2) Extract each of the inputs from data.addr
+   * (3) Create buffers for all the streaming edges
+   *     - Put buffers in the context
+   * (4) Go over each child node
+   *     - marshall its arguments together (use buffers in place of streaming
+   *       arguments)
+   *     - Start the threads
+   * (5) The return value from Root is stored in memory, a pointer to which is
+   *     passed to the pthread_exit call.
+   */
+  // (1) Create Launch Function of type void (i8* args, i8* GraphID)
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
+  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
+                                   ArrayRef<Type*>(ArgTypes, 2), false);
+  Function* LaunchFunc = Function::Create(LaunchFuncTy,
+                                          NF->getLinkage(),
+                                          NF->getName()+".LaunchFunction",
+                                          &M);
+  DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
+  // Give names to the arguments which are used to pass data to this thread
+  Argument* data = &*LaunchFunc->arg_begin();
+  Argument* graphID = &*(++LaunchFunc->arg_begin());
+  data->setName("data.addr");
+  graphID->setName("graphID");
+  // Add a basic block to this empty function and a return statement to it
+  DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
+  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
+  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
+                                      BB);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+
+  // (2) Extract each of the inputs from data.addr
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  std::vector<Value*> Args;
+
+  for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
+       ai != ae; ++ai) {
+    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back(Twine(ai->getName()+"_buffer").str());
+      continue;
+    }
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  Args = extractElements(data, TyList, names, RI);
+  DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc << "\n");
+  // (3) Create buffers for all the streaming edges
+  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
+      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
+    DFEdge* Edge = *di;
+    DEBUG(errs() << *Edge->getType() << "\n");
+    Value* size = ConstantExpr::getSizeOf(Edge->getType());
+    Value* CallArgs[] = {graphID, size};
+    if (Edge->isStreamingEdge()) {
+      CallInst* CI;
+      // Create a buffer call
+      if(Edge->getSourceDF()->isEntryNode()) {
+        // Bind Input Edge
+        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
+                                               Edge->getSourcePosition());
+        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
+        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
+                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else if(Edge->getDestDF()->isExitNode()) {
+        // Bind Output Edge
+        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
+                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      else {
+        // Streaming Edge
+        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
+                              ArrayRef<Value*>(CallArgs, 2),
+                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
+                              +Edge->getDestDF()->getFuncPointer()->getName(),
+                              RI);
+      }
+      EdgeBufferMap[Edge] = CI;
+    }
+  }
+  // Create a buffer for isLastInput for all the child nodes
+  DFGraph* G = N->getChildGraph();
+  DenseMap<DFNode*, Value*> NodeLastInputMap;
+  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
+    DFNode* child = *ci;
+    if(child->isDummyNode())
+      continue;
+    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value* CallArgs[] = {graphID, size};
+    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
+                                    "BindIn.isLastInput."+child->getFuncPointer()->getName(),
+                                    RI);
+    NodeLastInputMap[child] = CI;
+  }
+  DEBUG(errs() << "Start Each child node filter\n");
+  // (4) Marshall arguments for each child node and start the thread with its
+  // pipeline function
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    // Marshall all the arguments for this node into an i8*
+    // Pass to the runtime to create the thread
+    // Start the thread for child node C
+    startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
+  }
+
+  DEBUG(errs() << "Launch function:\n");
+  DEBUG(errs() << *LaunchFunc << "\n");
+
+  return LaunchFunc;
+}
+
+
+Function* CGT_X86::createPushFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Push function\n");
+  Function* PushFunc = NULL; // TODO: unimplemented stub
+  return PushFunc;
+}
+
+Function* CGT_X86::createPopFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Pop function\n");
+  Function* PopFunc = NULL; // TODO: unimplemented stub
+  return PopFunc;
+}
+
+Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
+  DEBUG(errs() << "Generating Wait function\n");
+  Function* WaitFunc = NULL; // TODO: unimplemented stub
+  return WaitFunc;
+}
+/* This function does the steps necessary to launch a streaming graph
+ * Steps:
+ *   Create a Pipeline/Filter function for each node in the child graph of Root
+ *   Create the functions DFGLaunch, DFGPush, DFGPop, DFGWait
+ *   Modify each of the intrinsics in the host code:
+ *     Launch, Push, Pop, Wait
+ */
+void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
+  IntrinsicInst* LI = Root->getInstruction();
+  Function* RootLaunch = createLaunchFunction(Root);
+  //Function* RootPush = createPushFunction(Root);
+  //Function* RootPop = createPopFunction(Root);
+  //Function* RootWait = createWaitFunction(Root);
+  // Substitute the launch intrinsic in main
+  DEBUG(errs() << "Substitute launch intrinsic\n");
+  Value* LaunchInstArgs[] = {RootLaunch,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait instructions with x86-specific wait instructions
+  DEBUG(errs() << "Substitute wait, push, pop intrinsics\n");
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
+    switch(II->getIntrinsicID()) {
+    case Intrinsic::visc_wait:
+      CI = CallInst::Create(llvm_visc_streamWait,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_push:
+      CI = CallInst::Create(llvm_visc_streamPush,
+                            ArrayRef<Value*>(PushArgs, 2),
+                            "");
+      break;
+    case Intrinsic::visc_pop:
+      CI = CallInst::Create(llvm_visc_streamPop,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+    };
+    DEBUG(errs() << "Replace:\n\t" << *II << "\n");
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << "\twith " << *CI << "\n");
+  }
+
+
+}
+
+void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
+  // TODO: Place an assert to check that the constant passed by the launch
+  // intrinsic as the number of arguments to the DFG is the same as the number
+  // of arguments of the root of the DFG
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Get the Launch Instruction
+  IntrinsicInst* LI = Root->getInstruction();
+  switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
+
+  /* We now have all the global declarations necessary to generate the Launch
+   * function, a pointer to which can be passed to the pthread utils to execute
+   * the DFG. The Launch function has just one input: i8* data.addr.
+   * This is the address of all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type i8*(i8*)
+   * (2) Extract each of the inputs from data.addr and pass them as arguments
+   *     to the call to the Root function
+   * (3) The return value from Root is stored in memory, a pointer to which is
+   *     passed to the pthread_exit call.
+   */
+  // Create a Launch Function of type i8*(i8*) which calls the root function
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
+                                ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                false);
+  Function* AppFunc = Function::Create(AppFuncTy,
+                                       Root->getFuncPointer()->getLinkage(),
+                                       "LaunchDataflowGraph",
+                                       &M);
+  DEBUG(errs() << "Generating Launch Function\n");
+  // Give a name to the argument which is used to pass data to this thread
+  Value* data = &*AppFunc->arg_begin();
+  data->setName("data.addr");
+  // Add a basic block to this empty function and a return null statement to it
+  BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
+  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
+                                      Constant::getNullValue(AppFunc->getReturnType()),
+                                      BB);
+  switchToTimer(visc_TimerID_ARG_UNPACK, RI);
+
+  DEBUG(errs() << "Created Empty Launch Function\n");
+  // Find the X86 function generated for Root
+//  Function* RootF_X86 = Root->getGenFunc();
+  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
+  assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "Error: Generated Function for Root node with no x86 wrapper\n");
+
+  // Generate a call to RootF_X86 with null parameters for now
+  std::vector<Value*> Args;
+  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
+  }
+  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);
+
+  // Extract the input data from i8* data.addr and patch them into the
+  // arguments of the call to RootF_X86, one for each argument
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
+      ai != ae; ++ai) {
+    TyList.push_back(ai->getType());
+    names.push_back(ai->getName());
+  }
+  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
+  // Patch the elements into the call arguments
+  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
+    CI->setArgOperand(i, elements[i]);
+
+  // Add timers around the call to the RootF_X86 function
+  switchToTimer(visc_TimerID_COMPUTATION, CI);
+  switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
+
+  // Code for returning the output
+  CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
+                                 CI->getType()->getPointerTo(),
+                                 CI->getName()+".addr",
+                                 RI);
+  new StoreInst(CI, OutputAddrCast, RI);
+  switchToTimer(visc_TimerID_NONE, RI);
+
+  DEBUG(errs() << "Application specific function:\n");
+  DEBUG(errs() << *AppFunc << "\n");
+
+  // Substitute the launch intrinsic in main
+  Value* LaunchInstArgs[] = {AppFunc,
+                             LI->getArgOperand(1)
+                            };
+  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
+                                          ArrayRef<Value*>(LaunchInstArgs,2),
+                                          "graph"+Root->getFuncPointer()->getName(), LI);
+  //ReplaceInstWithInst(LI, LaunchInst);
+
+  DEBUG(errs() << *LaunchInst << "\n");
+  // Replace all wait instructions with x86-specific wait instructions
+  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
+  for(unsigned i=0; i < UseList->size(); ++i) {
+    IntrinsicInst* II = UseList->at(i);
+    CallInst* CI;
+    switch(II->getIntrinsicID()) {
+    case Intrinsic::visc_wait:
+      CI = CallInst::Create(llvm_visc_x86_wait,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_push:
+      CI = CallInst::Create(llvm_visc_bufferPush,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    case Intrinsic::visc_pop:
+      CI = CallInst::Create(llvm_visc_bufferPop,
+                            ArrayRef<Value*>(LaunchInst),
+                            "");
+      break;
+    default:
+      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+    };
+    ReplaceInstWithInst(II, CI);
+    DEBUG(errs() << *CI << "\n");
+  }
+
+}
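+// Illustration only (hypothetical IR shape): for a root node R(i32, float),
+// codeGenLaunch emits a pthread-style wrapper roughly like:
+//
+//   define i8* @LaunchDataflowGraph(i8* %data.addr) {
+//   entry:
+//     <unpack an i32 and a float from %data.addr>
+//     %R.output = call %rty @R_clone(i32 %a, float %b)
+//     <store %R.output back through a cast of %data.addr>
+//     ret i8* null
+//   }
+//
+// and replaces llvm.visc.launch with llvm_visc_x86_launch(@LaunchDataflowGraph,
+// <args>), then rewrites every wait/push/pop on the returned graph handle.
+// %rty, @R_clone, %a and %b are placeholder names.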
+Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+  // TODO: The assumption is that each input port of a node has just one
+  // incoming edge. This may change later on.
+
+  // Find the incoming edge at the requested input port
+  DFEdge* E = Child->getInDFEdgeAt(i);
+  assert(E && "No incoming edge or binding for input element!");
+  // Find the Source DFNode associated with the incoming edge
+  DFNode* SrcDF = E->getSourceDF();
+
+  // If the Source DFNode is a dummy node, the edge is from the parent. Get the
+  // argument from the argument list of this internal node
+  Value* inputVal;
+  if(SrcDF->isEntryNode()) {
+    inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
+    DEBUG(errs() << "Argument "<< i << " = " << *inputVal << "\n");
+  }
+  else {
+    // The edge is from a sibling.
+    // Check - code should already be generated for this source dfnode
+    assert(OutputMap.count(SrcDF)
+           && "Source node call not found. Dependency violation!");
+
+    // Find the CallInst associated with the Source DFNode using OutputMap
+    Value* CI = OutputMap[SrcDF];
+
+    // Extract the element at the source position from this call instruction
+    std::vector<unsigned> IndexList;
+    IndexList.push_back(E->getSourcePosition());
+    DEBUG(errs() << "Going to generate ExtractVal inst from " << *CI << "\n");
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                                                    "", InsertBefore);
+    inputVal = EI;
+  }
+  return inputVal;
+}
+
+void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
+                              ValueToValueMapTy &VMap, Instruction* IB) {
+  Function* CF = C->getFuncPointer();
+
+//  Function* CF_X86 = C->getGenFunc();
+  Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
+  assert(CF_X86 != NULL
+         && "Found leaf node for which code generation has not happened yet!\n");
+  assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
+         "The generated function to be called from the x86 backend is not an x86 function\n");
+  DEBUG(errs() << "Invoking child node " << CF_X86->getName() << "\n");
+
+  std::vector<Value*> Args;
+  // Create the argument list to pass to the call instruction
+  // First find the correct values using the edges
+  // The remaining six values are inserted as constants for now.
+  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(getInValueAt(C, i, F_X86, IB));
+  }
+
+  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for(unsigned j=0; j<6; j++)
+    Args.push_back(I64Zero);
+
+  DEBUG(errs() << "Gen Function type: " << *CF_X86->getType() << "\n");
+  DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n");
+  DEBUG(errs() << "Arguments: " << Args.size() << "\n");
+
+  // Call the F_X86 function associated with this node
+  CallInst* CI = CallInst::Create(CF_X86, Args,
+                                  CF_X86->getName()+"_output",
+                                  IB);
+  DEBUG(errs() << *CI << "\n");
+  OutputMap[C] = CI;
+
+  // Find the number of dimensions this node is replicated in.
+  // Based on the number of dimensions, insert loop instructions
+  std::string varNames[3] = {"x", "y", "z"};
+  unsigned numArgs = CI->getNumArgOperands();
+  for(unsigned j=0; j < C->getNumOfDim(); j++) {
+    Value* indexLimit = NULL;
+    // The limit can either be a constant or an argument of the internal node.
+    // In the case of a constant we can use that constant value directly in the
+    // new F_X86 function.
+    // In the case of an argument, we need to get the mapped value using VMap
+    if(isa<Constant>(C->getDimLimits()[j])) {
+      indexLimit = C->getDimLimits()[j];
+      DEBUG(errs() << "In Constant case:\n"
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    else {
+      indexLimit = VMap[C->getDimLimits()[j]];
+      DEBUG(errs() << "In VMap case:"
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    }
+    assert(indexLimit && "Invalid dimension limit!");
+    // Insert the loop
+    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
+    // Insert the index variable and limit arguments
+    CI->setArgOperand(numArgs-6+j, indexVar);
+    CI->setArgOperand(numArgs-3+j, indexLimit);
+  }
+  // Insert a call to the runtime to push the dim limits and instanceID on the
+  // depth stack
+  Value* args[] = {
+    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
+    CI->getArgOperand(numArgs-3+0), // limitX
+    CI->getArgOperand(numArgs-6+0), // iX
+    CI->getArgOperand(numArgs-3+1), // limitY
+    CI->getArgOperand(numArgs-6+1), // iY
+    CI->getArgOperand(numArgs-3+2), // limitZ
+    CI->getArgOperand(numArgs-6+2)  // iZ
+  };
+
+  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
+  DEBUG(errs() << "Push on stack: " << *Push << "\n");
+  // Insert a call to the runtime to pop the dim limits and instanceID from the
+  // depth stack
+  BasicBlock::iterator i(CI);
+  ++i;
+  Instruction* NextI = &*i;
+  // The next instruction must belong to the same basic block, as the basic
+  // block will have a terminator instruction
+  assert(NextI->getParent() == CI->getParent()
+         && "Next Instruction should also belong to the same basic block!");
+
+  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
+  DEBUG(errs() << *CI->getParent()->getParent());
+}
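+// Illustration only (hypothetical IR shape): for a 2-D child node C(i32 %a),
+// invokeChild_X86 emits nested loops around the call, with the six trailing
+// index/limit operands patched in and the depth stack maintained:
+//
+//   call void @llvm_visc_x86_dstack_push(i32 2, i64 %limitX, i64 %iX,
+//                                        i64 %limitY, i64 %iY, i64 0, i64 0)
+//   %C.out = call %rty @C_x86(i32 %a, i64 %iX, i64 %iY, i64 0,
+//                             i64 %limitX, i64 %limitY, i64 0)
+//   call void @llvm_visc_x86_dstack_pop()
+//
+// where %iX/%iY are the PHIs produced by addLoop for dimensions "x"/"y";
+// %rty and @C_x86 are placeholder names.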
+/* This function takes a DFNode and creates a filter function for it. By a
+ * filter function we mean a function which keeps reading input from the input
+ * buffers, applies the node function to it, and then pushes data onto the
+ * output buffers
+ */
+// Create a function with void* (void*) type.
+// Create a new basic block
+// Add a return instruction to the basic block
+// Extract arguments from the aggregate data input. In the type list, replace
+// the streaming inputs with i8* types signifying handles to the corresponding
+// buffers
+// Add a boolean argument isLastInput
+// Add runtime API calls to get input for each of the streaming inputs
+// Add a call to the generated function of the child node
+// Add runtime API calls to push output for each of the streaming outputs
+// Add a loop around the basic block, which exits when isLastInput is set
+
+Function* CGT_X86::createFunctionFilter(DFNode* C) {
+  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
+
+  /* Create a function with the same argument list as the child. */
+  DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
+  // Get the generated function for the child node
+  Function* CF = C->getFuncPointer();
+  // Create a Filter Function of type i8*(i8*) which calls the node function
+  Type* i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
+                                    ArrayRef<Type*>(i8Ty->getPointerTo()),
+                                    false);
+  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
+                                           CF->getLinkage(),
+                                           CF->getName()+"_Pipeline",
+                                           &M);
+  DEBUG(errs() << "Generating Pipeline Function\n");
+  // Give a name to the argument which is used to pass data to this thread
+  Value* data = &*CF_Pipeline->arg_begin();
+  data->setName("data.addr");
+  // Create a new basic block
+  DEBUG(errs() << "\tCreate new BB and add a return function\n");
+  // Add a basic block to this empty function
+  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  // Add a return instruction to the basic block
+  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
+                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
+
+
+  /* Extract the elements from the aggregate argument to the function.
+   * Replace the streaming inputs with i8* types signifying handles to the
+   * corresponding buffers
+   * Add outputs to the list as well
+   * Add isLastInput to the list
+   */
+  DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
+  // These Args will be used when passing arguments to the generated function
+  // inside the loop, and when reading outputs as well.
+  std::vector<Value*> Args;
+  std::vector<Type*> TyList;
+  std::vector<std::string> names;
+  // Adding inputs
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      TyList.push_back(i8Ty->getPointerTo());
+      names.push_back((Twine(i->getName())+"_buffer").str());
+    }
+    else {
+      TyList.push_back(i->getType());
+      names.push_back(i->getName());
+    }
+  }
+  // Adding outputs. FIXME: We assume all outputs to be streaming edges,
+  // because we get their buffer handles
+  StructType* RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
+    TyList.push_back(i8Ty->getPointerTo());
+    names.push_back("out");
+  }
+  /* Add a boolean argument isLastInput */
+  DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
+  TyList.push_back(i8Ty->getPointerTo());
+  names.push_back("isLastInput_buffer");
+
+  // Extract the inputs, outputs and isLastInput
+  Args = extractElements(data, TyList, names, RI);
+  for(unsigned i=0; i<Args.size(); i++) {
+    DEBUG(errs() << *Args[i] << "\n");
+  }
+
+  // Split the Args vector into inputs, outputs and isLastInput
+  unsigned numInputs = CF->getFunctionType()->getNumParams();
+  unsigned numOutputs = RetTy->getNumElements();
+  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
+  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
+
+  /* Add runtime API calls to get input for each of the streaming input edges */
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
+  // First read the termination condition variable isLastInput
+  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
+                                              ArrayRef<Value*>(isLastInput),
+                                              "",
+                                              RI);
+
+  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
+                     Type::getInt64Ty(CF_Pipeline->getContext()),
+                     false,
+                     "isLastInput",
+                     RI);
+  isLastInput = BI;
+  // Create the loop termination condition
+  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
+      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
+      RI);
+
+  // Get input from the buffers of all the incoming streaming edges
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
+       i != e; ++i) {
+    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
+                               ArrayRef<Value*>(InputArgs[i->getArgNo()]),
+                               "",
+                               RI);
+      CastInst* BI;
+      if(i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr,
+                              bufferIn,
+                              i->getType(),
+                              i->getName()+".addr",
+                              RI);
+      }
+      else if(i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn,
+                                    i->getType(),
+                                    i->getName()+".addr",
+                                    RI);
+      }
+      else {
+        BI = CastInst::CreateIntegerCast(bufferIn,
+                                         i->getType(),
+                                         false,
+                                         i->getName()+".addr",
+                                         RI);
+      }
+      // Replace the argument in the InputArgs vector; we will use this vector
+      // as the parameter list of the generated call
+      InputArgs[i->getArgNo()] = BI;
+    }
+  }
+  /* Add a call to the generated function of the child node */
+  DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
+//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+//                                  C->getGenFunc()->getName()+".output", RI);
+  Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
+  DEBUG(errs() << "Type: "
+               << *CGenF->getType()
+               << "\n");
+  CallInst* CI = CallInst::Create(CGenF,
+                                  InputArgs,
+                                  CGenF->getName()+".output",
+                                  RI);
+
+  /* Add runtime API calls to push output for each of the streaming outputs */
+  // FIXME: Assumption
+  // All edges between siblings are streaming edges
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
+  for (unsigned i=0; i< numOutputs; i++) {
+    // Extract the output
+    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
+                                                    "", RI);
+    // Convert to i64
+    CastInst* BI;
+    if(EI->getType()->isPointerTy())
+      BI = CastInst::Create(CastInst::PtrToInt, EI,
+                            Type::getInt64Ty(CF_Pipeline->getContext()),
+                            "",
+                            RI);
+    else
+      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
+                                       false, "", RI);
+    // Push to the output buffer
+    Value* bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush,
+                                           ArrayRef<Value*>(bufferOutArgs, 2),
+                                           "",
+                                           RI);
+  }
+
+  // Add a loop around the basic block, which exits when isLastInput is set
+  //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond);
+//  addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(),
+//               RI, Cond);
+
+  // Pointers to keep the created loop structure
+  BasicBlock *EntryBB, *CondBB, *BodyBB;
+  Instruction *CondStartI = cast<Instruction>(isLastInputPop);
+  Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
+  EntryBB = CondStartI->getParent();
+
+  addWhileLoop(CondStartI, BodyStartI, RI, Cond);
+  CondBB = CondStartI->getParent();
+  BodyBB = CI->getParent();
+  Instruction *CntI = NULL;
+  CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF);
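+  // Illustration only (hypothetical IR shape) of the rewrite performed below:
+  // when the generated node function queries the scheduling policy, the call
+  // in the pipeline loop is rewritten from
+  //
+  //   %out = call %rty @C_cpu(<args>)
+  //
+  // to a clone that also receives the loop iteration counter, which is then
+  // forwarded to the policy query inside the clone:
+  //
+  //   %out = call %rty @C_cpu_clone(<args>, i64 %cnt)
+  //   ; inside @C_cpu_clone:
+  //   ;   %v = call i32 @llvm_visc_policy_getVersion(i8* %id, i64 %iteration)
+  //
+  // Function names and the i8* %id operand are placeholder assumptions.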
+  // If the node function calls the VISC runtime to get the policy, we update
+  // it with the counter information. This means we need to pass an additional
+  // argument to the generated function, namely the iteration number, and then
+  // use it as an argument to the policy_getVersion call
+  if (GetPolicyCI) {
+    CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB);
+    assert(CntI && "Counter instruction not found\n");
+
+    // Create a new function type (with an additional argument for the
+    // iteration number)
+    Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
+    std::vector<Type*> NewArgTypes;
+    for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
+         ai != ae ; ++ai) {
+      NewArgTypes.push_back(ai->getType());
+    }
+    NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
+    FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false);
+    Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false);
+    // At least one (the last) argument exists (we added it)
+    Function::arg_iterator ae = NewCGenF->arg_end();
+    --ae;
+    Argument *CntArg = &*ae;
+    CntArg->setName("iteration");
+    // Replace the old CPU gen func with this one
+    C->addGenFunc(NewCGenF, visc::CPU_TARGET, true);
+
+    // Add the counter to the actual parameter list, to create the new call
+    InputArgs.push_back(CntI);
+    CallInst* newCI = CallInst::Create(NewCGenF,
+                                       InputArgs,
+                                       NewCGenF->getName()+".output");
+    ReplaceInstWithInst(CI, newCI);
+
+    // Set the second operand of the policy_getVersion call to the last
+    // function argument
+    GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF);
+    GetPolicyCI->setArgOperand(1, CntArg);
+  }
+
+  // Return the Function pointer
+  DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
+  DEBUG(errs() << *CF_Pipeline << "\n");
+  return CF_Pipeline;
+}
+
+void CGT_X86::codeGen(DFInternalNode* N) {
+  // Check if N is the root node and its graph is streaming. We do not do
+  // codeGen for the Root in such a case
+  if(N->isRoot() && N->isChildGraphStreaming())
+    return;
+
+  // Check if a clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code has already been generated\n");
+
+  // Sort children in topological order before code generation
+  N->getChildGraph()->sortChildren();
+
+  // Only process if all children have a CPU x86 function
+  // Otherwise skip to the end
+  bool codeGen = true;
+  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+    DFNode* C = *ci;
+    // Skip dummy node call
+    if (C->isDummyNode())
+      continue;
+
+    if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) {
+      errs() << "No CPU x86 version for child node "
+             << C->getFuncPointer()->getName()
+             << "\n  Skip code gen for parent node "
+             << N->getFuncPointer()->getName() << "\n";
+      codeGen = false;
+    }
+  }
+
+  if (codeGen) {
+    Function* F = N->getFuncPointer();
+    // Create a clone of F with no instructions. Only the type is the same as
+    // F, without the extra arguments.
+    Function* F_X86;
+
+    // Clone the function, if we are seeing this function for the first time.
+    // We only need a clone in terms of type.
+    ValueToValueMapTy VMap;
+
+    // Create a new function with the same type
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+
+    // Loop over the arguments, copying the names of the arguments over.
+    Function::arg_iterator dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+      // Increment dest iterator
+      ++dest_iterator;
+    }
+
+    // Add a basic block to this empty function
+    BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
+    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
+
+    // Add Index and Dim arguments, except when N is the root node or the
+    // child graph of the parent node is streaming
+    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+      F_X86 = addIdxDimArgs(F_X86);
+
+    BB = &*F_X86->begin();
+    RI = cast<ReturnInst>(BB->getTerminator());
+
+    // Add the generated function info to the DFNode
+//    N->setGenFunc(F_X86, visc::CPU_TARGET);
+    N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+    // Loop over the arguments to create the VMap.
+    dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i) {
+      // Add mapping and increment dest iterator
+      VMap[&*i] = &*dest_iterator;
+      ++dest_iterator;
+    }
+
+    // Iterate over children in topological order
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      DFNode* C = *ci;
+      // Skip dummy node call
+      if (C->isDummyNode())
+        continue;
+
+      // Create calls to the CPU function of the child node
+      invokeChild_X86(C, F_X86, VMap, RI);
+
+    }
+
+    DEBUG(errs() << "*** Generating epilogue code for the function ***\n");
+    // Generate code for output bindings
+    // Get the Exit node
+    DFNode* C = N->getChildGraph()->getExit();
+    // Get the OutputType of this node
+    StructType* OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_X86->getReturnType());
+    // Find all the input edges to the exit node
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge* E = C->getInDFEdgeAt(i);
+
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode* SrcDF = E->getSourceDF();
+
+      DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n");
+
+      // If the Source DFNode is a dummy node, the edge is from the parent. Get
+      // the argument from the argument list of this internal node
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_X86, i);
+        DEBUG(errs() << "Argument "<< i << " = " << *inputVal << "\n");
+      }
+      else {
+        // The edge is from an internal node
+        // Check - code should already be generated for this source dfnode
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
Dependency violation!"); + + // Find Output Value associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); + inputVal = EI; + } + std::vector<unsigned> IdxList; + IdxList.push_back(i); + retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); + } + DEBUG(errs() << "Extracted all\n"); + retVal->setName("output"); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReplaceInstWithInst(RI, newRI); + + } + + //-------------------------------------------------------------------------// + // Here, we need to check if this node (N) has more than one versions + // If so, we query the policy and have a call to each version + // If not, we see which version exists, check that it is in fact an x86 + // function and save it as the CPU_TARGET function + + // TODO: visc_id per node, so we can use this for id for policies + // For now, use node function name and change it later + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + + if (N->getTag() == visc::None) { + // No code is available for this node. This (usually) means that this + // node is a node that + // - from the accelerator backends has been mapped to an intermediate + // node, and thus they have not produced a genFunc + // - a child node had no CPU hint, thus no code gen for CPU could + // take place + errs() << "No GenFunc - Skipping CPU code generation for node " + << N->getFuncPointer()->getName() << "\n"; + } else if (viscUtils::isSingleTargetTag(N->getTag())) { + // There is a single version for this node according to code gen hints. + // Therefore, we do not need to check the policy, we simply use the + // available implementation, whichever target it is for. 
+ + // Sanity check - to be removed TODO + switch (N->getTag()) { + case visc::CPU_TARGET: + assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::GPU_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); + assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); + break; + case visc::SPIR_TARGET: + assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); + assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); + assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && ""); + assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && ""); + break; + default: + assert(false && "Unreachable: we checked that tag was single target!\n"); + break; + } + + // If device abstraction is enabled, then we may need to edit the node + // function. In case this is a GPU or SPIR gen func, we issue a call to + // the runtime that waits for the device to be available + if (DeviceAbstraction) { + Function *NodeGenFunc = NULL; + switch (N->getTag()) { + case visc::GPU_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); + break; + case visc::SPIR_TARGET: + NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET); + break; + default: + break; + } + + if (NodeGenFunc) { + // If we found a function to edit, we add the call to the runtime as + // its first statement + BasicBlock *BB = &*NodeGenFunc->begin(); + std::vector<Value *> Args; // TODO: add the device type as argument? + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", + runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); + } + + } + + Function *Ftmp = N->getGenFuncForTarget(N->getTag()); + N->removeGenFuncForTarget(visc::GPU_TARGET); + N->removeGenFuncForTarget(visc::SPIR_TARGET); + N->setTag(visc::None); + N->addGenFunc(Ftmp, visc::CPU_TARGET, true); + N->setTag(visc::CPU_TARGET); + + // Sanity checks - to be removed TODO + CF = N->getGenFuncForTarget(visc::CPU_TARGET); + GF = N->getGenFuncForTarget(visc::GPU_TARGET); + SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + errs() << "After editing\n"; + errs() << "Node: " << N->getFuncPointer()->getName() + << " with tag " << N->getTag() << "\n"; + errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; + errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; + errs() << "SPIR Fun: " << (SF ? 
SF->getName() : "null" ) << "\n"; + errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; + + // assert(false && "got to the point where we have to select\n"); + } else { + // We have more than one targets + + errs() << "Node Name (for policy) : " + << N->getFuncPointer()->getName() << "\n"; + + Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); + Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); + Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); + + bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); + bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); + bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); + + // These assertions express what we can support with the current runtime. + // Code generation works the same way even for other target combinations. + // For now, we want either CPU and GPU, or CPU and SPIR + assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n"); + assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) && + "Generated functions without appropriate x86 wrapper\n"); + + FunctionType *FT = CF->getFunctionType(); + if (GF) + assert(FT == GF->getFunctionType() && + "Type mismatch between generated functions for GPU and CPU targets.\n"); + if (SF) + assert(FT == SF->getFunctionType() && + "Type mismatch between generated functions for SPIR and CPU targets.\n"); + + // Code generation of wrapper function + Function *F_wrapper; + ValueToValueMapTy VMap; + F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M); + + // Copy argument names over + Function::arg_iterator dest_iterator = F_wrapper->arg_begin(); + for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); + i != e; ++i) { + dest_iterator->setName(i->getName()); + VMap[&*i] = &*dest_iterator; + ++dest_iterator; + } + // Gather all arguments of wrapper in a vector, to prepare the call to + // the individual gen functions + std::vector<Value *> GenFuncCallArgs; + for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end(); + i != e; ++i) { + GenFuncCallArgs.push_back(&*i); + } + + BasicBlock *BBcurrent, *BBtrue, *BBfalse; + + BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper); + + StringRef FName = N->getFuncPointer()->getName(); + size_t nameSize = FName.size()+1; + std::vector<Constant *> NameV; + for (char c: FName) { + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c)); + } + NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0')); + ArrayType *NameType = + ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize); + AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent); + Constant *NameConst = ConstantArray::get(NameType, NameV); + StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent); + CastInst *BI = BitCastInst::CreatePointerCast(AI, + Type::getInt8PtrTy(M.getContext()), "", BBcurrent); + std::vector<Value *> Args; + Args.push_back(BI); + Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); + Function *RTF = + cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion", + runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType())); + CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent); + + ConstantInt *CmpConst = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true); + CmpInst *CmpI = CmpInst::Create(Instruction::ICmp, + CmpInst::ICMP_EQ, + RTFInst, CmpConst, + "", BBcurrent); + + BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper); + 
+    BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper);
+    BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+    CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue);
+    ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (GF) {
+      // We have a GPU version. Generate policy check and call
+      CmpConst =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue = BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+      GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for the wait-on-device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+            runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    // Switch basic block pointers
+    BBcurrent = BBfalse;
+    if (SF) {
+      // We have a SPIR version. Generate policy check and call
+      CmpConst =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true);
+      CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                             RTFInst, CmpConst, "", BBcurrent);
+      BBtrue = BasicBlock::Create(M.getContext(), "version_spir", F_wrapper);
+      BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper);
+      BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent);
+
+      GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue);
+      RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue);
+
+      if (DeviceAbstraction) {
+        // Prepare arguments and function for the wait-on-device runtime call
+        std::vector<Value *> Args; // TODO: add the device type as argument?
+        Function *RTF =
+          cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+            runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType()));
+        CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI);
+      }
+    }
+
+    RI = ReturnInst::Create(M.getContext(),
+                            UndefValue::get(FT->getReturnType()), BBfalse);
+
+    // Now make this wrapper the node's CPU gen func:
+    // remove all other versions and update the tag
+    N->addGenFunc(F_wrapper, visc::CPU_TARGET, true);
+    N->removeGenFuncForTarget(visc::GPU_TARGET);
+    N->removeGenFuncForTarget(visc::SPIR_TARGET);
+    N->setTag(visc::CPU_TARGET);
+
+    // assert(false && "got to the point where we have to combine\n");
+  }
+
+}
+
+// Code generation for leaf nodes
+void CGT_X86::codeGen(DFLeafNode* N) {
+  // Skip code generation if it is a dummy node
+  if(N->isDummyNode()) {
+    DEBUG(errs() << "Skipping dummy node\n");
+    return;
+  }
+
+  // At this point, the X86 backend does not support code generation for
+  // the case where an allocation node is used, so we skip it. This means that
+  // a CPU version will not be created, and therefore code generation will
+  // only succeed if another backend (nvptx or spir) has been invoked to
+  // generate a node function for the node including the allocation node.
+  if (N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
+  // Check if clone already exists. If it does, it means we have visited this
+  // function before and nothing else needs to be done for this leaf node.
+//  if(N->getGenFunc() != NULL)
+//    return;
+
+  if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
+    errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
+              " : skipping it\n";
+
+    errs() << "Check for cudnn or promise hint for node "
+           << N->getFuncPointer()->getName() << "\n";
+
+    switch (N->getTag()) {
+    case visc::CUDNN_TARGET: {
+      errs() << "CUDNN hint found. Store CUDNN function as CPU function.\n";
+      // Make sure there is a generated x86 function for cudnn
+      assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && "");
+      // Store the CUDNN x86 function as the CPU generated function
+      Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+      // after adding the required number of arguments
+      if (!N->getParent()->isChildGraphStreaming())
+        Ftmp = addIdxDimArgs(Ftmp);
+
+      N->removeGenFuncForTarget(visc::CUDNN_TARGET);
+      N->setTag(visc::None);
+      N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+      N->setTag(visc::CPU_TARGET);
+      break;
+    }
+    case visc::PROMISE_TARGET: {
+      errs() << "Promise hint found. Store PROMISE function as CPU function.\n";
+      // Make sure there is a generated x86 function for promise
+      assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && "");
+      // Store the PROMISE x86 function as the CPU generated function
+      Function *Ftmp = N->getGenFuncForTarget(N->getTag());
+      // after adding the required number of arguments
+      if (!N->getParent()->isChildGraphStreaming())
+        Ftmp = addIdxDimArgs(Ftmp);
+
+      N->setTag(visc::None);
+      N->removeGenFuncForTarget(visc::PROMISE_TARGET);
+      N->addGenFunc(Ftmp, visc::CPU_TARGET, true);
+      N->setTag(visc::CPU_TARGET);
+      break;
+    }
+    case visc::GPU_TARGET:
+      // A leaf node should not have an x86 function for GPU
+      // by design of DFG2LLVM_NVPTX backend
+      assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+      break;
+    case visc::SPIR_TARGET:
+      // A leaf node should not have an x86 function for SPIR
+      // by design of DFG2LLVM_SPIR backend
+      assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && "");
+      break;
+    default:
+      break;
+    }
+
+    return;
+  }
+
+  assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL &&
+         "Error: Visiting a node for which code already generated\n");
+
+  std::vector<IntrinsicInst *> IItoRemove;
+  std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace;
+  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
+
+  // Get the function associated with the dataflow node
+  Function *F = N->getFuncPointer();
+
+  // Clone the function, if we are seeing this function for the first time.
+  Function *F_X86;
+  ValueToValueMapTy VMap;
+  F_X86 = CloneFunction(F, VMap);
+  F_X86->removeFromParent();
+  // Insert the cloned function into the module
+  M.getFunctionList().push_back(F_X86);
+
+  // Add the new arguments to the argument list. Add arguments only if the
+  // child graph of the parent node is not streaming
+  if(!N->getParent()->isChildGraphStreaming())
+    F_X86 = addIdxDimArgs(F_X86);
+
+  // Add generated function info to DFNode
+//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  N->addGenFunc(F_X86, visc::CPU_TARGET, true);
+
+  /*** FIXME: HACK FOR DSSOC DEMO -- BEGIN ***/
+  /* This part of the code is meant to handle turning the CPU backend into an
+     "accelerator" backend for ApproxHPVM. For this reason, the HPVM runtime
+     needs to be essentially deactivated. */
+
+  /* We look into the leaf node's function for function calls starting with
+     "tensor". These are functions with which we replaced the ApproxHPVM
+     intrinsics, and for which we have LLVM implementations. If found, it means
+     we are dealing with an ApproxHPVM program. */
+  bool isApproxHPVMnode = false;
+  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
+    Instruction *I = &(*i);
+    DEBUG(errs() << *I << "\n");
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      if ((CI->getCalledFunction()->getName()).startswith("tensor")) {
+        isApproxHPVMnode = true;
+        break;
+      }
+    }
+  }
+
+  /* As in the CUDNN backend, we remove the in/out attributes of tensor
+     operations, aiming to deactivate the HPVM runtime calls. This has been
+     tested through the CUDNN backend for the internal node codegen, and should
+     ensure that code does not insert llvm_visc_x86_argument_ptr in the
+     generated function for leaf node codegen as well. */
+
+  /* Removing HPVM in/out/inout function attributes */
+  if (isApproxHPVMnode) {
+    for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); ai != ae; ai++) {
+      Argument *Arg = &*ai;
+      if(Arg->hasAttribute(Attribute::In))
+        Arg->removeAttr(Attribute::In);
+      if(Arg->hasAttribute(Attribute::Out))
+        Arg->removeAttr(Attribute::Out);
+      if(Arg->hasAttribute(Attribute::InOut))
+        Arg->removeAttr(Attribute::InOut);
+    }
+  } else {
+    printf("****** NO REMOVAL ***\n\n");
+  }
+
+  /*** FIXME: HACK FOR DSSOC DEMO -- END ***/
+
+  // Go through the arguments: any pointer argument with the "in" attribute
+  // needs an x86_argument_ptr call to get the x86 pointer to the argument.
+  // Insert these calls in a new BB which would dominate all other BBs.
+  // Create new BB
+  BasicBlock* EntryBB = &*F_X86->begin();
+  BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB);
+  BranchInst* Terminator = BranchInst::Create(EntryBB, BB);
+  // Insert calls
+  for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end();
+      ai != ae; ++ai) {
+    if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) {
+      assert(ai->getType()->isPointerTy()
+             && "Only pointer arguments can have visc in/out attributes ");
+      Function::arg_iterator aiNext = ai;
+      ++aiNext;
+      Argument* size = &*aiNext;
+      assert(size->getType() == Type::getInt64Ty(M.getContext())
+             && "Next argument after a pointer should be an i64 type");
+      CastInst* BI = BitCastInst::CreatePointerCast(&*ai,
+                       Type::getInt8PtrTy(M.getContext()),
+                       ai->getName()+".i8ptr",
+                       Terminator);
+      Value* ArgPtrCallArgs[] = {BI, size};
+      CallInst::Create(llvm_visc_x86_argument_ptr,
+                       ArrayRef<Value*>(ArgPtrCallArgs, 2),
+                       "",
+                       Terminator);
+
+    }
+  }
+  errs() << *BB << "\n";
+
+  // Go through all the instructions
+  for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) {
+    Instruction *I = &(*i);
+    DEBUG(errs() << *I << "\n");
+    // Leaf nodes should not contain VISC graph intrinsics or launch
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) &&
"Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); + + if (BuildDFG::isViscQueryIntrinsic(I)) { + IntrinsicInst* II = cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; + + /*********************************************************************** + * Handle VISC Query intrinsics * + ***********************************************************************/ + switch (II->getIntrinsicID()) { + /**************************** llvm.visc.getNode() *******************/ + case Intrinsic::visc_getNode: { + // add mapping <intrinsic, this node> to the node-specific map + Leaf_HandleToDFNodeMap[II] = N; + IItoRemove.push_back(II); + break; + } + /************************* llvm.visc.getParentNode() ****************/ + case Intrinsic::visc_getParentNode: { + // get the parent node of the arg node + // get argument node + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + // get the parent node of the arg node + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + // Add mapping <intrinsic, parent node> to the node-specific map + // the argument node must have been added to the map, orelse the + // code could not refer to it + Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); + IItoRemove.push_back(II); + break; + } + /*************************** llvm.visc.getNumDims() *****************/ + case Intrinsic::visc_getNumDims: { + // get node from map + // get the appropriate field + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim(); + IntegerType* IntTy = Type::getInt32Ty(M.getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); + + II->replaceAllUsesWith(numOfDimConstant); + IItoRemove.push_back(II); + break; + } + /*********************** llvm.visc.getNodeInstanceID() **************/ + case Intrinsic::visc_getNodeInstanceID_x: + case Intrinsic::visc_getNodeInstanceID_y: + case Intrinsic::visc_getNodeInstanceID_z: { + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_X86 + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); + + if(parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 + (3-dim); + // Traverse argument list of F_X86 in reverse order to find the + // correct index or dim argument. + Argument* indexVal = getArgumentFromEnd(F_X86, offset); + assert(indexVal && "Index argument not found. 
Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *indexVal << "\n"); + + II->replaceAllUsesWith(indexVal); + IItoRemove.push_back(II); + } + else { + // Case when query is for an ancestor + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance, + ArrayRef<Value*>(args, 2), + "nodeInstanceID", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + break; + } + /********************** llvm.visc.getNumNodeInstances() *************/ + case Intrinsic::visc_getNumNodeInstances_x: + case Intrinsic::visc_getNumNodeInstances_y: + case Intrinsic::visc_getNumNodeInstances_z: { + + ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); + ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; + + // The dfnode argument should be an ancestor of this leaf node or + // the leaf node itself + int parentLevel = N->getAncestorHops(ArgDFNode); + assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) + && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); + + // Get specified dimension + // (dim = 0) => x + // (dim = 1) => y + // (dim = 2) => z + int dim = (int) (II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x); + assert((dim >= 0) && (dim < 3) + && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!"); + + // For immediate ancestor, use the extra argument introduced in + // F_X86 + int numParamsF = F->getFunctionType()->getNumParams(); + int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); + assert((numParamsF_X86 - numParamsF == 6) + && "Difference of arguments between function and its clone is not 6!"); + + if(parentLevel == 0) { + // Case when the query is for this node itself + unsigned offset = 3 - dim; + // Traverse argument list of F_X86 in reverse order to find the + // correct index or dim argument. + Argument* limitVal = getArgumentFromEnd(F_X86, offset); + assert(limitVal && "Limit argument not found. Invalid offset!"); + + DEBUG(errs() << *II << " replaced with " << *limitVal << "\n"); + + II->replaceAllUsesWith(limitVal); + IItoRemove.push_back(II); + } + else { + // Case when query is from the ancestor + Value* args[] = { + ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), + ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) + }; + CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit, + ArrayRef<Value*>(args, 2), + "numNodeInstances", II); + DEBUG(errs() << *II << " replaced with " << *CI << "\n"); + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + } + + break; + } + default: + DEBUG(errs() << "Found unknown intrinsic with ID = " << + II->getIntrinsicID() << "\n"); + assert(false && "Unknown VISC Intrinsic!"); + break; + } + + } else { + //TODO: how to handle address space qualifiers in load/store + } + + } + + //TODO: + // When to replace the uses? + // In which order is it safe to replace the instructions in + // IItoReplace? 
+  // Probably in the reverse order in the vectors.
+  // It is a good idea to have them in one vector and check the type
+  // using dyn_cast in order to determine if we replace with inst or value
+
+
+  //TODO: maybe leave these instructions to be removed by a later DCE pass
+  for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
+       i != IItoRemove.end(); ++i) {
+    (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
+    (*i)->eraseFromParent();
+  }
+
+  DEBUG(errs() << *F_X86);
+}
+
+} // End of namespace
+
+char DFG2LLVM_X86::ID = 0;
+static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86-dsoc",
+                                    "Dataflow Graph to LLVM for X86 backend (DSOCC version)",
+                                    false /* does not modify the CFG */,
+                                    true /* transformation, not just analysis */);
+
diff --git a/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a6c4de95376cb517de25482ecf74f0782c479004
--- /dev/null
+++ b/llvm/lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/DFG2LLVM_X86_dsoc/LLVMBuild.txt ---------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = DFG2LLVM_X86_dsoc
+parent = Transforms
+
diff --git a/llvm/lib/Transforms/InlineTensorCalls/CMakeLists.txt b/llvm/lib/Transforms/InlineTensorCalls/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..51f321884fe7f9cb56e11df573bb837dde89434e
--- /dev/null
+++ b/llvm/lib/Transforms/InlineTensorCalls/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
+add_llvm_loadable_module( InlineTensorCalls
+  InlineTensorCalls.cpp
+
+  DEPENDS
+  intrinsics_gen
+  PLUGIN_TOOL
+  opt
+  )
+
diff --git a/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.cpp b/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d31434341cf65939768d0acb7a0051d453909971
--- /dev/null
+++ b/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.cpp
@@ -0,0 +1,77 @@
+//=== InlineApproxHPVMCalls.cpp ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define ENABLE_ASSERTS
+
+#define DEBUG_TYPE "INLINE_APPROXHPVM_CALLS"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+#include "llvm/IR/InstIterator.h"
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/InlineCost.h"
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/ADT/SetVector.h"
+#include <sstream>
+
+using namespace llvm;
+
+
+namespace {
+
+  struct InlineApproxHPVMCalls : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    InlineApproxHPVMCalls() : ModulePass(ID) {}
+
+    bool runOnModule(Module &M) override {
+
+      InlineFunctionInfo IFI;
+      SmallSetVector<CallSite, 16> Calls;
+      bool Changed = false;
+      SmallVector<Function *, 16> InlinedFunctions;
+      for (Function &F : M){
+        if (!F.isDeclaration() && F.getName().startswith("tensor") ) {
+          //errs()<<"Function = "<<*&F<<"\n";
+          Calls.clear();
+
+          for (User *U : F.users())
+            if (auto CS = CallSite(U))
+              if (CS.getCalledFunction() == &F)
+                Calls.insert(CS);
+
+          for (CallSite CS : Calls)
+            // FIXME: We really shouldn't be able to fail to inline at this point!
+            // We should do something to log or check the inline failures here.
+            Changed |= InlineFunction(CS, IFI);
+
+        }
+      }
+
+      return Changed;
+    }
+
+  };
+
+
+} // End of namespace
+
+char InlineApproxHPVMCalls::ID = 0;
+static RegisterPass<InlineApproxHPVMCalls> X("inline-tensor-calls",
+                  "Inline ApproxHPVM tensor library function calls (CPU version)",
+                  true /* modifies the CFG */,
+                  true /* transformation, *
+                        * not just analysis */);
+
diff --git a/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.exports b/llvm/lib/Transforms/InlineTensorCalls/InlineTensorCalls.exports
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/llvm/lib/Transforms/InlineTensorCalls/LLVMBuild.txt b/llvm/lib/Transforms/InlineTensorCalls/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8fff7891af1b6b81fd642bb1300a23c2caca6918
--- /dev/null
+++ b/llvm/lib/Transforms/InlineTensorCalls/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/InlineTensorCalls/LLVMBuild.txt ---------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = InlineTensorCalls +parent = Transforms + diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/CMakeLists.txt b/llvm/lib/Transforms/ReplaceIntrinsics/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bfb2bf2219ba1278d14b78e7ee5dc0a0abd2702 --- /dev/null +++ b/llvm/lib/Transforms/ReplaceIntrinsics/CMakeLists.txt @@ -0,0 +1,13 @@ +if(WIN32 OR CYGWIN) + set(LLVM_LINK_COMPONENTS Core Support) +endif() + +add_llvm_loadable_module( ReplaceIntrinsics + ReplaceIntrinsics.cpp + + DEPENDS + intrinsics_gen + PLUGIN_TOOL + opt + ) + diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt b/llvm/lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt new file mode 100644 index 0000000000000000000000000000000000000000..6450fa1714de0200ce18848919d69cff895848d0 --- /dev/null +++ b/llvm/lib/Transforms/ReplaceIntrinsics/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = ReplaceIntrinsics +parent = Transforms + diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.cpp b/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ef649d8e170451d5bf2f133d6113cbfeb30f046e --- /dev/null +++ b/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.cpp @@ -0,0 +1,516 @@ +//=== ReplaceApproxHPVMIntrinsicsWithFCalls.cpp ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#define ENABLE_ASSERTS + +#define DEBUG_TYPE "REPLACE_APPROXHPVM_INTRINSICS_WITH_FCALLS" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "llvm/SupportVISC/DFG2LLVM.h" +#include "llvm/InPlaceDFG/InPlaceDFGAnalysis.h" +#include <sstream> + +using namespace llvm; +using namespace builddfg; +using namespace dfg2llvm; + +// TODO: We still need in place analysis, if calls have the same interface +using namespace inplacedfg; + +namespace { +// Helper class declarations + +// Replace ApproxHPVM intrinsics with LLVM function calls. +// aiming to go through the CPU backend code generation. 
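
(The rewrite this class performs can be summarized by the following sketch; the helper and its parameters are illustrative, and the pass below spells this logic out separately for each tensor intrinsic:)

// Sketch: swap one VISC tensor intrinsic for a call into the CPU tensor
// runtime. In-place ops hand their result back through operand 0; the rest
// return it through the call itself.
static void replaceIntrinsicWithCall(IntrinsicInst *II, Constant *CalleeF,
                                     ArrayRef<Value *> Args, bool InPlace,
                                     std::vector<IntrinsicInst *> &ToRemove) {
  CallInst *CI = CallInst::Create(CalleeF, Args, "", II); // insert before II
  II->replaceAllUsesWith(InPlace ? II->getOperand(0)
                                 : static_cast<Value *>(CI));
  ToRemove.push_back(II); // erased later, in reverse order, keeping iterators valid
}
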
+
+struct DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls : public DFG2LLVM {
+  static char ID; // Pass identification, replacement for typeid
+  DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls() : DFG2LLVM(ID) {}
+private:
+
+public:
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<BuildDFG>();
+    AU.addRequired<InPlaceDFGAnalysisWrapper>();
+    AU.addPreserved<BuildDFG>();
+    AU.addPreserved<InPlaceDFGAnalysisWrapper>();
+  }
+
+  bool runOnModule(Module &M);
+};
+
+// Visitor for Code generation traversal (tree traversal for now)
+class CGT_ReplaceApproxHPVMIntrinsicsWithFCalls : public CodeGenTraversal {
+
+private:
+  //Member variables
+  InPlaceDFGAnalysis::InPlaceDFGParameter *IPP;
+
+  // VISC Runtime API and Tensor runtime API
+
+  /* TODO: I believe that TensorRt is not needed, since we will have llvm
+     implementations linked in, so init and cleanup calls can be removed and
+     relevant code also, but I leave it in for now until verified. */
+  Constant* llvm_hpvm_initTensorRt;
+  Constant* llvm_hpvm_cleanupTensorRt;
+//  Constant* hpvm_request_tensor; DONE: request tensor will not be used
+
+  // Functions
+  bool isValidOperandForInPlaceOperation(Value *Op, Function *Fgen, DFNode *N);
+
+  // Virtual Functions
+  void init();
+  void initRuntimeAPI();
+  void codeGen(DFInternalNode* N);
+  void codeGen(DFLeafNode* N);
+
+public:
+
+  // Constructor
+  CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(Module &_M, BuildDFG &_DFG, InPlaceDFGAnalysis::InPlaceDFGParameter &_IPP)
+    : CodeGenTraversal(_M, _DFG), IPP(&_IPP) {
+    initRuntimeAPI();
+  }
+
+};
+
+bool CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::isValidOperandForInPlaceOperation(Value *Op,
+                                                                                  Function *Fgen,
+                                                                                  DFNode *N) {
+  // We only expect the if branch to be taken
+  if (Argument *Arg = dyn_cast<Argument>(Op)) {
+    DEBUG(errs() << *Arg << "\t: argument, candidate for in place\n");
+    assert((Arg->getParent() == Fgen) &&
+           "Extra Parameter in body of Function\n");
+    // Candidate parameter is a function argument
+    // In this case, consult the result of in place analysis
+    // Find position in arg list
+    unsigned pos = Arg->getArgNo();
+    // If this parameter cannot be used for in place operation
+    // code gen cannot continue
+    if (IPP->at(N)[pos]) {
+      DEBUG(errs() << *Arg << "\t: argument, suitable for in place\n");
+      return true;
+    } else {
+      DEBUG(errs() << *Arg << "\t: argument, not suitable for in place\n");
+      return false;
+    }
+  }
+  else {
+    // If it is not an argument, then it needs to be the result of
+    // another intrinsic. These are new objects that are allocated,
+    // and consumed by the next intrinsic. Alternatively, the intrinsic
+    // could have been replaced by a call to an LLVM function.
+    // We do not expect a merge pass to have run before the replacement pass,
+    // therefore we do not expect to go in the else branch.
+    DEBUG(errs() << *Op << "\t: Test for result of intrinsic operation\n");
+    if (isa<IntrinsicInst>(Op)) {
+      DEBUG(errs() << *Op << "\t: local, suitable for in place\n");
+      return true;
+    } else if (CallInst *CI = dyn_cast<CallInst>(Op)) {
+      if ((CI->getCalledFunction()->getName()).startswith("tensor"))
+        return true;
+      else
+        return false;
+    }
+    else {
+      DEBUG(errs() << *Op << "\t: local, not suitable for in place\n");
+      return false;
+    }
+  }
+}
+
+
+void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::init() {
+}
+
+// Initialize the VISC runtime API.
This makes it easier to insert these calls +void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!\n"); + + // FIXME: set correct path + Twine llvmSrcRoot = LLVM_SRC_ROOT; + Twine runtimeAPI = llvmSrcRoot+"/projects/hpvm-tensor-rt/lib/tensor_cpu_runtime.ll"; + runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); + if(runtimeModule == nullptr) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded hpvm-tensor-rt API module\n"); + + // Get or insert Global declarations for + // - initialization + // - cleanup + // - request a tensor + DECLARE(llvm_hpvm_initTensorRt); + DECLARE(llvm_hpvm_cleanupTensorRt); +// DECLARE(hpvm_request_tensor); + + // Find visc.init and visc.cleanup calls, and add placeholder methods + // for initialization and cleanup of the hpvm tensor runtime + + Function* VI = M.getFunction("llvm.visc.init"); + assert(VI->getNumUses() == 1 && "__visc__init should only be used once\n"); + InitCall = cast<Instruction>(*VI->user_begin()); + CallInst::Create(llvm_hpvm_initTensorRt, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(M.getContext()), 0)), + "", InitCall); + + Function* VC = M.getFunction("llvm.visc.cleanup"); + assert(VC->getNumUses() == 1 && "__visc__clear should only be used once\n"); + CleanupCall = cast<Instruction>(*VC->user_begin()); + CallInst::Create(llvm_hpvm_cleanupTensorRt, ArrayRef<Value*>(), "", CleanupCall); + +} + +void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFInternalNode* N) { + errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n"; + errs () << "Skipping internal node\n"; +} + + +void CGT_ReplaceApproxHPVMIntrinsicsWithFCalls::codeGen(DFLeafNode* N) { + + // Skip if it is a dummy node + if(N->isDummyNode()) { + DEBUG(errs() << "Skipping dummy node\n"); + return; + } + + // Abort if it is an allocation node + if(N->isAllocationNode()) { + assert(false && "Allocation Node not expected in ApproxHPVM"); + return; + } + + // Search for intrinsic only if it has the right hint + if (!checkPreferredTarget(N, visc::CPU_TARGET)) { + errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; + return; + } + + // Get the function associated with the dataflow node + Function *F = N->getFuncPointer(); + errs()<<"function name = "<< F->getName()<<"\n"; + + std::vector<IntrinsicInst *> IItoRemove; + + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + if (BuildDFG::isViscIntrinsic(I)) { + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert((II->getCalledFunction()->getName()).startswith("llvm.visc.tensor") + && "Only HPVM tensor intrinsics allowed in ApproxHPVM leaf nodes\n"); + /********************* Handle VISC Tensor intrinsics ********************/ + // We replace them with calls to functions with implementations at the LLVM level + switch (II->getIntrinsicID()) { + + case Intrinsic::visc_tensor_convolution: + { /* llvm.hpvm.tensor.convolution */ + DEBUG(errs() << F->getName() << "\t: Handling tensor convolution \n"); + + // Argument list for the runtime call + std::vector<Value*> Args; + Args.push_back(II->getOperand(0)); + Args.push_back(II->getOperand(1)); + Args.push_back(II->getOperand(2)); + Args.push_back(II->getOperand(3)); + Args.push_back(II->getOperand(4)); + Args.push_back(II->getOperand(5)); + + Constant* conv_mode = 
ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+        Constant* conv_precision = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+
+        Args.push_back(conv_mode);
+        Args.push_back(conv_precision);
+
+        // Create function call
+        Constant* tensorConvolutionCPU;
+        DECLARE(tensorConvolutionCPU);
+
+        CallInst* CI = CallInst::Create(tensorConvolutionCPU,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.convolution with the LLVM call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_mul:
+      { /* llvm.hpvm.tensor.mul */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor mul\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create function call
+        Constant* tensorGemmCPU;
+        DECLARE(tensorGemmCPU);
+
+        CallInst* CI = CallInst::Create(tensorGemmCPU,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.mul with the LLVM call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_add:
+      { /* llvm.hpvm.tensor.add */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor add\n");
+        // Tensor add(a,b) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+
+        // FIXME: remove this comment - must check for in-place
+        //assert(inplace &&
+        //       "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
+
+        // Create function call
+        Constant* tensorAddCPU;
+        DECLARE(tensorAddCPU);
+        CallInst::Create(tensorAddCPU, Args, "", II);
+        // We can replace the call to hpvm.tensor.add with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_pool_max:
+      case Intrinsic::visc_tensor_pool_mean:
+      { /* llvm.visc.tensor.pool.max / llvm.visc.tensor.pool.mean */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor pooling\n");
+        // The input tensor operand is tested for in place operation,
+        // mirroring the handling of the in place intrinsics above.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list - tensorPooling(input, poolFunction, window_height,
+        // window_width, vertical_pad, horizontal_pad, vertical_stride,
+        // horizontal_stride);
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        int pool_type = 0;
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_max){
+          pool_type = 0;
+        }
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_pool_mean){
+          pool_type = 1;
+        }
+
+        Constant* constPoolType = ConstantInt::get(Type::getInt32Ty(M.getContext()), pool_type);
+        Args.push_back(constPoolType); // pooling function ID: 0 for max,
+                                       // 1 for mean; other pooling modes
+                                       // would use further non-zero IDs
+        Args.push_back(II->getOperand(1));
+        Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+        Args.push_back(II->getOperand(6));
+
+        // Create function call
+        Constant* tensorPoolingCPU;
+        DECLARE(tensorPoolingCPU);
+        CallInst* CI = CallInst::Create(tensorPoolingCPU, Args, "", II);
+
+        // Replacing intrinsic result uses with the result of the LLVM call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_relu:
+      case Intrinsic::visc_tensor_clipped_relu:
+      case Intrinsic::visc_tensor_tanh:
+      { /* llvm.visc.tensor.{relu, clipped.relu, tanh} */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor activation functions\n");
+        // Tensor relu(a), and the other activations, are in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        if (II->getIntrinsicID() == Intrinsic::visc_tensor_relu){
+          // Create function call
+          Constant* tensorReluCPU;
+          DECLARE(tensorReluCPU);
+          CallInst::Create(tensorReluCPU, Args, "", II);
+        }
+        else if (II->getIntrinsicID() == Intrinsic::visc_tensor_clipped_relu){
+          // Create function call
+          //-- Constant* tensorClippedRelu;
+          Constant* tensorRelu2CPU;
+          DECLARE(tensorRelu2CPU);
+          CallInst::Create(tensorRelu2CPU, Args, "", II);
+        }
+        else if (II->getIntrinsicID() == Intrinsic::visc_tensor_tanh){
+          // Create function call
+          Constant* tensorTanhCPU;
+          DECLARE(tensorTanhCPU);
+          CallInst::Create(tensorTanhCPU, Args, "", II);
+        }
+
+        // We can replace the call to hpvm.tensor.relu with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      case Intrinsic::visc_tensor_softmax:
+      { /* llvm.visc.tensor.softmax */
+        DEBUG(errs() << F->getName() << "\t: Handling tensor softmax\n");
+        // Tensor softmax(a) is in place for argument a.
+        Value *Op = II->getOperand(0);
+
+        // Test the intrinsic operand for in place operation.
+        bool inplace = isValidOperandForInPlaceOperation(Op, F, N);
+        // Code generation cannot continue if this is false, because the target
+        // only provides an in place operation
+        assert(inplace &&
+               "Operand not valid for in place operation. Code gen aborted.\n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+
+        // Create function call
+        Constant* tensorSoftmaxCPU;
+        DECLARE(tensorSoftmaxCPU);
+        CallInst::Create(tensorSoftmaxCPU, Args, "", II);
+        // We can replace the call to hpvm.tensor.softmax with the 1st argument
+        // that, due to in place operation, now contains the result
+        II->replaceAllUsesWith(II->getOperand(0));
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
+      default:
+        llvm_unreachable("Unknown VISC Intrinsic!");
+        break;
+
+      }
+
+    }
+
+  }
+
+  // We need to do this explicitly: DCE pass may not remove them.
+ // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around. + for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(), + re = IItoRemove.rend(); ri != re; ++ri) { + DEBUG(errs() << "Erasing: " << **ri << "\n"); + errs() << "Erasing: " << **ri << "\n"; + (*ri)->eraseFromParent(); + } + + return; +} + +bool DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls PASS\n"; + + // Get the BuildDFG Analysis Results: + // - Dataflow graph + BuildDFG &DFG = getAnalysis<BuildDFG>(); + + // Get the In Place Analysis Results + InPlaceDFGAnalysis::InPlaceDFGParameter IPP = + (getAnalysis<InPlaceDFGAnalysisWrapper>()).getIPP(); + // Print results + printInPlaceDFGParameter(IPP); + + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + + // Visitor for Code Generation Graph Traversal + CGT_ReplaceApproxHPVMIntrinsicsWithFCalls *CGTVisitor = + new CGT_ReplaceApproxHPVMIntrinsicsWithFCalls(M, DFG, IPP); + + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } + + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; + + return true; +} + + +/****************************************************************************** + * Helper functions * + ******************************************************************************/ + + +} // End of namespace + +char DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls::ID = 0; +static RegisterPass<DFG2LLVM_ReplaceApproxHPVMIntrinsicsWithFCalls> X("replace-intrinsics", + "Replace ApproxHPVM intrinsics with LLVM calls", + false /* does not modify the CFG */, + true /* transformation, * + * not just analysis */); + diff --git a/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.exports b/llvm/lib/Transforms/ReplaceIntrinsics/ReplaceIntrinsics.exports new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile index 03b3941834b462c130adab73e739b41f68cd9f05..7be710803ab2a9bf884c33c215464199f3f28217 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile @@ -15,18 +15,17 @@ APP = lenet TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_RT_SRC_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/src -# FIXME: Fix this path to be in the BUILD directories (Currently source directory) -#PATH_TO_CPU_TensorRt = $(HPVM_BUILD_DIR)/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime_cpu.ll CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 -CCFLAGS += -DDEVICE=CUDNN_TARGET LINKER_FLAGS = -lpthread -lcudart -lcurand -lOpenCL HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib -VISC_OPTFLAGS = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/ReplaceIntrinsics.so -load $(HPVM_LIB_DIR)/InlineTensorCalls.so -load $(HPVM_LIB_DIR)/DFG2LLVM_X86_dsoc.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -dfg2llvm-x86-dsoc -clearDFG +OPTFLAGS1 = 
-load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/ReplaceIntrinsics.so -load $(HPVM_LIB_DIR)/DFG2LLVM_X86_dsoc.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -replace-intrinsics -dfg2llvm-x86-dsoc -clearDFG + +OPTFLAGS2 = -load $(HPVM_LIB_DIR)/InlineTensorCalls.so -inline-tensor-calls TARGET = $(BUILD_DIR)/$(APP).opt.bc @@ -45,13 +44,16 @@ $(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp $(BUILD_DIR)/%.visc.ll: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $< -S -o $@ +#$(BUILD_DIR)/lenet_tensor_rt.bc + $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.visc.ll + $(OPT) $(OPTFLAGS1) $< -o $@ $(CC) -emit-llvm -c $(TENSOR_RT_SRC_DIR)/tensor_cpu_runtime.cc -o $(BUILD_DIR)/tensor_cpu_runtime.bc $(OPT) -always-inline $(BUILD_DIR)/tensor_cpu_runtime.bc -o $(BUILD_DIR)/tensor_cpu_runtime.bc - $(LLVM_LINK) $< $(BUILD_DIR)/tensor_cpu_runtime.bc -o $(BUILD_DIR)/lenet_tensor_rt.bc - $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/lenet_tensor_rt.bc -o $@ - $(LLVM_LINK) $@ $(VISC_RT_PATH) -o $(BUILD_DIR)/lenet_linked.bc - $(CC) $(BUILD_DIR)/lenet_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/lenet_linked $(LINKER_FLAGS) + $(LLVM_LINK) $@ $(BUILD_DIR)/tensor_cpu_runtime.bc -o $(BUILD_DIR)/lenet_tensor_rt.bc + $(OPT) $(OPTFLAGS2) $(BUILD_DIR)/lenet_tensor_rt.bc -o $(BUILD_DIR)/lenet_inline.bc + #$(LLVM_LINK) $@ $(VISC_RT_PATH) -o $(BUILD_DIR)/lenet_linked.bc + #$(CC) $(BUILD_DIR)/lenet_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/lenet_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/approximate.py b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/approximate.py new file mode 100644 index 0000000000000000000000000000000000000000..c77d43bf8ff554b77623ed0ea291d4590e52cb3a --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/approximate.py @@ -0,0 +1,87 @@ + + +import os +import sys +import subprocess + + +# Configuration variables - change for each benchmark +bench_build_dir = os.environ["LLVM_SRC_ROOT"] + "/test/VISC/DNN_Benchmarks/benchmarks/lenet/build/" +visc_file_name = "lenet.visc.ll" +num_tests = 10 +threshold_accuracy = 98.0 +binary_name = "./lenet_tune" +result_dir = "./opentuner_test_result" +num_flags = 14 # FIXME: Auto-extract the number of tensor ops from the bitcode file +error_range = 9 + + +def change_dir(): + os.chdir(bench_build_dir) + print os.getcwd() + +def setup_env(): + os.environ["LD_LIBRARY_PATH"] = os.environ["LD_LIBRARY_PATH"] + ":" + os.environ["LLVM_BUILD_ROOT"] + "/lib" + print os.environ["LD_LIBRARY_PATH"] + + +def build_binaries(): + subprocess.call("make", shell=True) + + +def run_autotuner(): + + # Change build directory to benchmark build directory + change_dir() + + LLVM_SRC_ROOT = os.environ["LLVM_SRC_ROOT"] + autotuner_cmd = "python " + LLVM_SRC_ROOT + "projects/hpvm-tensor-rt/opentuner/autotuner/approxhpvm_tuner.py " + \ + " --test-limit " + str(num_tests) + \ + " --accuracy " + str(threshold_accuracy) + \ + " --binary " + str(binary_name) + \ + " --result-dir " + str(result_dir) + \ + " --num-flags " + str(num_flags) + \ + " --error-range " + str(error_range) + + print autotuner_cmd + + subprocess.call(autotuner_cmd, shell=True) + + +def add_approx_info(): + + # Change directory and setup env variables + change_dir() + setup_env() + + subprocess.call("which opt", shell=True) + + approxinfo_cmd = "opt -load LLVMBuildDFG.so -load InsertApproxInfo.so -insert-approxinfo --results-dir " + \ + result_dir + " " + " " + \ + 
visc_file_name + " -S -o " + visc_file_name + "_approx.ll" + + print approxinfo_cmd + subprocess.call(approxinfo_cmd, shell=True) + + + + +def run_scheduler(): + + change_dir() + setup_env() + + sched_cmd = "opt -load LLVMBuildDFG.so -load ApproxScheduler.so -approx-scheduler --category quad --rank 4 " + \ + visc_file_name + "_approx.ll" + " -S -o " + visc_file_name + "_sched_out.ll" + print sched_cmd + subprocess.call(sched_cmd, shell=True) + + + +if __name__ == "__main__": + + #build_binaries() + #run_autotuner() + #add_approx_info() + run_scheduler() + diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/opentuner_run.sh b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/opentuner_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..13209394589933ad648e8a9ede1b6fc3b013a264 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/opentuner_run.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +BUILD_DIR=${LLVM_SRC_ROOT}/test/VISC/DNN_Benchmarks/benchmarks/lenet/build/ +cd $BUILD_DIR +python ~/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/approxhpvm_tuner.py --test-limit 300 --accuracy 86.7 --binary ./lenet_tune --result-dir ./opentuner_test_result --num-flags 14 --error-range 9 + diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/setup_tyler_paths.sh b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/setup_tyler_paths.sh new file mode 100644 index 0000000000000000000000000000000000000000..3548f182f198724600aee855b66169a1bdf12a3a --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/bin/setup_tyler_paths.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# CUDNN Path setup +module load cuda-toolkit/9.1 +export CUDA_INCLUDE_PATH=/software/cuda-9.1/include +export CUDNN_PATH=/software/cuda-9.1/lib64/ +export LIBRARY_PATH=/software/cuda-9.1/lib64/:$LIBRARY_PATH +export LD_LIBRARY_PATH=/software/cuda-9.1/lib64/:$LD_LIBRARY_PATH + +# HPVM Path setup +export CPATH=$CPATH:/home/hsharif3/anaconda2/include/ +export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH +export LLVM_BUILD_ROOT=/home/hsharif3/Gitlab/hpvm/build/ +export LLVM_SRC_ROOT=/home/hsharif3/Gitlab/hpvm/llvm/ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp index 1746fc13dc4809f8c3d806fa144903fac50f3315..67213d38302982ee677ec0337aad5728d6de27ea 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp @@ -10,7 +10,7 @@ #include <tensorUtils.h> void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::CPU_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -18,7 +18,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::CPU_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -26,7 +26,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_2_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::CPU_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_relu(t1); @@ -34,7 +34,7 @@ void var_2_node(void* t1, size_t bytes_t1) { } void var_3_node(void* t1, size_t bytes_t1) { - 
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp
index 1746fc13dc4809f8c3d806fa144903fac50f3315..67213d38302982ee677ec0337aad5728d6de27ea 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/src/lenet.cpp
@@ -10,7 +10,7 @@
 #include <tensorUtils.h>
 
 void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1);
@@ -18,7 +18,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_add(t1, t2);
@@ -26,7 +26,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_2_node(void* t1, size_t bytes_t1) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, t1, 0);
 
   void* r = __visc__tensor_relu(t1);
@@ -34,7 +34,7 @@ void var_2_node(void* t1, size_t bytes_t1) {
 }
 
 void var_3_node(void* t1, size_t bytes_t1) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, t1, 0);
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2);
@@ -42,7 +42,7 @@ void var_3_node(void* t1, size_t bytes_t1) {
 }
 
 void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1);
@@ -50,7 +50,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_add(t1, t2);
@@ -58,7 +58,7 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_6_node(void* t1, size_t bytes_t1) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, t1, 0);
 
   void* r = __visc__tensor_relu(t1);
@@ -66,7 +66,7 @@ void var_6_node(void* t1, size_t bytes_t1) {
 }
 
 void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 2, 2);
@@ -74,7 +74,7 @@ void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_add(t1, t2);
@@ -82,7 +82,7 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_9_node(void* t1, size_t bytes_t1) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, t1, 0);
 
   void* r = __visc__tensor_relu(t1);
@@ -90,7 +90,7 @@ void var_9_node(void* t1, size_t bytes_t1) {
 }
 
 void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_mul(t1, t2);
@@ -98,7 +98,7 @@ void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_add(t1, t2);
@@ -106,7 +106,7 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_12_node(void* t1, size_t bytes_t1) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, t1, 0);
 
   void* r = __visc__tensor_relu(t1);
@@ -114,7 +114,7 @@ void var_12_node(void* t1, size_t bytes_t1) {
 }
 
 void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_mul(t1, t2);
@@ -122,7 +122,7 @@ void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, t1, t2, 0);
 
   void *r = __visc__tensor_add(t1, t2);
@@ -130,7 +130,7 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_15_node(void* t1, size_t bytes_t1) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, t1, 0);
 
   void* r = __visc__tensor_relu(t1);
@@ -138,7 +138,7 @@ void var_15_node(void* t1, size_t bytes_t1) {
 }
 
 void var_16_node(void* t1, size_t bytes_t1) {
-  __visc__hint(visc::CUDNN_TARGET);
+  __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, t1, 0);
 
   void* r = __visc__tensor_softmax(t1);
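The lenet.cpp changes above flip the same one-line hint, __visc__hint(visc::CUDNN_TARGET) to __visc__hint(visc::CPU_TARGET), in all seventeen node functions (var_0_node through var_16_node). A change this mechanical can be scripted when retargeting other DNN benchmarks; the retarget helper below is a hypothetical sketch, not part of this patch:

    import re
    import sys

    # Rewrite every __visc__hint(visc::<old>) in a benchmark source file to
    # visc::<new>. This reproduces exactly what the lenet.cpp diff above
    # does by hand in each node function.
    def retarget(path, old="CUDNN_TARGET", new="CPU_TARGET"):
        with open(path) as f:
            src = f.read()
        src = re.sub(r"__visc__hint\(visc::%s\)" % old,
                     "__visc__hint(visc::%s)" % new, src)
        with open(path, "w") as f:
            f.write(src)

    # Usage: python retarget.py src/lenet.cpp
    if __name__ == "__main__":
        retarget(sys.argv[1])
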