From 024890d34228c3e5237adadc215905be6cf7c7b5 Mon Sep 17 00:00:00 2001 From: Akash Kothari <akashk4@tyler.cs.illinois.edu> Date: Mon, 21 Dec 2020 09:18:20 -0600 Subject: [PATCH] Remove the dsoc x86 passes --- lib/DFG2LLVM_X86_dsoc/CMakeLists.txt | 13 - lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports | 0 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp | 2128 ------------------- lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt | 22 - 4 files changed, 2163 deletions(-) delete mode 100644 lib/DFG2LLVM_X86_dsoc/CMakeLists.txt delete mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports delete mode 100644 lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp delete mode 100644 lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt diff --git a/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt b/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt deleted file mode 100644 index 75569addda..0000000000 --- a/lib/DFG2LLVM_X86_dsoc/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -if(WIN32 OR CYGWIN) - set(LLVM_LINK_COMPONENTS Core Support) -endif() - -add_llvm_loadable_module( DFG2LLVM_X86_dsoc - DFG2LLVM_X86_dsoc.cpp - - DEPENDS - intrinsics_gen - PLUGIN_TOOL - opt - ) - diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86.exports deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp b/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp deleted file mode 100644 index fbe5e4f6bd..0000000000 --- a/lib/DFG2LLVM_X86_dsoc/DFG2LLVM_X86_dsoc.cpp +++ /dev/null @@ -1,2128 +0,0 @@ -//===-------------------------- DFG2LLVM_X86.cpp --------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "DFG2LLVM_X86" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Linker/Linker.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Constant.h" -#include "llvm/SupportVISC/DFG2LLVM.h" - -using namespace llvm; -using namespace builddfg; -using namespace dfg2llvm; - -// VISC Command line option to use timer or not -static cl::opt<bool> -VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers")); -// Command line option to enable device abstraction or not -static cl::opt<bool> -DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden, - cl::desc("Enable visc device abstraction")); - - -namespace { - -// Helper Functions -static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) { - if (!isa<CallInst>(I)) - return false; - CallInst *CI = cast<CallInst>(I); - return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion"); -} - -CallInst *get_llvm_visc_policy_getVersion_call(Function *F) { - for (inst_iterator ib = inst_begin(F), ie = inst_end(F); ib != ie; ++ib) { - Instruction *I = &*ib; - if (isVISCCall_llvm_visc_policy_getVersion(I)) - return cast<CallInst>(I); - } - return NULL; -} - -// DFG2LLVM_X86 - The first implementation. -struct DFG2LLVM_X86 : public DFG2LLVM { - static char ID; // Pass identification, replacement for typeid - DFG2LLVM_X86() :DFG2LLVM(ID) {} - -private: - // Member variables - - // Functions - -public: - bool runOnModule(Module &M); -}; - -// Visitor for Code generation traversal (tree traversal for now) -class CGT_X86 : public CodeGenTraversal { - -private: - //Member variables - - Constant* malloc; - // VISC Runtime API - Constant* llvm_visc_x86_launch; - Constant* llvm_visc_x86_wait; - Constant* llvm_visc_x86_argument_ptr; - - Constant* llvm_visc_streamLaunch; - Constant* llvm_visc_streamPush; - Constant* llvm_visc_streamPop; - Constant* llvm_visc_streamWait; - Constant* llvm_visc_createBindInBuffer; - Constant* llvm_visc_createBindOutBuffer; - Constant* llvm_visc_createEdgeBuffer; - Constant* llvm_visc_createLastInputBuffer; - Constant* llvm_visc_createThread; - //Constant* llvm_visc_freeThreads; - Constant* llvm_visc_bufferPush; - Constant* llvm_visc_bufferPop; - Constant* llvm_visc_x86_dstack_push; - Constant* llvm_visc_x86_dstack_pop; - Constant* llvm_visc_x86_getDimLimit; - Constant* llvm_visc_x86_getDimInstance; - - //Functions - std::vector<IntrinsicInst*>* getUseList(Value* LI); - Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = ""); - void addDoWhileLoop(Instruction*, Instruction*, Value*); - void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*); - Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *); - Argument* getArgumentFromEnd(Function* F, unsigned offset); - Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, - Instruction* InsertBefore); - void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, - Instruction* InsertBefore); - void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, - Instruction* InsertBefore); - StructType* getArgumentListStructTy(DFNode*); - Function* createFunctionFilter(DFNode* C); - void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>, - Value*, Value*, Instruction*); - Function* createLaunchFunction(DFInternalNode*); - Function* createPushFunction(DFInternalNode*); - Function* createPopFunction(DFInternalNode*); - Function* createWaitFunction(DFInternalNode*); - - // Virtual Functions - void init() { - VISCTimer = VISCTimer_X86; - TargetName = "X86"; - } - void initRuntimeAPI(); - void codeGen(DFInternalNode* N); - void codeGen(DFLeafNode* N); - Function* codeGenStreamPush(DFInternalNode* N); - Function* codeGenStreamPop(DFInternalNode* N); - -public: - // Constructor - CGT_X86(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) { - init(); - initRuntimeAPI(); - } - - void codeGenLaunch(DFInternalNode* Root); - void codeGenLaunchStreaming(DFInternalNode* Root); -}; - -bool DFG2LLVM_X86::runOnModule(Module &M) { - errs() << "\nDFG2LLVM_X86 PASS\n"; - - // Get the BuildDFG Analysis Results: - // - Dataflow graph - // - Maps from i8* hansles to DFNode and DFEdge - BuildDFG &DFG = getAnalysis<BuildDFG>(); - - //DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode*> Roots = DFG.getRoots(); - // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); - // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - - // Visitor for Code Generation Graph Traversal - CGT_X86 *CGTVisitor = new CGT_X86(M, DFG); - - // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode: Roots) { - // Initiate code generation for root DFNode - CGTVisitor->visit(rootNode); - // Go ahead and replace the launch intrinsic with pthread call, otherwise return now. - // TODO: Later on, we might like to do this in a separate pass, which would - // allow us the flexibility to switch between complete static code generation - // for DFG or having a customized runtime+scheduler - - // Do streaming code generation if root node is streaming. Usual otherwise - if(rootNode->isChildGraphStreaming()) - CGTVisitor->codeGenLaunchStreaming(rootNode); - else - CGTVisitor->codeGenLaunch(rootNode); - } - - delete CGTVisitor; - return true; -} - -// Initialize the VISC runtime API. This makes it easier to insert these calls -void CGT_X86::initRuntimeAPI() { - - // Load Runtime API Module - SMDiagnostic Err; - - char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); - assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); - - // FIXME: hardcoded path to 'build_dsoc' - should probably be a environment variable - Twine llvmSrcRoot = LLVM_SRC_ROOT; - Twine runtimeAPI = llvmSrcRoot+"/../build_dsoc/projects/visc-rt/visc-rt.ll"; - - runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - - if(runtimeModule == NULL) - DEBUG(errs() << Err.getMessage()); - else - DEBUG(errs() << "Successfully loaded visc-rt API module\n"); - - // Get or insert the global declarations for launch/wait functions - DECLARE(llvm_visc_x86_launch); - DECLARE(malloc); - DECLARE(llvm_visc_x86_wait); - DECLARE(llvm_visc_x86_argument_ptr); - DECLARE(llvm_visc_streamLaunch); - DECLARE(llvm_visc_streamPush); - DECLARE(llvm_visc_streamPop); - DECLARE(llvm_visc_streamWait); - DECLARE(llvm_visc_createBindInBuffer); - DECLARE(llvm_visc_createBindOutBuffer); - DECLARE(llvm_visc_createEdgeBuffer); - DECLARE(llvm_visc_createLastInputBuffer); - DECLARE(llvm_visc_createThread); - //DECLARE(llvm_visc_freeThreads); - DECLARE(llvm_visc_bufferPush); - DECLARE(llvm_visc_bufferPop); - DECLARE(llvm_visc_x86_dstack_push); - DECLARE(llvm_visc_x86_dstack_pop); - DECLARE(llvm_visc_x86_getDimLimit); - DECLARE(llvm_visc_x86_getDimInstance); - - // Get or insert timerAPI functions as well if you plan to use timers - initTimerAPI(); - - // Insert init context in main - Function* VI = M.getFunction("llvm.visc.init"); - assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); - DEBUG(errs() << "Inserting x86 timer initialization\n"); - Instruction* I = cast<Instruction>(*VI->user_begin()); - initializeTimerSet(I); - switchToTimer(visc_TimerID_NONE, I); - // Insert code for initializing the sceduling policy - Function *IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_init", - runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType())); - CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); - DEBUG(errs() << *IPCallInst << "\n"); - - // If device abstraction is enabled, we add a runtime call to start the - // device status simulation - if (DeviceAbstraction) { - Function *ID = - cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_start", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType())); - CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I); - DEBUG(errs() << *IDCallInst << "\n"); - } - - // Insert print instruction at visc exit - Function* VC = M.getFunction("llvm.visc.cleanup"); - assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); - - // Insert code for clearing the sceduling policy - I = cast<Instruction>(*VC->user_begin()); - IP = cast<Function>(M.getOrInsertFunction("llvm_visc_policy_clear", - runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType())); - IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I); - errs() << *IPCallInst << "\n"; - - DEBUG(errs() << "Inserting x86 timer print\n"); - printTimerSet(I); - - // If device abstraction is enabled, we add a runtime call to end the - // device status simulation - if (DeviceAbstraction) { - Function *ID = - cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_end", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType())); - CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I); - DEBUG(errs() << *IDCallInst << "\n"); - } - -} - -/* Returns vector of all wait instructions - */ -std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) { - std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>(); - // It must have been loaded from memory somewhere - for(Value::user_iterator ui = GraphID->user_begin(), - ue = GraphID->user_end(); ui!=ue; ++ui) { - if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) { - UseList->push_back(waitI); - } - //else if (PHINode* PN = dyn_cast<PHINode>(*ui)){ - //errs() << "Found PhiNode use of graphID\n"; - //std::vector<IntrinsicInst*>* phiUseList = getUseList(PN); - //UseList->insert(UseList->end(), phiUseList->begin(), phiUseList->end()); - //free(phiUseList); - //} - else { - llvm_unreachable("Error: Operation on Graph ID not supported!\n"); - } - } - return UseList; -} - -/* Traverse the function argument list in reverse order to get argument at a - * distance offset fromt he end of argument list of function F - */ -Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) { - assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) - && "Invalid offset to access arguments!"); - Function::arg_iterator e = F->arg_end(); - // Last element of argument iterator is dummy. Skip it. - e--; - Argument* arg; - for( ; offset != 0; e--) { - offset--; - arg = &*e; - } - return arg; -} - -/* Add Loop around the instruction I - * Algorithm: - * (1) Split the basic block of instruction I into three parts, where the - * middleblock/body would contain instruction I. - * (2) Add phi node before instruction I. Add incoming edge to phi node from - * predecessor - * (3) Add increment and compare instruction to index variable - * (4) Replace terminator/branch instruction of body with conditional branch - * which loops over bidy if true and goes to end if false - * (5) Update phi node of body - */ -void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart, - Instruction* BodyEnd, Value* TerminationCond) { - BasicBlock* Entry = CondBlockStart->getParent(); - BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); - BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); - BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); - - // Replace the terminator instruction of conditional with new conditional - // branch which goes to while.body if true and branches to while.end otherwise - BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); - ReplaceInstWithInst(CondBlock->getTerminator(), BI); - - // While Body should jump to condition block - BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock); - ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch); - -} - -Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond, - BasicBlock *Body) { - Module *M = Entry->getParent()->getParent(); - Type *Int64Ty = Type::getInt64Ty(M->getContext()); - - // Insert a PHI instruction at the beginning of the condition block - Instruction *IB = Cond->getFirstNonPHI(); - PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB); - - ConstantInt *IConst = - ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true); - Instruction *CounterIncr = - BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst, - "cnt_incr", Body->getTerminator()); - - // Set incoming values for Phi node - IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true); - CounterPhi->addIncoming(IConst, Entry); - CounterPhi->addIncoming(CounterIncr, Body); - - // Return the pointer to the created PHI node in the corresponding argument - return CounterPhi; -} - -/* Add Loop around the instruction I - * Algorithm: - * (1) Split the basic block of instruction I into three parts, where the - * middleblock/body would contain instruction I. - * (2) Add phi node before instruction I. Add incoming edge to phi node from - * predecessor - * (3) Add increment and compare instruction to index variable - * (4) Replace terminator/branch instruction of body with conditional branch - * which loops over bidy if true and goes to end if false - * (5) Update phi node of body - */ -void CGT_X86::addDoWhileLoop(Instruction* From, Instruction* To, Value* TerminationCond) { - BasicBlock* Entry = From->getParent(); - BasicBlock* ForBody = Entry->splitBasicBlock(From, "for.body"); - - // To Instruction should also belong to the same basic block as the From basic - // block will have a terminator instruction - assert(To->getParent() == ForBody - && "To Instruction should also belong to the same basic block!"); - BasicBlock* ForEnd = ForBody->splitBasicBlock(To, "for.end"); - - // Replace the terminator instruction of for.body with new conditional - // branch which loops over body if true and branches to for.end otherwise - BranchInst* BI = BranchInst::Create(ForEnd, ForBody, TerminationCond); - ReplaceInstWithInst(ForBody->getTerminator(), BI); - -} - -/* Add Loop around the instruction I - * Algorithm: - * (1) Split the basic block of instruction I into three parts, where the - * middleblock/body would contain instruction I. - * (2) Add phi node before instruction I. Add incoming edge to phi node from - * predecessor - * (3) Add increment and compare instruction to index variable - * (4) Replace terminator/branch instruction of body with conditional branch - * which loops over bidy if true and goes to end if false - * (5) Update phi node of body - */ -Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) { - BasicBlock* Entry = I->getParent(); - BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body"); - - BasicBlock::iterator i(I); - ++i; - Instruction* NextI = &*i; - // Next Instruction should also belong to the same basic block as the basic - // block will have a terminator instruction - assert(NextI->getParent() == ForBody - && "Next Instruction should also belong to the same basic block!"); - BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end"); - - - // Add Phi Node for index variable - PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), - 2, "index."+indexName, I); - - // Add incoming edge to phi - IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0), - Entry); - // Increment index variable - BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add, - IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1), - "index."+indexName+".inc", ForBody->getTerminator()); - - // Compare index variable with limit - CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, - limit, "cond."+indexName, ForBody->getTerminator()); - - // Replace the terminator instruction of for.body with new conditional - // branch which loops over body if true and branches to for.end otherwise - BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond); - ReplaceInstWithInst(ForBody->getTerminator(), BI); - - // Add incoming edge to phi node in body - IndexPhi->addIncoming(IndexInc, ForBody); - return IndexPhi; -} - -// Returns a packed struct type. The structtype is created by packing the input -// types, output types and isLastInput buffer type. All the streaming -// inputs/outputs are converted to i8*, since this is the type of buffer -// handles. -StructType* CGT_X86::getArgumentListStructTy(DFNode* C) { - std::vector<Type*> TyList; - // Input types - Function* CF = C->getFuncPointer(); - for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end(); - ai != ae; ++ai) { - if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge()) - TyList.push_back(Type::getInt8PtrTy(CF->getContext())); - else - TyList.push_back(ai->getType()); - } - // Output Types - StructType* OutStructTy = cast<StructType>(CF->getReturnType()); - for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) { - // All outputs of a node are streaming edge - assert(C->getOutDFEdgeAt(i)->isStreamingEdge() - && "All output edges of child node have to be streaming"); - TyList.push_back(Type::getInt8PtrTy(CF->getContext())); - } - // isLastInput buffer element - TyList.push_back(Type::getInt8PtrTy(CF->getContext())); - - StructType* STy = StructType::create(CF->getContext(), TyList, - Twine("struct.thread."+CF->getName()).str(), true); - return STy; - -} - -void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*> - EdgeBufferMap, Value* isLastInputBuffer, Value* graphID, - Instruction* IB) { - DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n"); - // Create a filter/pipeline function for the child node - Function* C_Pipeline = createFunctionFilter(C); - Function* CF = C->getFuncPointer(); - - // Get module context and i32 0 constant, as they would be frequently used in - // this function. - LLVMContext& Ctx = IB->getParent()->getContext(); - Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); - - // Marshall arguments - // Create a packed struct type with inputs of C followed by outputs and then - // another i8* to indicate isLastInput buffer. Streaming inputs are replaced - // by i8* - // - StructType* STy = getArgumentListStructTy(C); - // Allocate the struct on heap *NOT* stack and bitcast i8* to STy* - CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)), - C->getFuncPointer()->getName()+".inputs", IB); - CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB); - //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB); - // Insert elements in the struct - DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n"); - // Marshall Inputs - for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) { - // Create constant int (i) - Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i); - // Get Element pointer instruction - Value* GEPIndices[] = { IntZero, Int_i }; - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, - ArrayRef<Value*>(GEPIndices, 2), - Struct->getName()+".arg_"+Twine(i), - IB); - DFEdge* E = C->getInDFEdgeAt(i); - if (E->getSourceDF()->isEntryNode()) { - // This is a Bind Input Edge - if(E->isStreamingEdge()) { - // Streaming Bind Input edge. Get buffer corresponding to it - assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!"); - new StoreInst(EdgeBufferMap[E], GEP, IB); - } - else { - // Non-streaming Bind edge - new StoreInst(Args[i], GEP, IB); - } - } - else { - // This is an edge between siblings. - // This must be an streaming edge. As it is our assumption that all edges - // between two nodes in a DFG are streaming. - assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!"); - new StoreInst(EdgeBufferMap[E], GEP, IB); - } - } - unsigned numInputs = CF->getFunctionType()->getNumParams(); - unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements(); - // Marshall Outputs - DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n"); - for(unsigned i = 0; i < numOutputs; i++ ) { - // Create constant int (i+numInputs) - Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs); - // Get Element pointer instruction - Value* GEPIndices[] = { IntZero, Int_i }; - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, - ArrayRef<Value*>(GEPIndices, 2), - Struct->getName()+".out_"+Twine(i), - IB); - DFEdge* E = C->getOutDFEdgeAt(i); - assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes"); - assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!"); - new StoreInst(EdgeBufferMap[E], GEP, IB); - } - // Marshall last argument. isLastInput buffer - DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n"); - // Create constant int (i+numInputs) - Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs); - // Get Element pointer instruction - Value* GEPIndices[] = { IntZero, Int_index }; - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct, - ArrayRef<Value*>(GEPIndices, 2), - Struct->getName()+".isLastInput", IB); - new StoreInst(isLastInputBuffer, GEP, IB); - - // AllocaInst AI points to memory with all the arguments packed - // Call runtime to create the thread with these arguments - DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << *llvm_visc_createThread << "\n"); - DEBUG(errs() << *graphID->getType() << "\n"); - DEBUG(errs() << *C_Pipeline->getType() << "\n"); - DEBUG(errs() << *Struct->getType() << "\n"); - // Bitcast AI to i8* - CastInst* BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB); - Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI}; - CallInst* CreateThread = CallInst::Create(llvm_visc_createThread, - ArrayRef<Value*>(CreateThreadArgs, 3), - "", - IB); - -} - -Function* CGT_X86::createLaunchFunction(DFInternalNode* N) { - DEBUG(errs() << "Generating Streaming Launch Function\n"); - // Get Function associated with Node N - Function* NF = N->getFuncPointer(); - - // Map from Streaming edge to buffer - DenseMap<DFEdge*, Value*> EdgeBufferMap; - - /* Now we have all the necessary global declarations necessary to generate the - * Launch function, pointer to which can be passed to pthread utils to execute - * DFG. The Launch function has just one input: i8* data.addr - * This is the address of the all the input data that needs to be passed to - * this function. In our case it contains the input arguments of the Root - * function in the correct order. - * (1) Create an empty Launch function of type void (i8* args, i8* GraphID) - * (2) Extract each of inputs from data.addr - * (3) create Buffers for all the streaming edges - * - Put buffers in the context - * (4) Go over each child node - * - marshall its arguments together (use buffers in place of streaming - * arguments) - * - Start the threads - * (5) The return value from Root is stored in memory, pointer to which is - * passed to pthread_exit call. - */ - // (1) Create Launch Function of type void (i8* args, i8* GraphID) - Type* i8Ty = Type::getInt8Ty(M.getContext()); - Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()}; - FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()), - ArrayRef<Type*>(ArgTypes, 2), false); - Function* LaunchFunc = Function::Create(LaunchFuncTy, - NF->getLinkage(), - NF->getName()+".LaunchFunction", - &M); - DEBUG(errs() << "Generating Code for Streaming Launch Function\n"); - // Give a name to the argument which is used pass data to this thread - Argument* data = &*LaunchFunc->arg_begin(); - Argument* graphID = &*(++LaunchFunc->arg_begin()); - data->setName("data.addr"); - graphID->setName("graphID"); - // Add a basic block to this empty function and a return null statement to it - DEBUG(errs() << *LaunchFunc->getReturnType() << "\n"); - BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc); - ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(), - BB); - - DEBUG(errs() << "Created Empty Launch Function\n"); - - // (2) Extract each of inputs from data.addr - std::vector<Type*> TyList; - std::vector<std::string> names; - std::vector<Value*> Args; - - for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end(); - ai != ae; ++ai) { - if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) { - TyList.push_back(i8Ty->getPointerTo()); - names.push_back(Twine(ai->getName()+"_buffer").str()); - continue; - } - TyList.push_back(ai->getType()); - names.push_back(ai->getName()); - } - Args = extractElements(data, TyList, names, RI); - DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc << "\n"); - // (3) Create buffers for all the streaming edges - for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(), - de = N->getChildGraph()->dfedge_end(); di != de; ++di) { - DFEdge* Edge = *di; - DEBUG(errs() << *Edge->getType() << "\n"); - Value* size = ConstantExpr::getSizeOf(Edge->getType()); - Value* CallArgs[] = {graphID, size}; - if (Edge->isStreamingEdge()) { - CallInst* CI; - // Create a buffer call - if(Edge->getSourceDF()->isEntryNode()) { - // Bind Input Edge - Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()), - Edge->getSourcePosition()); - Value* BindInCallArgs[] = {graphID, size, Int_ArgNo}; - CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3), - "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(), - RI); - } - else if(Edge->getDestDF()->isExitNode()) { - // Bind Output Edge - CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2), - "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(), - RI); - } - else { - // Streaming Edge - CI = CallInst::Create(llvm_visc_createEdgeBuffer, - ArrayRef<Value*>(CallArgs, 2), - Edge->getSourceDF()->getFuncPointer()->getName()+"." - +Edge->getDestDF()->getFuncPointer()->getName(), - RI); - } - EdgeBufferMap[Edge] = CI; - } - } - // Create buffer for isLastInput for all the child nodes - DFGraph* G = N->getChildGraph(); - DenseMap<DFNode*, Value*> NodeLastInputMap; - for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) { - DFNode* child = *ci; - if(child->isDummyNode()) - continue; - Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext())); - Value* CallArgs[] = {graphID, size}; - CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2), - "BindIn.isLastInput."+child->getFuncPointer()->getName(), - RI); - NodeLastInputMap[child] = CI; - } - DEBUG(errs() << "Start Each child node filter\n"); - // (4) Marshall arguments for each child node and start the thread with its - // pipeline funtion - for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); ci != ce; ++ci) { - DFNode* C = *ci; - // Skip dummy node call - if (C->isDummyNode()) - continue; - - // Marshall all the arguments for this node into an i8* - // Pass to the runtime to create the thread - // Start the thread for child node C - startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI); - } - - DEBUG(errs() << "Launch function:\n"); - DEBUG(errs() << *LaunchFunc << "\n"); - - return LaunchFunc; -} - - -Function* CGT_X86::createPushFunction(DFInternalNode* N) { - DEBUG(errs() << "Generating Push function\n"); - Function* PushFunc; - return PushFunc; -} - -Function* CGT_X86::createPopFunction(DFInternalNode* N) { - DEBUG(errs() << "Generating Pop function\n"); - Function* PushFunc; - return PushFunc; -} - -Function* CGT_X86::createWaitFunction(DFInternalNode* N) { - DEBUG(errs() << "Generating Wait function\n"); - Function* PushFunc; - return PushFunc; -} -/* This fuction does the steps necessary to launch a streaming graph - * Steps - * Create Pipeline/Filter function for each node in child graph of Root - * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait - * Modify each of the instrinsic in host code - * Launch, Push, Pop, Wait - */ -void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) { - IntrinsicInst* LI = Root->getInstruction(); - Function* RootLaunch = createLaunchFunction(Root); - //Function* RootPush = createPushFunction(Root); - //Function* RootPop = createPopFunction(Root); - //Function* RootWait = createWaitFunction(Root); - // Substitute launch intrinsic main - DEBUG(errs() << "Substitute launch intrinsic\n"); - Value* LaunchInstArgs[] = {RootLaunch, - LI->getArgOperand(1) - }; - CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch, - ArrayRef<Value*>(LaunchInstArgs,2), - "graph"+Root->getFuncPointer()->getName(), LI); - //ReplaceInstWithInst(LI, LaunchInst); - - DEBUG(errs() << *LaunchInst << "\n"); - // Replace all wait instructions with x86 specific wait instructions - DEBUG(errs() << "Substitute wait, push, pop intrinsics\n"); - std::vector<IntrinsicInst*>* UseList = getUseList(LI); - for(unsigned i=0; i < UseList->size(); ++i) { - IntrinsicInst* II = UseList->at(i); - CallInst* CI; - Value* PushArgs[] = {LaunchInst, II->getOperand(1)}; - switch(II->getIntrinsicID()) { - case Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_streamWait, - ArrayRef<Value*>(LaunchInst), - ""); - break; - case Intrinsic::visc_push: - CI = CallInst::Create(llvm_visc_streamPush, - ArrayRef<Value*>(PushArgs, 2), - ""); - break; - case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_streamPop, - ArrayRef<Value*>(LaunchInst), - ""); - break; - default: - llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); - }; - DEBUG(errs() << "Replace:\n\t" << *II << "\n"); - ReplaceInstWithInst(II, CI); - DEBUG(errs() << "\twith " << *CI << "\n"); - } - - -} - -void CGT_X86::codeGenLaunch(DFInternalNode* Root) { - // TODO: Place an assert to check if the constant passed by launch intrinsic - // as the number of arguments to DFG is same as the number of arguments of the - // root of DFG - DEBUG(errs() << "Generating Launch Function\n"); - // Get Launch Instruction - IntrinsicInst* LI = Root->getInstruction(); - switchToTimer(visc_TimerID_PTHREAD_CREATE, LI); - DEBUG(errs() << "Generating Launch Function\n"); - - /* Now we have all the necessary global declarations necessary to generate the - * Launch function, pointer to which can be passed to pthread utils to execute - * DFG. The Launch function has just one input: i8* data.addr - * This is the address of the all the input data that needs to be passed to - * this function. In our case it contains the input arguments of the Root - * function in the correct order. - * (1) Create an empty Launch function of type i8*(i8*) - * (2) Extract each of inputs from data.addr and pass them as arguments to the - * call to Root function - * (3) The return value from Root is stored in memory, pointer to which is - * passed to pthread_exit call. - */ - // Create Launch Function of type i8*(i8*) which calls the root function - Type* i8Ty = Type::getInt8Ty(M.getContext()); - FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), - ArrayRef<Type*>(i8Ty->getPointerTo()), - false); - Function* AppFunc = Function::Create(AppFuncTy, - Root->getFuncPointer()->getLinkage(), - "LaunchDataflowGraph", - &M); - DEBUG(errs() << "Generating Launch Function\n"); - // Give a name to the argument which is used pass data to this thread - Value* data = &*AppFunc->arg_begin(); - data->setName("data.addr"); - // Add a basic block to this empty function and a return null statement to it - BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc); - ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(), - Constant::getNullValue(AppFunc->getReturnType()), - BB); - switchToTimer(visc_TimerID_ARG_UNPACK, RI); - - DEBUG(errs() << "Created Empty Launch Function\n"); - // Find the X86 function generated for Root and -// Function* RootF_X86 = Root->getGenFunc(); - Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET); - assert(RootF_X86 && "Error: No generated CPU function for Root node\n"); - assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) && - "Error: Generated Function for Root node with no x86 wrapper\n"); - - // Generate a call to RootF_X86 with null parameters for now - std::vector<Value*>Args; - for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) { - Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i))); - } - CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI); - - // Extract input data from i8* data.addr and patch them to correct argument of - // call to RootF_X86. For each argument - std::vector<Type*> TyList; - std::vector<std::string> names; - for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end(); - ai != ae; ++ai) { - TyList.push_back(ai->getType()); - names.push_back(ai->getName()); - } - std::vector<Value*> elements = extractElements(data, TyList, names, CI); - // Patch the elements to the call arguments - for(unsigned i=0; i<CI->getNumArgOperands(); i++) - CI->setArgOperand(i, elements[i]); - - // Add timers around Call to RootF_X86 function - switchToTimer(visc_TimerID_COMPUTATION, CI); - switchToTimer(visc_TimerID_OUTPUT_PACK, RI); - - // Code for returning the output - CastInst* OutputAddrCast = CastInst::CreatePointerCast(data, - CI->getType()->getPointerTo(), - CI->getName()+".addr", - RI); - new StoreInst(CI, OutputAddrCast, RI); - switchToTimer(visc_TimerID_NONE, RI); - - DEBUG(errs() << "Application specific function:\n"); - DEBUG(errs() << *AppFunc << "\n"); - - // Substitute launch intrinsic main - Value* LaunchInstArgs[] = {AppFunc, - LI->getArgOperand(1) - }; - CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch, - ArrayRef<Value*>(LaunchInstArgs,2), - "graph"+Root->getFuncPointer()->getName(), LI); - //ReplaceInstWithInst(LI, LaunchInst); - - DEBUG(errs() << *LaunchInst << "\n"); - // Replace all wait instructions with x86 specific wait instructions - std::vector<IntrinsicInst*>* UseList = getUseList(LI); - for(unsigned i=0; i < UseList->size(); ++i) { - IntrinsicInst* II = UseList->at(i); - CallInst* CI; - switch(II->getIntrinsicID()) { - case Intrinsic::visc_wait: - CI = CallInst::Create(llvm_visc_x86_wait, - ArrayRef<Value*>(LaunchInst), - ""); - break; - case Intrinsic::visc_push: - CI = CallInst::Create(llvm_visc_bufferPush, - ArrayRef<Value*>(LaunchInst), - ""); - break; - case Intrinsic::visc_pop: - CI = CallInst::Create(llvm_visc_bufferPop, - ArrayRef<Value*>(LaunchInst), - ""); - break; - default: - llvm_unreachable("GraphID is used by an instruction other than wait, push, pop"); - }; - ReplaceInstWithInst(II, CI); - DEBUG(errs() << *CI << "\n"); - } - -} - -Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) { - // TODO: Assumption is that each input port of a node has just one - // incoming edge. May change later on. - - // Find the incoming edge at the requested input port - DFEdge* E = Child->getInDFEdgeAt(i); - assert(E && "No incoming edge or binding for input element!"); - // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); - - // If Source DFNode is a dummyNode, edge is from parent. Get the - // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { - inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition()); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { - // edge is from a sibling - // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) - && "Source node call not found. Dependency violation!"); - - // Find CallInst associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; - - // Extract element at source position from this call instruction - std::vector<unsigned> IndexList; - IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "", InsertBefore); - inputVal = EI; - } - return inputVal; -} - -void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86, - ValueToValueMapTy &VMap,Instruction* IB) { - Function* CF = C->getFuncPointer(); - -// Function* CF_X86 = C->getGenFunc(); - Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET); - assert(CF_X86 != NULL - && "Found leaf node for which code generation has not happened yet!\n"); - assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) && - "The generated function to be called from x86 backend is not an x86 function\n"); - DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); - - std::vector<Value*> Args; - // Create argument list to pass to call instruction - // First find the correct values using the edges - // The remaing six values are inserted as constants for now. - for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { - Args.push_back(getInValueAt(C, i, F_X86, IB)); - } - - Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0); - for(unsigned j=0; j<6; j++) - Args.push_back(I64Zero); - - errs() << "Gen Function type: " << *CF_X86->getType() << "\n"; - errs() << "Node Function type: " << *CF->getType() << "\n"; - errs() << "Arguments: " << Args.size() << "\n"; - - // Call the F_X86 function associated with this node - CallInst* CI = CallInst::Create(CF_X86, Args, - CF_X86->getName()+"_output", - IB); - DEBUG(errs() << *CI << "\n"); - OutputMap[C] = CI; - - // Find num of dimensions this node is replicated in. - // Based on number of dimensions, insert loop instructions - std::string varNames[3] = {"x", "y", "z"}; - unsigned numArgs = CI->getNumArgOperands(); - for(unsigned j=0; j < C->getNumOfDim(); j++) { - Value* indexLimit = NULL; - // Limit can either be a constant or an arguement of the internal node. - // In case of constant we can use that constant value directly in the - // new F_X86 function. In case of an argument, we need to get the mapped - // value using VMap - if(isa<Constant>(C->getDimLimits()[j])) { - indexLimit = C->getDimLimits()[j]; - DEBUG(errs() << "In Constant case:\n" - << " indexLimit type = " << *indexLimit->getType() << "\n"); - } - else { - indexLimit = VMap[C->getDimLimits()[j]]; - DEBUG(errs() << "In VMap case:" - <<" indexLimit type = " << *indexLimit->getType() << "\n"); - } - assert(indexLimit && "Invalid dimension limit!"); - // Insert loop - Value* indexVar = addLoop(CI, indexLimit, varNames[j]); - DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n"); - // Insert index variable and limit arguments - CI->setArgOperand(numArgs-6+j, indexVar); - CI->setArgOperand(numArgs-3+j, indexLimit); - } - // Insert call to runtime to push the dim limits and instanceID on the depth - // stack - Value* args[] = { - ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim - CI->getArgOperand(numArgs-3+0), // limitX - CI->getArgOperand(numArgs-6+0), // iX - CI->getArgOperand(numArgs-3+1), // limitY - CI->getArgOperand(numArgs-6+1), // iY - CI->getArgOperand(numArgs-3+2), // limitZ - CI->getArgOperand(numArgs-6+2) // iZ - }; - - CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI); - DEBUG(errs() << "Push on stack: " << *Push << "\n"); - // Insert call to runtime to pop the dim limits and instanceID from the depth - // stack - BasicBlock::iterator i(CI); - ++i; - Instruction* NextI = &*i; - // Next Instruction should also belong to the same basic block as the basic - // block will have a terminator instruction - assert(NextI->getParent() == CI->getParent() - && "Next Instruction should also belong to the same basic block!"); - - CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI); - DEBUG(errs() << "Pop from stack: " << *Pop << "\n"); - DEBUG(errs() << *CI->getParent()->getParent()); -} - -/* This function takes a DFNode, and creates a filter function for it. By filter - * function we mean a function which keeps on getting input from input buffers, - * applying the function on the inputs and then pushes data on output buffers - */ -// Create a function with void* (void*) type. -// Create a new basic block -// Add a return instruction to the basic block -// extract arguments from the aggregate data input. Type list would be -// Replace the streaming inputs with i8* types signifying handle to -// corresponding buffers -// Add a boolean argument isLastInput -// Add runtime API calls to get input for each of the streaming inputs -// Add a call to the generated function of the child node -// Add runtime API calls to push output for each of the streaming outputs -// Add loop around the basic block, which exits the loop if isLastInput is false - -Function* CGT_X86::createFunctionFilter(DFNode* C) { - DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n"); - - /* Create a function with same argument list as child.*/ - DEBUG(errs() << "\tCreate a function with the same argument list as child\n"); - // Get the generated function for child node - Function* CF = C->getFuncPointer(); - // Create Filter Function of type i8*(i8*) which calls the root function - Type* i8Ty = Type::getInt8Ty(M.getContext()); - FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(), - ArrayRef<Type*>(i8Ty->getPointerTo()), - false); - Function* CF_Pipeline = Function::Create(CF_PipelineTy, - CF->getLinkage(), - CF->getName()+"_Pipeline", - &M); - DEBUG(errs() << "Generating Pipline Function\n"); - // Give a name to the argument which is used pass data to this thread - Value* data = &*CF_Pipeline->arg_begin(); - data->setName("data.addr"); - // Create a new basic block - DEBUG(errs() << "\tCreate new BB and add a return function\n"); - // Add a basic block to this empty function - BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline); - // Add a return instruction to the basic block - ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(), - UndefValue::get(CF_Pipeline->getReturnType()), BB); - - - /* Extract the elements from the aggregate argument to the function. - * Replace the streaming inputs with i8* types signifying handle to - * corresponding buffers - * Add outputs to the list as well - * Add isLastInput to the list - */ - DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n"); - // These Args will be used when passing arguments to the generated function - // inside loop, and reading outputs as well. - std::vector<Value*> Args; - std::vector<Type*> TyList; - std::vector<std::string> names; - // Adding inputs - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); - i != e; ++i) { - if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { - TyList.push_back(i8Ty->getPointerTo()); - names.push_back((Twine(i->getName())+"_buffer").str()); - } - else { - TyList.push_back(i->getType()); - names.push_back(i->getName()); - } - } - // Adding outputs. FIXME: Since we assume all outputs to be streaming edges, - // because we get there buffer handles - StructType* RetTy = cast<StructType>(CF->getReturnType()); - for (unsigned i=0; i<RetTy->getNumElements(); i++) { - TyList.push_back(i8Ty->getPointerTo()); - names.push_back("out"); - } - /* Add a boolean argument isLastInput */ - DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n"); - TyList.push_back(i8Ty->getPointerTo()); - names.push_back("isLastInput_buffer"); - - // Extract the inputs, outputs and - Args = extractElements(data, TyList, names, RI); - for(unsigned i=0; i<Args.size(); i++) { - DEBUG(errs() << *Args[i] << "\n"); - } - - // Split the Args vector into, input output and isLastInput - unsigned numInputs = CF->getFunctionType()->getNumParams(); - unsigned numOutputs = RetTy->getNumElements(); - std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs); - std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs); - Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]); - - /* Add runtime API calls to get input for each of the streaming input edges */ - DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n"); - // First read the termination condition variable islastInput - CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop, - ArrayRef<Value*>(isLastInput), - "", - RI); - - CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop, - Type::getInt64Ty(CF_Pipeline->getContext()), - false, - "isLastInput", - RI); - isLastInput = BI; - // Create a loop termination condition - CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, - isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero", - RI); - - // Get input from buffers of all the incoming streaming edges - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); - i != e; ++i) { - if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) { - CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop, - ArrayRef<Value*>(InputArgs[i->getArgNo()]), - "", - RI); - CastInst* BI; - if(i->getType()->isPointerTy()) { - BI = CastInst::Create(CastInst::IntToPtr, - bufferIn, - i->getType(), - i->getName()+".addr", - RI); - } - else if(i->getType()->isFloatTy()) { - BI = CastInst::CreateFPCast(bufferIn, - i->getType(), - i->getName()+".addr", - RI); - } - else { - BI = CastInst::CreateIntegerCast(bufferIn, - i->getType(), - false, - i->getName()+".addr", - RI); - } - // Replace the argument in Args vector. We would be using the vector as - // parameters passed to the call - InputArgs[i->getArgNo()] = BI; - } - } - /* Add a call to the generated function of the child node */ - DEBUG(errs() << "\tAdd a call to the generated function of the child node\n"); -// DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n"); -// CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs, -// C->getGenFunc()->getName()+".output", RI); - Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET); - DEBUG(errs() << "Type: " - << *CGenF->getType() - << "\n"); - CallInst* CI = CallInst::Create(CGenF, - InputArgs, - CGenF->getName()+".output", - RI); - - /* Add runtime API calls to push output for each of the streaming outputs */ - // FIXME: Assumption - // All edges between siblings are streaming edges - DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n"); - for (unsigned i=0; i< numOutputs; i++) { - // Extract output - ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), - "",RI); - // Convert to i64 - CastInst* BI; - if(EI->getType()->isPointerTy()) - BI = CastInst::Create(CastInst::PtrToInt,EI, - Type::getInt64Ty(CF_Pipeline->getContext()), - "", - RI); - else - BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()), - false, "", RI); - // Push to Output buffer - Value* bufferOutArgs[] = {OutputArgs[i], BI}; - CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush, - ArrayRef<Value*>(bufferOutArgs, 2), - "", - RI); - } - - // Add loop around the basic block, which exits the loop if isLastInput is false - //addDoWhileLoop(cast<Instruction>(Cond)->getNextNode(), RI, Cond); -// addWhileLoop(cast<Instruction>(isLastInputPop), cast<Instruction>(Cond)->getNextNode(), -// RI, Cond); - - // Add loop around the basic block, which exits the loop if isLastInput is false - // Pointers to keep the created loop structure - BasicBlock *EntryBB, *CondBB, *BodyBB; - Instruction *CondStartI = cast<Instruction>(isLastInputPop); - Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode(); - EntryBB = CondStartI->getParent(); - - addWhileLoop(CondStartI, BodyStartI, RI, Cond); - CondBB = CondStartI->getParent(); - BodyBB = CI->getParent(); - Instruction *CntI = NULL; - CallInst *GetPolicyCI = get_llvm_visc_policy_getVersion_call(CGenF); - - // If the node function calls the visc runtime call to get policy, we update - // it with the counter information. This means we need to pass an additional - // argument to the generated function, that is the iteration number, and then - // use it as an argument to the policy_getVersion call - if (GetPolicyCI) { - CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB); - assert(CntI && "Counter instruction not found\n"); - - // Create new function type (with additional argument for iteration number) - Type *NewRetTy = CGenF->getFunctionType()->getReturnType(); - std::vector<Type*> NewArgTypes; - for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end(); - ai != ae ; ++ai) { - NewArgTypes.push_back(ai->getType()); - } - NewArgTypes.push_back(Type::getInt64Ty(M.getContext())); - FunctionType *NewFT = FunctionType::get(NewRetTy, NewArgTypes, false); - Function *NewCGenF = viscUtils::cloneFunction(CGenF, NewFT, false); - // At least one (the last) argument exists (we added it) - Function::arg_iterator ae = NewCGenF->arg_end(); - --ae; - Argument *CntArg = &*ae; - CntArg->setName("iteration"); - // Replace the old cpu gen func with this one - C->addGenFunc(NewCGenF, visc::CPU_TARGET, true); - - // Add counter to the actual parameter list, to create the new call - InputArgs.push_back(CntI); - CallInst* newCI = CallInst::Create(NewCGenF, - InputArgs, - NewCGenF->getName()+".output"); - ReplaceInstWithInst(CI, newCI); - - // Set second operand of the policy_getVersion call to the last function - // argument - GetPolicyCI = get_llvm_visc_policy_getVersion_call(NewCGenF); - GetPolicyCI->setArgOperand(1, CntArg); - } - - // Return the Function pointer - DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n"); - DEBUG(errs() << *CF_Pipeline << "\n"); - return CF_Pipeline; -} - -void CGT_X86::codeGen(DFInternalNode* N) { - // Check if N is root node and its graph is streaming. We do not do codeGen - // for Root in such a case - if(N->isRoot() && N->isChildGraphStreaming()) - return; - - // Check if clone already exists. If it does, it means we have visited this - // function before and nothing else needs to be done for this leaf node. -// if(N->getGenFunc() != NULL) -// return; - if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { - errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << - " : skipping it\n"; - return; - } - - assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && - "Error: Visiting a node for which code already generated\n"); - - // Sort children in topological order before code generation - N->getChildGraph()->sortChildren(); - - // Only process if all children have a CPU x86 function - // Otherwise skip to end - bool codeGen = true; - for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); ci != ce; ++ci) { - DFNode* C = *ci; - // Skip dummy node call - if (C->isDummyNode()) - continue; - - if (!(C->hasX86GenFuncForTarget(visc::CPU_TARGET))) { - errs() << "No CPU x86 version for child node " - << C->getFuncPointer()->getName() - << "\n Skip code gen for parent node " - << N->getFuncPointer()->getName() << "\n"; - codeGen = false; - } - } - - if (codeGen) { - Function* F = N->getFuncPointer(); - // Create of clone of F with no instructions. Only the type is the same as F - // without the extra arguments. - Function* F_X86; - - // Clone the function, if we are seeing this function for the first time. We - // only need a clone in terms of type. - ValueToValueMapTy VMap; - - // Create new function with the same type - F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); - - // Loop over the arguments, copying the names of arguments over. - Function::arg_iterator dest_iterator = F_X86->arg_begin(); - for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); - i != e; ++i) { - dest_iterator->setName(i->getName()); // Copy the name over... - // Increment dest iterator - ++dest_iterator; - } - - // Add a basic block to this empty function - BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86); - ReturnInst* RI = ReturnInst::Create(F_X86->getContext(), - UndefValue::get(F_X86->getReturnType()), BB); - - // Add Index and Dim arguments except for the root node and the child graph of - // parent node is not streaming - if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) - F_X86 = addIdxDimArgs(F_X86); - - BB = &*F_X86->begin(); - RI = cast<ReturnInst>(BB->getTerminator()); - - //Add generated function info to DFNode -// N->setGenFunc(F_X86, visc::CPU_TARGET); - N->addGenFunc(F_X86, visc::CPU_TARGET, true); - - // Loop over the arguments, to create the VMap. - dest_iterator = F_X86->arg_begin(); - for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(); - i != e; ++i) { - // Add mapping and increment dest iterator - VMap[&*i] = &*dest_iterator; - ++dest_iterator; - } - - // Iterate over children in topological order - for(DFGraph::children_iterator ci = N->getChildGraph()->begin(), - ce = N->getChildGraph()->end(); ci != ce; ++ci) { - DFNode* C = *ci; - // Skip dummy node call - if (C->isDummyNode()) - continue; - - // Create calls to CPU function of child node - invokeChild_X86(C, F_X86, VMap, RI); - - } - - DEBUG(errs() << "*** Generating epilogue code for the function****\n"); - // Generate code for output bindings - // Get Exit node - DFNode* C = N->getChildGraph()->getExit(); - // Get OutputType of this node - StructType* OutTy = N->getOutputType(); - Value *retVal = UndefValue::get(F_X86->getReturnType()); - // Find all the input edges to exit node - for (unsigned i=0; i < OutTy->getNumElements(); i++) { - DEBUG(errs() << "Output Edge " << i << "\n"); - // Find the incoming edge at the requested input port - DFEdge* E = C->getInDFEdgeAt(i); - - assert(E && "No Binding for output element!"); - // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); - - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); - - // If Source DFNode is a dummyNode, edge is from parent. Get the - // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { - inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { - // edge is from a internal node - // Check - code should already be generated for this source dfnode - assert(OutputMap.count(SrcDF) - && "Source node call not found. Dependency violation!"); - - // Find Output Value associated with the Source DFNode using OutputMap - Value* CI = OutputMap[SrcDF]; - - // Extract element at source position from this call instruction - std::vector<unsigned> IndexList; - IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "",RI); - inputVal = EI; - } - std::vector<unsigned> IdxList; - IdxList.push_back(i); - retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); - } - DEBUG(errs() << "Extracted all\n"); - retVal->setName("output"); - ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); - ReplaceInstWithInst(RI, newRI); - - } - - //-------------------------------------------------------------------------// - // Here, we need to check if this node (N) has more than one versions - // If so, we query the policy and have a call to each version - // If not, we see which version exists, check that it is in fact an x86 - // function and save it as the CPU_TARGET function - - // TODO: visc_id per node, so we can use this for id for policies - // For now, use node function name and change it later - Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); - Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); - Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); - - bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); - bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); - - errs() << "Node: " << N->getFuncPointer()->getName() - << " with tag " << N->getTag() << "\n"; - errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; - errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; - errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; - errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; - errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n"; - errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; - - - if (N->getTag() == visc::None) { - // No code is available for this node. This (usually) means that this - // node is a node that - // - from the accelerator backends has been mapped to an intermediate - // node, and thus they have not produced a genFunc - // - a child node had no CPU hint, thus no code gen for CPU could - // take place - errs() << "No GenFunc - Skipping CPU code generation for node " - << N->getFuncPointer()->getName() << "\n"; - } else if (viscUtils::isSingleTargetTag(N->getTag())) { - // There is a single version for this node according to code gen hints. - // Therefore, we do not need to check the policy, we simply use the - // available implementation, whichever target it is for. - - // Sanity check - to be removed TODO - switch (N->getTag()) { - case visc::CPU_TARGET: - assert(N->getGenFuncForTarget(visc::CPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && ""); - assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); - assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); - break; - case visc::GPU_TARGET: - assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(N->getGenFuncForTarget(visc::GPU_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && ""); - assert(!(N->getGenFuncForTarget(visc::SPIR_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); - break; - case visc::SPIR_TARGET: - assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && ""); - assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && ""); - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); - assert(N->getGenFuncForTarget(visc::SPIR_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::SPIR_TARGET) && ""); - break; - default: - assert(false && "Unreachable: we checked that tag was single target!\n"); - break; - } - - // If device abstraction is enabled, then we may need to edit the node - // function. In case this is a GPU or SPIR gen func, we issue a call to - // the runtime that waits for the device to be available - if (DeviceAbstraction) { - Function *NodeGenFunc = NULL; - switch (N->getTag()) { - case visc::GPU_TARGET: - NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET); - break; - case visc::SPIR_TARGET: - NodeGenFunc = N->getGenFuncForTarget(visc::SPIR_TARGET); - break; - default: - break; - } - - if (NodeGenFunc) { - // If we found a function to edit, we add the call to the runtime as - // its first statement - BasicBlock *BB = &*NodeGenFunc->begin(); - std::vector<Value *> Args; // TODO: add the device type as argument? - Function *RTF = - cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); - CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI()); - } - - } - - Function *Ftmp = N->getGenFuncForTarget(N->getTag()); - N->removeGenFuncForTarget(visc::GPU_TARGET); - N->removeGenFuncForTarget(visc::SPIR_TARGET); - N->setTag(visc::None); - N->addGenFunc(Ftmp, visc::CPU_TARGET, true); - N->setTag(visc::CPU_TARGET); - - // Sanity checks - to be removed TODO - CF = N->getGenFuncForTarget(visc::CPU_TARGET); - GF = N->getGenFuncForTarget(visc::GPU_TARGET); - SF = N->getGenFuncForTarget(visc::SPIR_TARGET); - - CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); - SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); - - errs() << "After editing\n"; - errs() << "Node: " << N->getFuncPointer()->getName() - << " with tag " << N->getTag() << "\n"; - errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n"; - errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n"; - errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n"; - errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n"; - errs() << "SPIR Fun: " << (SF ? SF->getName() : "null" ) << "\n"; - errs() << "hasx86GenFuncForSPIR : " << SFx86 << "\n"; - - // assert(false && "got to the point where we have to select\n"); - } else { - // We have more than one targets - - errs() << "Node Name (for policy) : " - << N->getFuncPointer()->getName() << "\n"; - - Function *CF = N->getGenFuncForTarget(visc::CPU_TARGET); - Function *GF = N->getGenFuncForTarget(visc::GPU_TARGET); - Function *SF = N->getGenFuncForTarget(visc::SPIR_TARGET); - - bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET); - bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET); - bool SFx86 = N->hasX86GenFuncForTarget(visc::SPIR_TARGET); - - // These assertions express what we can support with the current runtime. - // Code generation works the same way even for other target combinations. - // For now, we want either CPU and GPU, or CPU and SPIR - assert((CF && (GF && !SF || !GF && SF)) && "Invalid target selection\n"); - assert((CFx86 && (GFx86 && !SFx86 || !GFx86 && SFx86)) && - "Generated functions without appropriate x86 wrapper\n"); - - FunctionType *FT = CF->getFunctionType(); - if (GF) - assert(FT == GF->getFunctionType() && - "Type mismatch between generated functions for GPU and CPU targets.\n"); - if (SF) - assert(FT == SF->getFunctionType() && - "Type mismatch between generated functions for SPIR and CPU targets.\n"); - - // Code generation of wrapper function - Function *F_wrapper; - ValueToValueMapTy VMap; - F_wrapper = Function::Create(FT, CF->getLinkage(), CF->getName()+"_wrapper", &M); - - // Copy argument names over - Function::arg_iterator dest_iterator = F_wrapper->arg_begin(); - for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); - i != e; ++i) { - dest_iterator->setName(i->getName()); - VMap[&*i] = &*dest_iterator; - ++dest_iterator; - } - // Gather all arguments of wrapper in a vector, to prepare the call to - // the individual gen functions - std::vector<Value *> GenFuncCallArgs; - for (Function::arg_iterator i = F_wrapper->arg_begin(), e = F_wrapper->arg_end(); - i != e; ++i) { - GenFuncCallArgs.push_back(&*i); - } - - BasicBlock *BBcurrent, *BBtrue, *BBfalse; - - BBcurrent = BasicBlock::Create(M.getContext(), "entry", F_wrapper); - - StringRef FName = N->getFuncPointer()->getName(); - size_t nameSize = FName.size()+1; - std::vector<Constant *> NameV; - for (char c: FName) { - NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), c)); - } - NameV.push_back(ConstantInt::get(Type::getInt8Ty(M.getContext()), '\0')); - ArrayType *NameType = - ArrayType::get(IntegerType::get(M.getContext(), 8), nameSize); - AllocaInst *AI = new AllocaInst(NameType, nullptr, "", BBcurrent); - Constant *NameConst = ConstantArray::get(NameType, NameV); - StoreInst *StI = new StoreInst(NameConst, AI, BBcurrent); - CastInst *BI = BitCastInst::CreatePointerCast(AI, - Type::getInt8PtrTy(M.getContext()), "", BBcurrent); - std::vector<Value *> Args; - Args.push_back(BI); - Args.push_back(ConstantInt::get(Type::getInt64Ty(M.getContext()), -1, true)); - Function *RTF = - cast<Function>(M.getOrInsertFunction("llvm_visc_policy_getVersion", - runtimeModule->getFunction("llvm_visc_policy_getVersion")->getFunctionType())); - CallInst *RTFInst = CallInst::Create(RTF, Args, "", BBcurrent); - - ConstantInt *CmpConst = - ConstantInt::get(Type::getInt32Ty(M.getContext()), 0, true); - CmpInst *CmpI = CmpInst::Create(Instruction::ICmp, - CmpInst::ICMP_EQ, - RTFInst, CmpConst, - "", BBcurrent); - - BBtrue = BasicBlock::Create(M.getContext(), "version_cpu", F_wrapper); - BBfalse = BasicBlock::Create(M.getContext(), "not_cpu", F_wrapper); - BranchInst *BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); - - CallInst *GenFuncCI = CallInst::Create(CF, GenFuncCallArgs, "", BBtrue); - ReturnInst *RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); - - // Switch basic block pointers - BBcurrent = BBfalse; - if (GF) { - // We have a GPU version. Generate policy check and call - CmpConst = - ConstantInt::get(Type::getInt32Ty(M.getContext()), 1, true); - CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - RTFInst, CmpConst, "", BBcurrent); - BBtrue = BasicBlock::Create(M.getContext(), "version_gpu", F_wrapper); - BBfalse = BasicBlock::Create(M.getContext(), "not_gpu", F_wrapper); - BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); - - GenFuncCI = CallInst::Create(GF, GenFuncCallArgs, "", BBtrue); - RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); - - if (DeviceAbstraction) { - // Prepare arguments and function for call to wait for device runtime call - std::vector<Value *> Args; // TODO: add the device type as argument? - Function *RTF = - cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); - CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); - } - } - - // Switch basic block pointers - BBcurrent = BBfalse; - if (SF) { - // We have a GPU version. Generate policy check and call - CmpConst = - ConstantInt::get(Type::getInt32Ty(M.getContext()), 2, true); - CmpI = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - RTFInst, CmpConst, "", BBcurrent); - BBtrue = BasicBlock::Create(M.getContext(), "version_spir", F_wrapper); - BBfalse = BasicBlock::Create(M.getContext(), "not_spir", F_wrapper); - BrI = BranchInst::Create(BBtrue, BBfalse, CmpI, BBcurrent); - - GenFuncCI = CallInst::Create(SF, GenFuncCallArgs, "", BBtrue); - RI = ReturnInst::Create(M.getContext(), GenFuncCI, BBtrue); - - if (DeviceAbstraction) { - // Prepare arguments and function for call to wait for device runtime call - std::vector<Value *> Args; // TODO: add the device type as argument? - Function *RTF = - cast<Function>(M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", - runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType())); - CallInst *RTFInst = CallInst::Create(RTF, Args, "", GenFuncCI); - } - } - - RI = ReturnInst::Create(M.getContext(), - UndefValue::get(FT->getReturnType()), BBfalse); - - // Now, make the node cpu gen func to be this one - // Remove all other versions and update the tag - N->addGenFunc(F_wrapper, visc::CPU_TARGET, true); - N->removeGenFuncForTarget(visc::GPU_TARGET); - N->removeGenFuncForTarget(visc::SPIR_TARGET); - N->setTag(visc::CPU_TARGET); - - // assert(false && "got to the point where we have to combine\n"); - } - -} - -// Code generation for leaf nodes -void CGT_X86::codeGen(DFLeafNode* N) { - // Skip code generation if it is a dummy node - if(N->isDummyNode()) { - DEBUG(errs() << "Skipping dummy node\n"); - return; - } - - // At this point, the X86 backend does not support code generation for - // the case where allocation node is used, so we skip. This means that a - // CPU version will not be created, and therefore code generation will - // only succeed if another backend (nvptx or spir) has been invoked to - // generate a node function for the node including the allocation node. - if (N->isAllocationNode()) { - DEBUG(errs() << "Skipping allocation node\n"); - return; - } - - // Check if clone already exists. If it does, it means we have visited this - // function before and nothing else needs to be done for this leaf node. -// if(N->getGenFunc() != NULL) -// return; - - if (!preferredTargetIncludes(N, visc::CPU_TARGET)) { - errs() << "No CPU hint for node " << N->getFuncPointer()->getName() << - " : skipping it\n"; - - errs() << "Check for cudnn or promise hint for node " - << N->getFuncPointer()->getName() << "\n"; - - switch (N->getTag()) { - case visc::CUDNN_TARGET: { - errs() << "CUDNN hint found. Store CUDNN function as CPU funtion.\n"; - // Make sure there is a generated x86 function for cudnn - assert(N->getGenFuncForTarget(visc::CUDNN_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::CUDNN_TARGET) && ""); - // Store the CUDNN x86 function as the CPU generated function - Function *Ftmp = N->getGenFuncForTarget(N->getTag()); - // after adding the required number of arguments - if (!N->getParent()->isChildGraphStreaming()) - Ftmp = addIdxDimArgs(Ftmp); - - N->removeGenFuncForTarget(visc::CUDNN_TARGET); - N->setTag(visc::None); - N->addGenFunc(Ftmp, visc::CPU_TARGET, true); - N->setTag(visc::CPU_TARGET); - break; - } - case visc::PROMISE_TARGET: { - errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n"; - // Make sure there is a generated x86 function for promise - assert(N->getGenFuncForTarget(visc::PROMISE_TARGET) && ""); - assert(N->hasX86GenFuncForTarget(visc::PROMISE_TARGET) && ""); - // Store the PROMISE x86 function as the CPU generated function - Function *Ftmp = N->getGenFuncForTarget(N->getTag()); - // after adding the required number of arguments - if (!N->getParent()->isChildGraphStreaming()) - Ftmp = addIdxDimArgs(Ftmp); - - N->setTag(visc::None); - N->removeGenFuncForTarget(visc::PROMISE_TARGET); - N->addGenFunc(Ftmp, visc::CPU_TARGET, true); - N->setTag(visc::CPU_TARGET); - break; - } - case visc::GPU_TARGET: - // A leaf node should not have an x86 function for GPU - // by design of DFG2LLVM_NVPTX backend - assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && ""); - break; - case visc::SPIR_TARGET: - // A leaf node should not have an x86 function for SPIR - // by design of DFG2LLVM_SPIR backend - assert(!(N->hasX86GenFuncForTarget(visc::SPIR_TARGET)) && ""); - break; - default: - break; - } - - return; - } - - assert(N->getGenFuncForTarget(visc::CPU_TARGET) == NULL && - "Error: Visiting a node for which code already generated\n"); - - std::vector<IntrinsicInst *> IItoRemove; - std::vector<std::pair<IntrinsicInst *, Value *> > IItoReplace; - BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; - - // Get the function associated woth the dataflow node - Function *F = N->getFuncPointer(); - - // Clone the function, if we are seeing this function for the first time. - Function *F_X86; - ValueToValueMapTy VMap; - F_X86 = CloneFunction(F, VMap); - F_X86->removeFromParent(); - // Insert the cloned function into the module - M.getFunctionList().push_back(F_X86); - - // Add the new argument to the argument list. Add arguments only if the cild - // graph of parent node is not streaming - if(!N->getParent()->isChildGraphStreaming()) - F_X86 = addIdxDimArgs(F_X86); - - // Add generated function info to DFNode -// N->setGenFunc(F_X86, visc::CPU_TARGET); - N->addGenFunc(F_X86, visc::CPU_TARGET, true); - - /*** FIXME: HACK FOR DSSOC DEMO -- BEGIN ***/ - /* This part of the code is meant to handle turning the CPU backend into an - "accelerator" backend for ApproxHPVM. For this reason, the HPVM runtime - needs to be essentially deactivated. */ - - /* We look into the leaf node's function for function call starting from - "tensor". These are functions with which we replaced the ApproxHPVM - intrinsics, and for which we have LLVM implementations. If found, it means - we are dealing with an AproxHPVM program. */ - bool isApproxHPVMnode = false; - for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { - Instruction *I = &(*i); - DEBUG(errs() << *I << "\n"); - - if (CallInst *CI = dyn_cast<CallInst>(I)) { - if ((CI->getCalledFunction()->getName()).startswith("tensor")) { - isApproxHPVMnode = true; - break; - } - } - } - - /*As in CUDNN backend, we remove the in out attributes of tensor operations, - aiming to deactivate the HPVM runtime calls. This has been tested through - CUDNN backend for the internal node codegen, and should ensure that code - does not insert llvm_visc_x86_argument_ptr in the generated function for - leaf node codegen as well. */ - - /* Removing HPVM in/out/inout function attributes */ - if (isApproxHPVMnode) { - for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); ai != ae; ai++) { - Argument *Arg = &*ai; - if(Arg->hasAttribute(Attribute::In)) - Arg->removeAttr(Attribute::In); - if(Arg->hasAttribute(Attribute::Out)) - Arg->removeAttr(Attribute::Out); - if(Arg->hasAttribute(Attribute::InOut)) - Arg->removeAttr(Attribute::InOut); - } - }else{ - printf("****** NO REMOVEAL *** \n\n"); - } - - /*** FIXME: HACK FOR DSSOC DEMO -- END ***/ - - // Go through the arguments, and any pointer arguments with in attribute need - // to have x86_argument_ptr call to get the x86 ptr of the argument - // Insert these calls in a new BB which would dominate all other BBs - // Create new BB - BasicBlock* EntryBB = &*F_X86->begin(); - BasicBlock* BB = BasicBlock::Create(M.getContext(), "getVISCPtrArgs", F_X86, EntryBB); - BranchInst* Terminator = BranchInst::Create(EntryBB, BB); - // Insert calls - for(Function::arg_iterator ai = F_X86->arg_begin(), ae = F_X86->arg_end(); - ai != ae; ++ai) { - if (F_X86->getAttributes().hasAttribute(ai->getArgNo()+1, Attribute::In)) { - assert(ai->getType()->isPointerTy() - && "Only pointer arguments can have visc in/out attributes "); - Function::arg_iterator aiNext = ai; - ++aiNext; - Argument* size = &*aiNext; - assert(size->getType() == Type::getInt64Ty(M.getContext()) - && "Next argument after a pointer should be an i64 type"); - CastInst* BI = BitCastInst::CreatePointerCast(&*ai, - Type::getInt8PtrTy(M.getContext()), - ai->getName()+".i8ptr", - Terminator); - Value* ArgPtrCallArgs[] = {BI, size}; - CallInst::Create(llvm_visc_x86_argument_ptr, - ArrayRef<Value*>(ArgPtrCallArgs, 2), - "", - Terminator); - - } - } - errs() << *BB << "\n"; - - // Go through all the instructions - for (inst_iterator i = inst_begin(F_X86), e = inst_end(F_X86); i != e; ++i) { - Instruction *I = &(*i); - DEBUG(errs() << *I << "\n"); - // Leaf nodes should not contain VISC graph intrinsics or launch - assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); - - if (BuildDFG::isViscQueryIntrinsic(I)) { - IntrinsicInst* II = cast<IntrinsicInst>(I); - IntrinsicInst* ArgII; - DFNode* ArgDFNode; - - /*********************************************************************** - * Handle VISC Query intrinsics * - ***********************************************************************/ - switch (II->getIntrinsicID()) { - /**************************** llvm.visc.getNode() *******************/ - case Intrinsic::visc_getNode: { - // add mapping <intrinsic, this node> to the node-specific map - Leaf_HandleToDFNodeMap[II] = N; - IItoRemove.push_back(II); - break; - } - /************************* llvm.visc.getParentNode() ****************/ - case Intrinsic::visc_getParentNode: { - // get the parent node of the arg node - // get argument node - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - // get the parent node of the arg node - ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; - // Add mapping <intrinsic, parent node> to the node-specific map - // the argument node must have been added to the map, orelse the - // code could not refer to it - Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); - IItoRemove.push_back(II); - break; - } - /*************************** llvm.visc.getNumDims() *****************/ - case Intrinsic::visc_getNumDims: { - // get node from map - // get the appropriate field - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim(); - IntegerType* IntTy = Type::getInt32Ty(M.getContext()); - ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); - - II->replaceAllUsesWith(numOfDimConstant); - IItoRemove.push_back(II); - break; - } - /*********************** llvm.visc.getNodeInstanceID() **************/ - case Intrinsic::visc_getNodeInstanceID_x: - case Intrinsic::visc_getNodeInstanceID_y: - case Intrinsic::visc_getNodeInstanceID_z: { - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; - - // The dfnode argument should be an ancestor of this leaf node or - // the leaf node itself - int parentLevel = N->getAncestorHops(ArgDFNode); - assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) - && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); - - // Get specified dimension - // (dim = 0) => x - // (dim = 1) => y - // (dim = 2) => z - int dim = (int) (II->getIntrinsicID() - - Intrinsic::visc_getNodeInstanceID_x); - assert((dim >= 0) && (dim < 3) - && "Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic ID!"); - - // For immediate ancestor, use the extra argument introduced in - // F_X86 - int numParamsF = F->getFunctionType()->getNumParams(); - int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); - assert((numParamsF_X86 - numParamsF == 6) - && "Difference of arguments between function and its clone is not 6!"); - - if(parentLevel == 0) { - // Case when the query is for this node itself - unsigned offset = 3 + (3-dim); - // Traverse argument list of F_X86 in reverse order to find the - // correct index or dim argument. - Argument* indexVal = getArgumentFromEnd(F_X86, offset); - assert(indexVal && "Index argument not found. Invalid offset!"); - - DEBUG(errs() << *II << " replaced with " << *indexVal << "\n"); - - II->replaceAllUsesWith(indexVal); - IItoRemove.push_back(II); - } - else { - // Case when query is for an ancestor - Value* args[] = { - ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), - ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) - }; - CallInst* CI = CallInst::Create(llvm_visc_x86_getDimInstance, - ArrayRef<Value*>(args, 2), - "nodeInstanceID", II); - DEBUG(errs() << *II << " replaced with " << *CI << "\n"); - II->replaceAllUsesWith(CI); - IItoRemove.push_back(II); - } - break; - } - /********************** llvm.visc.getNumNodeInstances() *************/ - case Intrinsic::visc_getNumNodeInstances_x: - case Intrinsic::visc_getNumNodeInstances_y: - case Intrinsic::visc_getNumNodeInstances_z: { - - ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); - ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; - - // The dfnode argument should be an ancestor of this leaf node or - // the leaf node itself - int parentLevel = N->getAncestorHops(ArgDFNode); - assert(( parentLevel >= 0 || ArgDFNode == (DFNode*)N ) - && "Invalid DFNode argument to getNodeInstanceID_[xyz]!"); - - // Get specified dimension - // (dim = 0) => x - // (dim = 1) => y - // (dim = 2) => z - int dim = (int) (II->getIntrinsicID() - - Intrinsic::visc_getNumNodeInstances_x); - assert((dim >= 0) && (dim < 3) - && "Invalid dimension for getNumNodeInstances_[xyz]. Check Intrinsic ID!"); - - // For immediate ancestor, use the extra argument introduced in - // F_X86 - int numParamsF = F->getFunctionType()->getNumParams(); - int numParamsF_X86 = F_X86->getFunctionType()->getNumParams(); - assert((numParamsF_X86 - numParamsF == 6) - && "Difference of arguments between function and its clone is not 6!"); - - if(parentLevel == 0) { - // Case when the query is for this node itself - unsigned offset = 3 - dim; - // Traverse argument list of F_X86 in reverse order to find the - // correct index or dim argument. - Argument* limitVal = getArgumentFromEnd(F_X86, offset); - assert(limitVal && "Limit argument not found. Invalid offset!"); - - DEBUG(errs() << *II << " replaced with " << *limitVal << "\n"); - - II->replaceAllUsesWith(limitVal); - IItoRemove.push_back(II); - } - else { - // Case when query is from the ancestor - Value* args[] = { - ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel), - ConstantInt::get(Type::getInt32Ty(II->getContext()), dim) - }; - CallInst* CI = CallInst::Create(llvm_visc_x86_getDimLimit, - ArrayRef<Value*>(args, 2), - "numNodeInstances", II); - DEBUG(errs() << *II << " replaced with " << *CI << "\n"); - II->replaceAllUsesWith(CI); - IItoRemove.push_back(II); - } - - break; - } - default: - DEBUG(errs() << "Found unknown intrinsic with ID = " << - II->getIntrinsicID() << "\n"); - assert(false && "Unknown VISC Intrinsic!"); - break; - } - - } else { - //TODO: how to handle address space qualifiers in load/store - } - - } - - //TODO: - // When to replace the uses? - // In which order is it safe to replace the instructions in - // IItoReplace? - // Probably in the reverse order in the vectors - // It is a good idea to have them in one vector and chech the type - // using dyn_cast in order to determine if we replace with inst or value - - - //TODO: maybe leave these instructions to be removed by a later DCE pass - for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin(); - i != IItoRemove.end(); ++i) { - (*i)->replaceAllUsesWith(UndefValue::get((*i)->getType())); - (*i)->eraseFromParent(); - } - - DEBUG(errs() << *F_X86); -} - -} // End of namespace - -char DFG2LLVM_X86::ID = 0; -static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86-dsoc", - "Dataflow Graph to LLVM for X86 backend (DSOCC version)", - false /* does not modify the CFG */, - true /* transformation, not just analysis */); - diff --git a/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt b/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt deleted file mode 100644 index a6c4de9537..0000000000 --- a/lib/DFG2LLVM_X86_dsoc/LLVMBuild.txt +++ /dev/null @@ -1,22 +0,0 @@ -;===- ./lib/Transforms/DFG2LLVM_NVPTX/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = DFG2LLVM_X86_dsoc -parent = Transforms - -- GitLab