diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h index 2d935b3c5f3190f2341dd5ac39dfb81eef0f7074..6093fe9297701038396e8c6b366efd3fadb981ed 100644 --- a/llvm/include/llvm/IR/DFGraph.h +++ b/llvm/include/llvm/IR/DFGraph.h @@ -154,6 +154,11 @@ public: InternalNode, LeafNode }; + // To identify for which backend code has been generated + enum CodeGenTag { + X86, + PTX + }; private: typedef std::vector<DFNode*> DFNodeListType; @@ -162,6 +167,7 @@ private: // Important things that make up a Dataflow Node IntrinsicInst* II; ///< Associated IntrinsicInst/Value Function* FuncPointer; ///< Associated Function + Function* GenFunc; ///< Associated Function generated by backend DFInternalNode* Parent; ///< Pointer to parent dataflow Node unsigned NumOfDim; ///< Number of dimensions std::vector<Value*> DimLimits; ///< Number of instances in each dimension @@ -179,6 +185,8 @@ private: ///< hierarchy int Rank; ///< Ordering based on toplogical sort const DFNodeKind Kind; ///< Kind of Node Internal/Leaf + CodeGenTag Tag; // Code Generated for which backend + public: // Iterators @@ -247,7 +255,7 @@ public: DFNodeKind getKind() const { return Kind; } - + DFNode(IntrinsicInst* _II, Function* _FuncPointer, DFInternalNode* _Parent, unsigned _NumOfDim, std::vector<Value*> _DimLimits, DFNodeKind _K); @@ -302,6 +310,23 @@ public: return Rank; } + void setTag(CodeGenTag T) { + Tag = T; + } + + CodeGenTag getTag() const { + return Tag; + } + + void setGenFunc(Function* F, CodeGenTag T) { + GenFunc = F; + Tag = T; + } + + Function* getGenFunc() { + return GenFunc; + } + bool isDummyNode() { return isEntryNode() || isExitNode(); } diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td index ed848f99b5b4f18967fe779afe249657c95dad1c..62aeb6f32b7216e800e531ea15ee25b58aa8af62 100644 --- a/llvm/include/llvm/IR/IntrinsicsVISC.td +++ b/llvm/include/llvm/IR/IntrinsicsVISC.td @@ -18,15 +18,15 @@ let TargetPrefix = "visc" in { */ /* Launch intrinsic - - * i32 llvm.visc.launch(graphID*, function* , ArgList*); + * i8* llvm.visc.launch(function* , ArgList*); */ - def int_visc_launch : Intrinsic<[llvm_i32_ty], [llvm_ptrptr_ty, llvm_ptr_ty, + def int_visc_launch : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_ptr_ty], []>; /* Wait intrinsic - - * i32 llvm.visc.wait(graphID*, returnVal*); + * void llvm.visc.wait(graphID*, returnVal*); */ - def int_visc_wait : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], []>; + def int_visc_wait : Intrinsic<[], [llvm_ptr_ty], []>; /* Create Node intrinsic - * i8* llvm.visc.createNode(function*); diff --git a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp index 466bc41e14dee6d7d4ba68aee643495a3ebbb7f2..c236e5094d592b40e723898dc7b1e19be4fc3627 100644 --- a/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp +++ b/llvm/lib/Transforms/BuildDFG/BuildDFG.cpp @@ -49,7 +49,7 @@ bool BuildDFG::runOnModule(Module &M) { // Intrinsic Instruction has been initialized from this point on. - Function* F = cast<Function>((II->getOperand(1))->stripPointerCasts()); + Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts()); Root = DFInternalNode::Create(II, F); BuildGraph(Root, F); diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp index 3190876910b96862cc55e4e75fde54f8dad3f3c6..05869fa0dfca5b742d828f330e2f17fd21b28aa6 100644 --- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp +++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp @@ -65,7 +65,7 @@ public: virtual void visit(DFInternalNode* N) { // Follows a bottom-up approach for code generation. // First generate code for all the child nodes - errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n"; + DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n"); for(DFGraph::children_iterator i = N->getChildGraph()->begin(), e = N->getChildGraph()->end(); i != e; ++i) { DFNode* child = *i; @@ -78,7 +78,7 @@ public: } virtual void visit(DFLeafNode* N) { - errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n"; + DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n"); deleteNode(N); //errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n"; } diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 8c111a64b54b024d584e11b9f1793c391d1cfc20..71de15d38546b00fa46c0dc11888a564ee5bef76 100644 --- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -15,6 +15,11 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/BuildDFG/BuildDFG.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Constant.h" using namespace llvm; using namespace builddfg; @@ -56,19 +61,44 @@ private: // we already have an index and dim extended function copy or not (i.e., // "Have we visited this function before?") ValueMap<Function*, Function*> FMap; - DenseMap<DFNode*, CallInst*> CallMap; + DenseMap<DFNode*, Value*> OutputMap; + + // VISC Runtime API + Module* runtimeModule; + Constant* llvm_visc_x86_launch; + Constant* llvm_visc_x86_wait; + Constant* llvm_visc_ptx_launch; + Constant* llvm_visc_ptx_wait; + Constant* llvm_visc_ptx_initContext; + Constant* llvm_visc_ptx_input_scalar; + Constant* llvm_visc_ptx_input_ptr; + Constant* llvm_visc_ptx_output_ptr; + Constant* llvm_visc_ptx_getOutput; + Constant* llvm_visc_ptx_executeNode; + FunctionType* AppFuncTy; + //Functions + void initRuntimeAPI(); + std::vector<IntrinsicInst*>* getWaitList(Value* LI); void addIdxDimArgs(Function* F); Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = ""); Argument* getArgumentFromEnd(Function* F, unsigned offset); Argument* getArgumentAt(Function* F, unsigned offset); + Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, + Instruction* InsertBefore); + void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); + void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap, + Instruction* InsertBefore); void codeGenLaunch(DFInternalNode* Root); void codeGen(DFInternalNode* N); void codeGen(DFLeafNode* N); public: // Constructor - CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { } + CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { + initRuntimeAPI(); + } virtual void visit(DFInternalNode* N) { // Follows a bottom-up approach for code generation. @@ -113,37 +143,75 @@ bool DFG2LLVM_X86::runOnModule(Module &M) { return true; } +// Initialize the VISC runtime API. This makes it easier to insert these calls +void CodeGenTraversal::initRuntimeAPI() { + + // Load Runtime API Module + SMDiagnostic Err; + runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext()); + if(runtimeModule == NULL) + DEBUG(errs() << Err.getMessage()); + else + DEBUG(errs() << "Successfully loaded visc-rt API module\n"); + + // Get or insert the global declarations for launch/wait functions + llvm_visc_x86_launch = M.getOrInsertFunction("llvm_visc_x86_launch", + runtimeModule->getFunction("llvm_visc_x86_launch")->getFunctionType()); + DEBUG(errs() << *llvm_visc_x86_launch); + + llvm_visc_x86_wait = M.getOrInsertFunction("llvm_visc_x86_wait", + runtimeModule->getFunction("llvm_visc_x86_wait")->getFunctionType()); + DEBUG(errs() << *llvm_visc_x86_wait); + + llvm_visc_ptx_launch = M.getOrInsertFunction("llvm_visc_ptx_launch", + runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_launch); + + llvm_visc_ptx_wait = M.getOrInsertFunction("llvm_visc_ptx_wait", + runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_wait); + + llvm_visc_ptx_initContext = M.getOrInsertFunction("llvm_visc_ptx_initContext" , + runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_initContext); + + llvm_visc_ptx_input_scalar = M.getOrInsertFunction("llvm_visc_ptx_input_scalar", + runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_input_scalar); + + llvm_visc_ptx_input_ptr = M.getOrInsertFunction("llvm_visc_ptx_input_ptr", + runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_input_ptr); + + llvm_visc_ptx_output_ptr = M.getOrInsertFunction("llvm_visc_ptx_output_ptr", + runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_output_ptr); + + llvm_visc_ptx_getOutput = M.getOrInsertFunction("llvm_visc_ptx_getOutput", + runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_getOutput); + + llvm_visc_ptx_executeNode = M.getOrInsertFunction("llvm_visc_ptx_executeNode", + runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType()); + DEBUG(errs() << *llvm_visc_ptx_executeNode); + +} + /* Returns vector of all wait instructions */ -std::vector<CallInst*>* getWaitList(CallInst* LI) { - Value* GraphIDAddr = LI->getArgOperand(0); - std::vector<CallInst*>* WaitList = new std::vector<CallInst*>(); +std::vector<IntrinsicInst*>* CodeGenTraversal::getWaitList(Value* GraphID) { + std::vector<IntrinsicInst*>* WaitList = new std::vector<IntrinsicInst*>(); // It must have been loaded from memory somewhere - GraphIDAddr->use_begin(); - for(Value::use_iterator ui = GraphIDAddr->use_begin(), - ue = GraphIDAddr->use_end(); ui!=ue; ++ui) { - if(LoadInst* LI = dyn_cast<LoadInst>(*ui)) { - DEBUG(errs() << *LI << "\n"); - for(Value::use_iterator i = LI->use_begin(), e = LI->use_end(); i!=e; ++i) { - if(CallInst* waitI = dyn_cast<CallInst>(*i)) { - DEBUG(errs() << *waitI << "\n"); - WaitList->push_back(waitI); - } - } + for(Value::use_iterator ui = GraphID->use_begin(), + ue = GraphID->use_end(); ui!=ue; ++ui) { + if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) { + assert(waitI->getIntrinsicID() == Intrinsic::visc_wait + && "GraphID can only be used by llvm.visc.wait intrinsic"); + WaitList->push_back(waitI); } - // If graphID memory address is used by another launch, then break - if(CallInst* CI = dyn_cast<CallInst>(*ui)) { - if(LI != CI) { - DEBUG(errs()<< "Warning: Overwriting graph ID in memory -- " << *CI << "\n" << *LI << "\n"); - break; - } - } - // If graphID in memory is overwritten using store, it's an error - if(StoreInst* SI =dyn_cast<StoreInst>(*ui)) { - assert(SI->getPointerOperand() == GraphIDAddr - && "Error: Do not manually write over graphID in memory!"); + else { + llvm_unreachable("Error: Operation on Graph ID not supported!\n"); } - } return WaitList; } @@ -258,60 +326,12 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { // TODO: Place an assert to check if the constant passed bu launch intrinsic // as the number of arguments to DFG is same as the number of arguments of the // root of DFG - + DEBUG(errs() << "Generating Launch Function\n"); // Get Launch Instruction IntrinsicInst* LI = Root->getInstruction(); + DEBUG(errs() << "Generating Launch Function\n"); - // Get frequently used types - Type* i64Ty = Type::getInt64Ty(LI->getContext()); - Type* i32Ty = Type::getInt32Ty(LI->getContext()); - Type* i8Ty = Type::getInt8Ty(LI->getContext()); - Type* voidTy = Type::getVoidTy(LI->getContext()); - - /* Get or Insert visc runtime utilities necessary to run DFG as a separate thread - * (1) llvm_visc_launch_x86 - * (2) llvm_visc_wait_x86 - */ - Type *GraphIDTy; - std::vector<Type*>Elements; - // PThreads use different attribute types for 32-bit and 64-bit machines - if(M.getPointerSize() == Module::Pointer64) { - GraphIDTy = i64Ty; - } - else { - GraphIDTy = i32Ty; - } - - FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), - ArrayRef<Type*>(i8Ty->getPointerTo()), - false); - // Argument types for llvm_visc_launch_x86 - Type* ArgTypesLaunch[] = {i8Ty->getPointerTo()->getPointerTo(), - AppFuncTy->getPointerTo(), - i8Ty->getPointerTo()}; - - // Construct FunctionType of llvm_visc_launch_x86 call - FunctionType* LaunchFuncTy = FunctionType::get(i32Ty, - ArrayRef<Type*>(ArgTypesLaunch, 3), - false); - - // Construct FunctionType for llvm_visc_wait_x86 call - FunctionType* WaitFuncTy = FunctionType::get(i32Ty, - ArrayRef<Type*>(i8Ty->getPointerTo()), - false); - - // Get or insert the global declarations for pthread functions - Constant* Launch = M.getOrInsertFunction("llvm_visc_launch_x86", LaunchFuncTy); - Constant* Wait = M.getOrInsertFunction("llvm_visc_wait_x86", WaitFuncTy); - - // Construct FunctionType for malloc call - FunctionType* MallocTy = FunctionType::get(i8Ty->getPointerTo(), - ArrayRef<Type*>(i64Ty), - false); - // Get or insert the global declaration for malloc call - Constant* Malloc = M.getOrInsertFunction("malloc", MallocTy); - - /* Now we have all the necessary global declarations necessary to generate the + /* Now we have all the necessary global declarations necessary to generate the * Launch function, pointer to which can be passed to pthread utils to execute * DFG. The Launch function has just one input: i8* data.addr * This is the address of the all the input data that needs to be passed to @@ -324,10 +344,15 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { * passed to pthread_exit call. */ // Create Launch Function of type i8*(i8*) which calls the root function + Type* i8Ty = Type::getInt8Ty(M.getContext()); + AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), + ArrayRef<Type*>(i8Ty->getPointerTo()), + false); Function* AppFunc = Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(), "LaunchDataflowGraph", &M); + DEBUG(errs() << "Generating Launch Function\n"); // Give a name to the argument which is used pass data to this thread Value* data = AppFunc->arg_begin(); data->setName("data.addr"); @@ -336,6 +361,8 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(), Constant::getNullValue(AppFunc->getReturnType()), BB); + + DEBUG(errs() << "Created Empty Launch Function\n"); // Find the X86 function generated for Root and Function* RootF_X86 = FMap[Root->getFuncPointer()]; // Generate a call to RootF_X86 with null parameters for now @@ -364,7 +391,7 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { // as no more arguments left // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, i64 1 // This essentially takes us to the next argument in memory - Constant* IntOne = ConstantInt::get(i64Ty, 1); + Constant* IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1); GetElementPtrInst* GEP = GetElementPtrInst::Create(BI, ArrayRef<Value*>(IntOne), "nextArg", @@ -374,9 +401,8 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { argNum++; data = GEP; } - + // Code for returning the output - Constant* SizeOf = ConstantExpr::getSizeOf(CI->getType()); CastInst* OutputAddrCast = CastInst::CreatePointerCast(data, CI->getType()->getPointerTo(), CI->getName()+".addr", @@ -387,21 +413,20 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { DEBUG(errs() << *AppFunc << "\n"); // Substitute launch intrinsic main - Value* LaunchInstArgs[] = {LI->getArgOperand(0), - AppFunc, - LI->getArgOperand(2)}; - CallInst* LaunchInst = CallInst::Create(Launch, - ArrayRef<Value*>(LaunchInstArgs,3), - "", LI); + Value* LaunchInstArgs[] = {AppFunc, + LI->getArgOperand(1)}; + CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch, + ArrayRef<Value*>(LaunchInstArgs,2), + "graph"+Root->getFuncPointer()->getName(), LI); //ReplaceInstWithInst(LI, LaunchInst); DEBUG(errs() << *LaunchInst << "\n"); // Replace all wait instructions with x86 specific wait instructions - std::vector<CallInst*>* WaitList = getWaitList(LaunchInst); + std::vector<IntrinsicInst*>* WaitList = getWaitList(LI); for(unsigned i=0; i < WaitList->size(); ++i) { - CallInst* waitI = WaitList->at(i); - CallInst* waitI_X86 = CallInst::Create(Wait, - ArrayRef<Value*>(waitI->getArgOperand(0)), + IntrinsicInst* waitI = WaitList->at(i); + CallInst* waitI_X86 = CallInst::Create(llvm_visc_x86_wait, + ArrayRef<Value*>(LaunchInst), ""); ReplaceInstWithInst(waitI, waitI_X86); DEBUG(errs() << *waitI_X86 << "\n"); @@ -409,6 +434,171 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { } +Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) { + // TODO: Assumption is that each input port of a node has just one + // incoming edge. May change later on. + + // Find the incoming edge at the requested input port + DFEdge* E = Child->getInDFEdgeAt(i); + assert(E && "No incoming edge or binding for input element!"); + // Find the Source DFNode associated with the incoming edge + DFNode* SrcDF = E->getSourceDF(); + + // If Source DFNode is a dummyNode, edge is from parent. Get the + // argument from argument list of this internal node + Value* inputVal; + if(SrcDF->isEntryNode()) { + inputVal = getArgumentAt(ParentF_X86, i); + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { + // edge is from a sibling + // Check - code should already be generated for this source dfnode + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); + + // Find CallInst associated with the Source DFNode using FMap + Value* CI = OutputMap[SrcDF]; + + // Extract element at source position from this call instruction + std::vector<unsigned> IndexList; + IndexList.push_back(E->getSourcePosition()); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "", InsertBefore); + inputVal = EI; + } + return inputVal; +} + +void CodeGenTraversal::invokeChild_X86(DFNode* C, Function* F_X86, + ValueToValueMapTy &VMap,Instruction* IB) { + Function* CF = C->getFuncPointer(); + + Function* CF_X86 = C->getGenFunc(); + DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n"); + assert(CF_X86 != NULL + && "Found leaf node for which code generation has not happened yet!"); + + std::vector<Value*> Args; + // Create argument list to pass to call instruction + // First find the correct values using the edges + // The remaing six values are inserted as constants for now. + for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { + Args.push_back(getInValueAt(C, i, F_X86, IB)); + } + + Value* I32Zero = ConstantInt::get(Type::getInt32Ty(F_X86->getContext()), 0); + for(unsigned j=0; j<6; j++) + Args.push_back(I32Zero); + + // Call the F_X86 function associated with this node + CallInst* CI = CallInst::Create(CF_X86, Args, + CF_X86->getName()+"_output", + IB); + DEBUG(errs() << *CI << "\n"); + OutputMap[C] = CI; + + // Find num of dimensions this node is replicated in. + // Based on number of dimensions, insert loop instructions + std::string varNames[3] = {"x", "y", "z"}; + for(unsigned j=0; j < C->getNumOfDim(); j++) { + Value* indexLimit; + // Limit can either be a constant or an arguement of the internal node. + // In case of constant we can use that constant value directly in the + // new F_X86 function. In case of an argument, we need to get the mapped + // value using VMap + if(isa<Constant>(C->getDimLimits()[j])) + indexLimit = C->getDimLimits()[j]; + else + indexLimit = VMap[C->getDimLimits()[j]]; + assert(indexLimit && "Invalid dimension limit!"); + // Insert loop + Value* indexVar = addLoop(CI, indexLimit, varNames[j]); + unsigned numArgs = CI->getNumArgOperands(); + // Insert index variable and limit arguments + CI->setArgOperand(numArgs-6+j, indexVar); + CI->setArgOperand(numArgs-3+j, indexLimit); + } + +} + +void CodeGenTraversal::invokeChild_PTX(DFNode* C, Function* F_X86, + ValueToValueMapTy &VMap, Instruction* IB) { + Function* CF = C->getFuncPointer(); + + //FIXME: A way to check if PTX code has been generated for this child node + /*assert(FMap.count(CF) + && "Found leaf node for which code generation has not happened yet!"); + */ + //assert(C->getTag() == DFNode::PTX && "Cannot generate GPU call for non PTX nodes"); + + // Initialize context + CallInst::Create(llvm_visc_ptx_initContext, None, "", IB); + + // Initialize command queue + // Filename = <DFNode function name>.nvptx.ll + Twine file = CF->getName() + ".nvptx.ll"; + DEBUG(errs() << file << "\n"); + Constant* filename = ConstantDataArray::get(M.getContext(), + ArrayRef<uint8_t>((uint8_t*)file.str().c_str(), file.str().length())); + + CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch, + ArrayRef<Value*>(filename), + "graph"+CF->getName(), + IB); + + // Iterate over the required input edges of the node and use the visc-rt API + // to set inputs + for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { + + Value* inputVal = getInValueAt(C, i, F_X86, IB); + // input value has been obtained. + // Check if input is a scalar value or a pointer operand + // For scalar values such as int, float, etc. the size is simply the size of + // type on target machine, but for pointers, the size of data would be the + // next integer argument + Value* inputSize; + if(inputVal->getType()->isPointerTy()) { + // Pointer Input + inputSize = getInValueAt(C, i+1, F_X86, IB); + assert(inputSize->getType()->isIntegerTy() + && "Pointer type input must always be followed by size (integer type)"); + } + else { // Scalar Input + inputSize = ConstantExpr::getSizeOf(inputVal->getType()); + } + + Value* setInputArgs[] = {GraphID, + inputVal, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + inputSize + }; + CallInst::Create(llvm_visc_ptx_input_ptr, + ArrayRef<Value*>(setInputArgs, 4), "", IB); + + } + // Setup output + // FIXME: Note - There is a tricky question. In X86 we do not need to care + // about pointer inputs which modify data in memory implicitly (without + // showing it as output). There is no extra cost needed to handle such inputs + // For PTX, we need to read back such data from device memory to host memory. + // The cost is huge and hence we need to differentiate between readonly + // pointer inputs vs read/write pointer inputs. Currently supporting only a + // simple model in which all input edges are readonly and output is + // writeonly. + StructType* OutputTy = C->getOutputType(); + for(unsigned i=0; OutputTy->getNumElements(); i++) { + Type* elemTy = OutputTy->getElementType(i); + } + + // Enqueue kernel + // Read Output + // return output + // free data structures + +} + void CodeGenTraversal::codeGen(DFInternalNode* N) { Function* F = N->getFuncPointer(); @@ -443,7 +633,10 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { UndefValue::get(F_X86->getReturnType()), BB); //Add old func: new func pair to the FMap + // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap + // is required FMap[F] = F_X86; + N->setGenFunc(F_X86, DFNode::X86); // Add Index and Dim arguments except for the root node if(!N->isRoot()) @@ -459,86 +652,12 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { if (C->isDummyNode()) continue; - Function* CF = C->getFuncPointer(); - - assert(FMap.count(CF) - && "Found leaf node for which code generation has not happened yet!"); - Function* CF_X86 = FMap[CF]; - std::vector<Value*> Args; - // Create argument list to pass to call instruction - // First find the correct values using the edges - // The remaing six values are inserted as constants for now. - for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { - - // TODO: Assumption is that each input port of a node has just one - // incoming edge. May change later on. - - // Find the incoming edge at the requested input port - DFEdge* E = C->getInDFEdgeAt(i); - assert(E && "No incoming edge or binding for input element!"); - // Find the Source DFNode associated with the incoming edge - DFNode* SrcDF = E->getSourceDF(); - - // If Source DFNode is a dummyNode, edge is from parent. Get the - // argument from argument list of this internal node - Value* inputVal; - if(SrcDF->isEntryNode()) { - inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); - } - else { - // edge is from a sibling - // Check - code should already be generated for this source dfnode - assert(CallMap.count(SrcDF) - && "Source node call not found. Dependency violation!"); - - // Find CallInst associated with the Source DFNode using FMap - CallInst* CI = CallMap[SrcDF]; - - // Extract element at source position from this call instruction - std::vector<unsigned> IndexList; - IndexList.push_back(E->getSourcePosition()); - DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); - ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, - "", RI); - inputVal = EI; - } - // input value has been obtained. - Args.push_back(inputVal); - } - - Value* I32Zero = ConstantInt::get(Type::getInt32Ty(F_X86->getContext()), 0); - for(unsigned j=0; j<6; j++) - Args.push_back(I32Zero); - - // Call the F_X86 function associated with this node - CallInst* CI = CallInst::Create(CF_X86, Args, - CF_X86->getName()+"_output", - RI); - DEBUG(errs() << *CI << "\n"); - CallMap[C] = CI; - - // Find num of dimensions this node is replicated in. - // Based on number of dimensions, insert loop instructions - std::string varNames[3] = {"x", "y", "z"}; - for(unsigned j=0; j < C->getNumOfDim(); j++) { - Value* indexLimit; - // Limit can either be a constant or an arguement of the internal node. - // In case of constant we can use that constant value directly in the - // new F_X86 function. In case of an argument, we need to get the mapped - // value using VMap - if(isa<Constant>(C->getDimLimits()[j])) - indexLimit = C->getDimLimits()[j]; - else - indexLimit = VMap[C->getDimLimits()[j]]; - assert(indexLimit && "Invalid dimension limit!"); - // Insert loop - Value* indexVar = addLoop(CI, indexLimit, varNames[j]); - unsigned numArgs = CI->getNumArgOperands(); - // Insert index variable and limit arguments - CI->setArgOperand(numArgs-6+j, indexVar); - CI->setArgOperand(numArgs-3+j, indexLimit); - } + // Check if Child Node has PTX tag or X86 tag + invokeChild_PTX(C, F_X86, VMap, RI); + if (C->getTag() == DFNode::PTX) + invokeChild_PTX(C, F_X86, VMap, RI); + else + invokeChild_X86(C, F_X86, VMap, RI); } DEBUG(errs() << "*** Generating epilogue code for the function****\n"); @@ -550,6 +669,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { Value *retVal = UndefValue::get(F_X86->getReturnType()); // Find all the input edges to exit node for (unsigned i=0; i < OutTy->getNumElements(); i++) { + DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port DFEdge* E = C->getInDFEdgeAt(i); @@ -557,6 +677,8 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { // Find the Source DFNode associated with the incoming edge DFNode* SrcDF = E->getSourceDF(); + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); + // If Source DFNode is a dummyNode, edge is from parent. Get the // argument from argument list of this internal node Value* inputVal; @@ -567,11 +689,11 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { else { // edge is from a internal node // Check - code should already be generated for this source dfnode - assert(CallMap.count(SrcDF) + assert(OutputMap.count(SrcDF) && "Source node call not found. Dependency violation!"); - // Find CallInst associated with the Source DFNode using FMap - CallInst* CI = CallMap[SrcDF]; + // Find Output Value associated with the Source DFNode using OutputMap + Value* CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction std::vector<unsigned> IndexList; @@ -585,6 +707,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { IdxList.push_back(i); retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); } + DEBUG(errs() << "Extracted all\n"); retVal->setName("output"); ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); ReplaceInstWithInst(RI, newRI); @@ -628,7 +751,10 @@ void CodeGenTraversal::codeGen(DFLeafNode* N) { // Insert the cloned function into the module M.getFunctionList().push_back(F_X86); // Add old func: new func pair to the FMap + // FIXME: We do not require the FMap probably. Only one of setGenFunc or FMap + // is required FMap[F] = F_X86; + N->setGenFunc(F_X86, DFNode::X86); // Add the new argument to the argument list addIdxDimArgs(F_X86); diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index 9080a31b4a4a39aaea7ef4dffac4f055a24ad48d..4b05c8178bfedd860728ebdb97546b398e4fa33c 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -1,6 +1,7 @@ #include <pthread.h> #include <cstdlib> #include <cstdio> +#include <string> #include <CL/cl.h> typedef struct { @@ -14,89 +15,175 @@ typedef struct { cl_kernel clKernel; } DFNodeContext_PTX; +cl_context globalGPUContext; + +static inline void checkErr(cl_int err, cl_int success, const char * name) { + if (err != success) { + printf("ERROR: %s\n", name); + exit(EXIT_FAILURE); + } +} + extern "C" -__int32_t llvm_visc_launch_x86(void** graphID, void* (*rootFunc)(void*), void* arguments) { +void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) { DFNodeContext_X86 *Context = (DFNodeContext_X86 *) malloc(sizeof(DFNodeContext_X86)); - *graphID = Context; - return pthread_create(&Context->threadID, NULL, rootFunc, arguments); + int err; + if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0) + printf("Failed to create pthread. Error code = %d\n", err); + return Context; } extern "C" -__int32_t llvm_visc_wait_x86(void* graphID) { +void llvm_visc_x86_wait(void* graphID) { DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - return pthread_join(Context->threadID, NULL); + pthread_join(Context->threadID, NULL); } -static inline void checkErr(cl_int err, cl_int success, const char * name) { - if (err != success) { - printf("ERROR: %s\n", name); - exit(EXIT_FAILURE); - } +extern "C" +void* llvm_visc_ptx_initContext() { + cl_uint numPlatforms; + cl_int errcode; + errcode = clGetPlatformIDs(0, NULL, &numPlatforms); + checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); + + // now get all the platform IDs + cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms); + errcode = clGetPlatformIDs(numPlatforms, platforms, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs"); + + for(unsigned i=0; i < numPlatforms; i++) { + char buffer[10240]; + printf(" -- %d --\n", i); + clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL); + printf(" PROFILE = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL); + printf(" VERSION = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL); + printf(" NAME = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL); + printf(" VENDOR = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); + printf(" EXTENSIONS = %s\n", buffer); + } + // set platform property - just pick the first one + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + (long) platforms[0], + 0}; + globalGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, + NULL, NULL, &errcode); + checkErr(errcode, CL_SUCCESS, "Failure to create GPU context"); + return globalGPUContext; +} + +extern "C" +void llvm_visc_ptx_clearContext() { + clReleaseContext(globalGPUContext); } +extern "C" +void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) { + DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); + checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); +} + +extern "C" +void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) { + DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + cl_int errcode; + cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE | + CL_MEM_COPY_HOST_PTR, size, input, &errcode); + checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); + errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); + checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); + return d_input; +} + +extern "C" +void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) { + DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + cl_int errcode; + cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE, + size, NULL, &errcode); + checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); + errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); + checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); + return d_output; +} + +extern "C" +void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) { + void* h_output = malloc(size); + DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, + h_output, 0, NULL, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to read output"); + return h_output; +} + +extern "C" +void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) { + DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + cl_event* event; + cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, + Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event); + checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); + return event; +} + + ////////////////////////////////////////////////////////////////////////////// //! Loads a Program binary file. //! //! @return the source string if succeeded, 0 otherwise -//! @param cFilename program filename +//! @param Filename program filename //! @param szFinalLength returned length of the code string ////////////////////////////////////////////////////////////////////////////// -static char* LoadProgSource(const char* cFilename, size_t* szFinalLength) +static char* LoadProgSource(const char* Filename, size_t* szFinalLength) { - // locals - FILE* pFileStream = NULL; - size_t szSourceLength; - - // open the OpenCL source code file - #ifdef _WIN32 // Windows version - if(fopen_s(&pFileStream, cFilename, "rb") != 0) - { - return NULL; - } - #else // Linux version - pFileStream = fopen(cFilename, "rb"); - if(pFileStream == 0) - { - return NULL; - } - #endif - - // get the length of the source code - fseek(pFileStream, 0, SEEK_END); - szSourceLength = ftell(pFileStream); - fseek(pFileStream, 0, SEEK_SET); - - // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + 1); - if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; - } - - // close the file and return the total length of the combined (preamble + source) string - fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength; - } - cSourceString[szSourceLength] = '\0'; - - return cSourceString; + // locals + FILE* pFileStream = NULL; + size_t szSourceLength; + + // open the OpenCL source code file + pFileStream = fopen(Filename, "rb"); + if(pFileStream == 0) + { + return NULL; + } + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char* cSourceString = (char *)malloc(szSourceLength + 1); + if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) + { + fclose(pFileStream); + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + source) string + fclose(pFileStream); + if(szFinalLength != 0) + { + *szFinalLength = szSourceLength; + } + cSourceString[szSourceLength] = '\0'; + + return cSourceString; } extern "C" -__int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* arguments) { +void* llvm_visc_ptx_launch(const char* Filename) { // Initialize OpenCL // OpenCL specific variables DFNodeContext_PTX *Context = (DFNodeContext_PTX *) malloc(sizeof(DFNodeContext_PTX)); - // Return Context pointer as grpahID; - *graphID = Context; - - size_t dataBytes; size_t kernelLength; cl_int errcode; @@ -112,7 +199,7 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* /* Initialize OpenCL */ /*****************************************/ // query the number of platforms - cl_uint numPlatforms; + /*cl_uint numPlatforms; errcode = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); @@ -142,6 +229,9 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* Context->clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create GPU context"); + */ + // For a single context for all kernels + Context->clGPUContext = globalGPUContext; // get the list of GPU devices associated with context errcode = clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, 0, @@ -155,23 +245,13 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* Context->clCommandQue = clCreateCommandQueue(Context->clGPUContext, clDevices[0], 0, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create command queue"); - /* Application specific code - // Setup device memory - d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL, - &errcode); - d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_A, h_A, &errcode); - d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - bytes_B, h_B, &errcode); - */ - - char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &kernelLength); - checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary"); + char *programSource = LoadProgSource(Filename, &kernelLength); + checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary"); cl_int binaryStatus; Context->clProgram = clCreateProgramWithBinary(Context->clGPUContext, 1, &clDevices[0], &kernelLength, - (const unsigned char **)&clMatrixMul, + (const unsigned char **)&programSource, &binaryStatus, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); @@ -184,37 +264,7 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* // Invoke the callback function to put memory allocations in place - rootFunc(graphID); - - /* Application Specific Code - // Launch OpenCL kernel - size_t localWorkSize[2], globalWorkSize[2]; - - int wA = WA; - int wC = WC; - errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); - errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); - errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); - errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); - errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); - checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments"); - - localWorkSize[0] = BLOCK_SIZE; - localWorkSize[1] = BLOCK_SIZE; - globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE; - - errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, - globalWorkSize, localWorkSize, - 0, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); - - - // Retrieve result from device - errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, - h_C, 0, NULL, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to read buffer"); - */ + //rootFunc(Context); /* App specific code // Deallocate memory @@ -228,28 +278,19 @@ __int32_t llvm_visc_launch_ptx(void** graphID, void* (*rootFunc) (void*), void* */ free(clDevices); - free(clMatrixMul); + free(programSource); - /* - // Free in wait implementation - clReleaseContext(Context->clGPUContext); - clReleaseKernel(Context->clKernel); - clReleaseProgram(Context->clProgram); - */ - - return 0; + return Context; } extern "C" -__int32_t llvm_visc_wait_ptx(void* graphID) { +void llvm_visc_ptx_wait(void* graphID) { DFNodeContext_PTX *Context = (DFNodeContext_PTX*) graphID; clFinish(Context->clCommandQue); // Release - clReleaseContext(Context->clGPUContext); + // clReleaseContext(Context->clGPUContext); clReleaseKernel(Context->clKernel); clReleaseProgram(Context->clProgram); - - return 0; } diff --git a/llvm/test/VISC/unitTests/3level.ll b/llvm/test/VISC/unitTests/3level.ll index c884acf32cafdce42a5b8e219c3a1ab32676afae..72248b3b95d837cbbc6b0671c984b341b7690ba5 100644 --- a/llvm/test/VISC/unitTests/3level.ll +++ b/llvm/test/VISC/unitTests/3level.ll @@ -17,10 +17,10 @@ declare i8* @llvm.visc.createNode(i8*) #0 declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.launch(i8**, i8*, i8*) #0 +declare i8* @llvm.visc.launch(i8*, i8*) #0 ; Function Attrs: nounwind -declare i32 @llvm.visc.wait(i8*) #0 +declare void @llvm.visc.wait(i8*) #0 ; Function Attrs: nounwind declare i8* @llvm.visc.getNode() #0 @@ -42,11 +42,9 @@ entry: %1 = bitcast { i32, %rtype }* %in.addr to i32* store i32 %conv.i, i32* %1 %args = bitcast { i32, %rtype }* %in.addr to i8* - %graphIDloc = alloca i8* - %launch = call i32 @llvm.visc.launch(i8** %graphIDloc, i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) + %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (i32)* @Root to i8*), i8* %args) %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %conv.i) #0 - %graphID = load i8** %graphIDloc - %wait = call i32 @llvm.visc.wait(i8* %graphID) + call void @llvm.visc.wait(i8* %graphID) %2 = getelementptr { i32, %rtype }* %in.addr, i32 0, i32 1 %outputstruct = load %rtype* %2 %output1 = extractvalue %rtype %outputstruct, 0