diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 0670018dd8997d18b3d4a397c886d19099b81e37..a37d9c152504c9f4271daceff81ff5be83ec292c 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -17,8 +17,12 @@ #include "llvm/Pass.h" #include "llvm/Support/InstIterator.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/BuildDFG/BuildDFG.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker.h" +#include "llvm/Support/SourceMgr.h" #include <sstream> @@ -69,9 +73,28 @@ namespace { // we already have an index and dim extended function copy or not (i.e., // "Have we visited this function before?") ValueMap<Function*, Function*> FMap; + DenseMap<DFNode*, Value*> OutputMap; + + // VISC Runtime API + Module* runtimeModule; + Constant* llvm_visc_ptx_launch; + Constant* llvm_visc_ptx_wait; + Constant* llvm_visc_ptx_initContext; + Constant* llvm_visc_ptx_input_scalar; + Constant* llvm_visc_ptx_input_ptr; + Constant* llvm_visc_ptx_output_ptr; + Constant* llvm_visc_ptx_getOutput; + Constant* llvm_visc_ptx_executeNode; + //Functions void transformFunctionToVoid(Function* F); + void initRuntimeAPI(); + void addIdxDimArgs(Function* F); + Argument* getArgumentAt(Function* F, unsigned offset); + Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, + Instruction* InsertBefore); + void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& Kernel); void codeGen(DFInternalNode* N); void codeGen(DFLeafNode* N); @@ -101,6 +124,358 @@ namespace { }; + // Initialize the VISC runtime API. 
This makes it easier to insert these calls
+  // while generating code. The function types are copied from the declarations
+  // of the same name found in the visc-rt module, which must load successfully.
+  void CodeGenTraversal::initRuntimeAPI() {
+    // FIXME: hard-coded, developer-local absolute path; make it configurable.
+    // Load Runtime API Module
+    SMDiagnostic Err;
+    runtimeModule = ParseIRFile("/home/psrivas2/current-src/projects/visc-rt/visc-rt.ll", Err, M.getContext());
+    // Report failure even outside DEBUG: runtimeModule is dereferenced below.
+    if(runtimeModule == NULL)
+      errs() << "Failed to load visc-rt API module: " << Err.getMessage() << "\n";
+    assert(runtimeModule != NULL && "visc-rt API module could not be loaded");
+    DEBUG(errs() << "Successfully loaded visc-rt API module\n");
+
+    // Get or insert the global declarations for launch/wait functions
+    llvm_visc_ptx_launch = M.getOrInsertFunction("llvm_visc_ptx_launch",
+        runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_launch);
+
+    llvm_visc_ptx_wait = M.getOrInsertFunction("llvm_visc_ptx_wait",
+        runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_wait);
+
+    llvm_visc_ptx_initContext = M.getOrInsertFunction("llvm_visc_ptx_initContext" ,
+        runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_initContext);
+
+    llvm_visc_ptx_input_scalar = M.getOrInsertFunction("llvm_visc_ptx_input_scalar",
+        runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_input_scalar);
+
+    llvm_visc_ptx_input_ptr = M.getOrInsertFunction("llvm_visc_ptx_input_ptr",
+        runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_input_ptr);
+
+    llvm_visc_ptx_output_ptr = M.getOrInsertFunction("llvm_visc_ptx_output_ptr",
+        runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_output_ptr);
+
+    llvm_visc_ptx_getOutput = M.getOrInsertFunction("llvm_visc_ptx_getOutput",
+        runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_getOutput);
+
+    llvm_visc_ptx_executeNode = M.getOrInsertFunction("llvm_visc_ptx_executeNode",
+        runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType());
+    DEBUG(errs() << *llvm_visc_ptx_executeNode);
+  }
+
+  // Append six i32 arguments (idx_x, idx_y, idx_z, dim_x, dim_y, dim_z) to F
+  // and mutate the type of F so that it reflects the added arguments.
+  void CodeGenTraversal::addIdxDimArgs(Function* F) {
+    // Add Index and Dim arguments
+    std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
+    for (int i = 0; i < 6; ++i) {
+      new Argument(Type::getInt32Ty(F->getContext()), names[i], F);
+    }
+
+    // Create the argument type list with added argument types
+    std::vector<Type*> ArgTypes;
+    for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+        ai != ae; ++ai) {
+      ArgTypes.push_back(ai->getType());
+    }
+    // Adding new arguments to the function argument list, would not change the
+    // function type. We need to change the type of this function to reflect the
+    // added arguments
+    FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+    PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
+
+    // Change the function type
+    F->mutateType(PTy);
+  }
+
+  /* Traverse the function F argument list and return the argument at position
+   * offset (0-based); offset must be less than F's number of parameters. */
+  Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) {
+    // offset is unsigned, so only the upper bound needs to be checked.
+    assert(F->getFunctionType()->getNumParams() > offset
+           && "Invalid offset to access arguments!");
+
+    Function::arg_iterator i = F->arg_begin(), e = F->arg_end();
+    for(; offset != 0 && i!=e; ++i) {
+      offset--;
+    }
+    Argument* arg = i;
+    DEBUG(errs() << *F);
+    DEBUG(errs() << *arg <<"\n");
+    return arg;
+  }
+
+  // Return the value feeding input port i of node Child, for use inside the
+  // generated function ParentF_X86. For an edge from the parent's entry node
+  // this is an argument of ParentF_X86; otherwise an extractvalue of the value
+  // recorded in OutputMap for the source node is inserted before InsertBefore.
+  Value* CodeGenTraversal::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
+                                        Instruction* InsertBefore) {
+    // TODO: Assumption is that each input port of a node has just one
+    // incoming edge. May change later on.
+
+    // Find the incoming edge at the requested input port
+    DFEdge* E = Child->getInDFEdgeAt(i);
+    assert(E && "No incoming edge or binding for input element!");
+    // Find the Source DFNode associated with the incoming edge
+    DFNode* SrcDF = E->getSourceDF();
+
+    // If Source DFNode is the entry dummy node, the edge comes from the parent:
+    // read the parent's argument at the edge's source position (the port index
+    // i is the position on the child side of the edge).
+    Value* inputVal;
+    if(SrcDF->isEntryNode()) {
+      inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
+      DEBUG(errs() << "Argument "<< E->getSourcePosition() << " = " << *inputVal << "\n");
+    }
+    else {
+      // edge is from a sibling
+      // Check - code should already be generated for this source dfnode
+      assert(OutputMap.count(SrcDF)
+             && "Source node call not found. Dependency violation!");
+
+      // Find the output value recorded for the Source DFNode in OutputMap
+      Value* CI = OutputMap[SrcDF];
+
+      // Extract element at source position from this output value
+      std::vector<unsigned> IndexList;
+      IndexList.push_back(E->getSourcePosition());
+      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                                                      "", InsertBefore);
+      inputVal = EI;
+    }
+    return inputVal;
+  }
+
+  // Generate Code to call the kernel
+  // The plan is to replace the internal node with a leaf node. This method is
+  // used to generate a function to associate with this leaf node. The function
+  // is responsible for all the memory allocation/transfer and invoking the
+  // kernel call on the device.
+  //   N          - internal node whose child graph holds exactly one non-dummy
+  //                child (the kernel) besides the entry and exit dummy nodes
+  //   FileName, KernelName - emitted as constants for llvm_visc_ptx_launch
+  void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& KernelName) {
+    // Code must not have been generated for this node yet: the generated
+    // function is registered with setGenFunc() below, so none may exist here.
+    assert(N->getGenFunc() == NULL && "Code already generated for this node");
+
+    Function* F = N->getFuncPointer();
+
+    // Create F_X86, a new function with the same type as F (i.e. without the
+    // extra index/dim arguments) and no instructions yet. Only the signature is
+    // cloned; the body is generated below.
+    Function* F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(),
+                                       F->getName(), &M);
+
+    // Loop over the arguments, copying the names of arguments over.
+    Function::arg_iterator dest_iterator = F_X86->arg_begin();
+    for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
+         i != e; ++i, ++dest_iterator) {
+      dest_iterator->setName(i->getName()); // Copy the name over...
+    }
+
+    // Add an entry block ending in a placeholder return; code goes before RI
+    BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
+    ReturnInst* RI = ReturnInst::Create(M.getContext(),
+                                        UndefValue::get(F_X86->getReturnType()), BB);
+
+    // Record F_X86 as the generated function (X86 tag) of node N
+    N->setGenFunc(F_X86, DFNode::X86);
+
+    // FIXME: Adding Index and Dim arguments are probably not required except
+    // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
+    // have those arguments)
+
+    // Add Index and Dim arguments except for the root node
+    if(!N->isRoot())
+      addIdxDimArgs(F_X86);
+
+    // Sort children in topological order before code generation for kernel call
+    N->getChildGraph()->sortChildren();
+
+    // The DFNode N has the property that it has only one child (leaving Entry
+    // and Exit dummy nodes). This child is the PTX kernel. This simplifies code
+    // generation for kernel calls significantly. All the inputs to this child
+    // node would either be constants or from the parent node N.
+
+    assert(N->getChildGraph()->size() == 3
+           && "Node expected to have just one non-dummy node!");
+
+    DFNode* C = NULL;
+    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
+      C = *ci;
+      // Skip dummy node call
+      if (!C->isDummyNode())
+        break;
+    }
+
+    assert(C && C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+
+    Function* CF = C->getFuncPointer();
+    // Initialize context
+    CallInst::Create(llvm_visc_ptx_initContext, None, "", RI);
+
+    // Initialize command queue
+    Constant* file = ConstantDataArray::get(M.getContext(),
+        ArrayRef<uint8_t>((uint8_t*)FileName.str().c_str(), FileName.str().length()));
+
+    Constant* kernel = ConstantDataArray::get(M.getContext(),
+        ArrayRef<uint8_t>((uint8_t*)KernelName.str().c_str(), KernelName.str().length()));
+
+    Value* LaunchInstArgs[] = {file, kernel};
+    CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
+                                         ArrayRef<Value*>(LaunchInstArgs, 2),
+                                         "graph"+CF->getName(),
+                                         RI);
+    // Iterate over the required input edges of the node and use the visc-rt API
+    // to set inputs
+    for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+      Value* inputVal = getInValueAt(C, i, F_X86, RI);
+      // input value has been obtained.
+      // Check if input is a scalar value or a pointer operand
+      // For scalar values such as int, float, etc. the size is simply the size of
+      // type on target machine, but for pointers, the size of data would be the
+      // next integer argument
+      if(inputVal->getType()->isPointerTy()) {
+        // Pointer Input
+        Value* inputSize = getInValueAt(C, i+1, F_X86, RI);
+        assert(inputSize->getType()->isIntegerTy()
+               && "Pointer type input must always be followed by size (integer type)");
+        Value* setInputArgs[] = {GraphID,
+                                 inputVal,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                 inputSize
+                                };
+        CallInst::Create(llvm_visc_ptx_input_ptr,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+      else { // Scalar Input
+        Value* setInputArgs[] = {GraphID,
+                                 inputVal,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                 ConstantExpr::getSizeOf(inputVal->getType())
+                                };
+        CallInst::Create(llvm_visc_ptx_input_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+    // Setup output
+    // FIXME: Note - There is a tricky question. In X86 we do not need to care
+    // about pointer inputs which modify data in memory implicitly (without
+    // showing it as output). There is no extra cost needed to handle such inputs
+    // For PTX, we need to read back such data from device memory to host memory.
+    // The cost is huge and hence we need to differentiate between readonly
+    // pointer inputs vs read/write pointer inputs. Currently supporting only a
+    // simple model in which all input edges are readonly and output is
+    // writeonly.
+
+    // Set output
+    StructType* OutputTy = C->getOutputType();
+    unsigned outputIndex = CF->getFunctionType()->getNumParams();
+    Value* outputSize = ConstantExpr::getSizeOf(OutputTy);
+    Value* setOutputArgs[] = {GraphID,
+                              ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
+                              outputSize};
+
+    CallInst* d_Output = CallInst::Create(llvm_visc_ptx_output_ptr,
+                                          ArrayRef<Value*>(setOutputArgs,3),
+                                          "d_output."+CF->getName(),
+                                          RI);
+
+    // Enqueue kernel
+    // Need work dim, localworksize, globalworksize
+    // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, local work
+    // size and global work size
+    Value* ExecNodeArgs[] = {GraphID,
+                             ConstantInt::get(Type::getInt32Ty(M.getContext()), C->getNumOfDim()),
+                             Constant::getNullValue(Type::getInt64PtrTy(M.getContext())),
+                             Constant::getNullValue(Type::getInt64PtrTy(M.getContext()))
+                            };
+    CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode,
+                                       ArrayRef<Value*>(ExecNodeArgs, 4),
+                                       "event."+CF->getName(),
+                                       RI);
+    // Wait for Kernel to Finish
+    CallInst::Create(llvm_visc_ptx_wait,
+                     ArrayRef<Value*>(GraphID), "", RI);
+    // Read Output
+    Value* GetOutputArgs[] = {GraphID, d_Output, outputSize};
+    CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput,
+                                          ArrayRef<Value*>(GetOutputArgs, 3),
+                                          "h_output."+CF->getName(),
+                                          RI);
+    // Prepare output
+    // outputSize bytes were requested above; view the buffer as OutputTy* to load
+    CastInst* BI = BitCastInst::CreatePointerCast(h_Output, OutputTy->getPointerTo(), "output.ptr", RI);
+    LoadInst* KernelOutput = new LoadInst(BI, "output."+CF->getName(), RI);
+    OutputMap[C] = KernelOutput;
+
+    DEBUG(errs() << "*** Generating epilogue code for the function****\n");
+    // Generate code for output bindings
+    // Get Exit node
+    C = N->getChildGraph()->getExit();
+    // Get OutputType of this node
+    StructType* OutTy = N->getOutputType();
+    Value *retVal = UndefValue::get(F_X86->getReturnType());
+    // Find all the input edges to exit node
+    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+      DEBUG(errs() << "Output Edge " << i << "\n");
+      // Find the incoming edge at the requested input port
+      DFEdge* E = C->getInDFEdgeAt(i);
+
+      assert(E && "No Binding for output element!");
+      // Find the Source DFNode associated with the incoming edge
+      DFNode* SrcDF = E->getSourceDF();
+
+      DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n");
+
+      // If Source DFNode is the entry dummy node, the edge is from the parent.
+      // Get the argument of this internal node at the edge's source position
+      Value* inputVal;
+      if(SrcDF->isEntryNode()) {
+        inputVal = getArgumentAt(F_X86, E->getSourcePosition());
+        DEBUG(errs() << "Argument "<< E->getSourcePosition() << " = " << *inputVal << "\n");
+      }
+      else {
+        // edge is from an internal node
+        // Check - code should already be generated for this source dfnode
+        assert(OutputMap.count(SrcDF)
+               && "Source node call not found. Dependency violation!");
+
+        // Find Output Value associated with the Source DFNode using OutputMap
+        Value* CI = OutputMap[SrcDF];
+
+        // Extract element at source position from this output value
+        std::vector<unsigned> IndexList;
+        IndexList.push_back(E->getSourcePosition());
+        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
+        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
+                                                        "",RI);
+        inputVal = EI;
+      }
+      std::vector<unsigned> IdxList;
+      IdxList.push_back(i);
+      retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
+    }
+    DEBUG(errs() << "Extracted all\n");
+    retVal->setName("output");
+    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReplaceInstWithInst(RI, newRI);
+  }
+
+  // Right now, only targeting the one level case.
In general, device functions // can return values so we don't need to change them void CodeGenTraversal::codeGen(DFInternalNode* N) { @@ -144,7 +519,7 @@ namespace { std::vector<IntrinsicInst *> IItoRemove; BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap; - // Get the function associated with the dataflow node + // Get the function associated with the dataflow node Function *F = N->getFuncPointer(); // Look up if we have visited this function before. If we have, then just @@ -381,7 +756,7 @@ namespace { } else { //TODO: how to handle address space qualifiers in load/store } - + } // We need to do this explicitly: DCE pass will not remove them because we diff --git a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp index 71de15d38546b00fa46c0dc11888a564ee5bef76..8b61d42af9cf2987db155cb88f246315519dc597 100644 --- a/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp @@ -67,15 +67,6 @@ private: Module* runtimeModule; Constant* llvm_visc_x86_launch; Constant* llvm_visc_x86_wait; - Constant* llvm_visc_ptx_launch; - Constant* llvm_visc_ptx_wait; - Constant* llvm_visc_ptx_initContext; - Constant* llvm_visc_ptx_input_scalar; - Constant* llvm_visc_ptx_input_ptr; - Constant* llvm_visc_ptx_output_ptr; - Constant* llvm_visc_ptx_getOutput; - Constant* llvm_visc_ptx_executeNode; - FunctionType* AppFuncTy; //Functions @@ -162,38 +153,6 @@ void CodeGenTraversal::initRuntimeAPI() { llvm_visc_x86_wait = M.getOrInsertFunction("llvm_visc_x86_wait", runtimeModule->getFunction("llvm_visc_x86_wait")->getFunctionType()); DEBUG(errs() << *llvm_visc_x86_wait); - - llvm_visc_ptx_launch = M.getOrInsertFunction("llvm_visc_ptx_launch", - runtimeModule->getFunction("llvm_visc_ptx_launch")->getFunctionType()); - DEBUG(errs() << *llvm_visc_ptx_launch); - - llvm_visc_ptx_wait = M.getOrInsertFunction("llvm_visc_ptx_wait", - runtimeModule->getFunction("llvm_visc_ptx_wait")->getFunctionType()); 
- DEBUG(errs() << *llvm_visc_ptx_wait); - - llvm_visc_ptx_initContext = M.getOrInsertFunction("llvm_visc_ptx_initContext" , - runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType()); - DEBUG(errs() << *llvm_visc_ptx_initContext); - - llvm_visc_ptx_input_scalar = M.getOrInsertFunction("llvm_visc_ptx_input_scalar", - runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType()); - DEBUG(errs() << *llvm_visc_ptx_input_scalar); - - llvm_visc_ptx_input_ptr = M.getOrInsertFunction("llvm_visc_ptx_input_ptr", - runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType()); - DEBUG(errs() << *llvm_visc_ptx_input_ptr); - - llvm_visc_ptx_output_ptr = M.getOrInsertFunction("llvm_visc_ptx_output_ptr", - runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType()); - DEBUG(errs() << *llvm_visc_ptx_output_ptr); - - llvm_visc_ptx_getOutput = M.getOrInsertFunction("llvm_visc_ptx_getOutput", - runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType()); - DEBUG(errs() << *llvm_visc_ptx_getOutput); - - llvm_visc_ptx_executeNode = M.getOrInsertFunction("llvm_visc_ptx_executeNode", - runtimeModule->getFunction("llvm_visc_ptx_executeNode")->getFunctionType()); - DEBUG(errs() << *llvm_visc_ptx_executeNode); } @@ -345,7 +304,7 @@ void CodeGenTraversal::codeGenLaunch(DFInternalNode* Root) { */ // Create Launch Function of type i8*(i8*) which calls the root function Type* i8Ty = Type::getInt8Ty(M.getContext()); - AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), + FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(), ArrayRef<Type*>(i8Ty->getPointerTo()), false); Function* AppFunc = Function::Create(AppFuncTy, @@ -523,82 +482,6 @@ void CodeGenTraversal::invokeChild_X86(DFNode* C, Function* F_X86, } -void CodeGenTraversal::invokeChild_PTX(DFNode* C, Function* F_X86, - ValueToValueMapTy &VMap, Instruction* IB) { - Function* CF = C->getFuncPointer(); - - //FIXME: A way to check if PTX code has been 
generated for this child node - /*assert(FMap.count(CF) - && "Found leaf node for which code generation has not happened yet!"); - */ - //assert(C->getTag() == DFNode::PTX && "Cannot generate GPU call for non PTX nodes"); - - // Initialize context - CallInst::Create(llvm_visc_ptx_initContext, None, "", IB); - - // Initialize command queue - // Filename = <DFNode function name>.nvptx.ll - Twine file = CF->getName() + ".nvptx.ll"; - DEBUG(errs() << file << "\n"); - Constant* filename = ConstantDataArray::get(M.getContext(), - ArrayRef<uint8_t>((uint8_t*)file.str().c_str(), file.str().length())); - - CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch, - ArrayRef<Value*>(filename), - "graph"+CF->getName(), - IB); - - // Iterate over the required input edges of the node and use the visc-rt API - // to set inputs - for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { - - Value* inputVal = getInValueAt(C, i, F_X86, IB); - // input value has been obtained. - // Check if input is a scalar value or a pointer operand - // For scalar values such as int, float, etc. the size is simply the size of - // type on target machine, but for pointers, the size of data would be the - // next integer argument - Value* inputSize; - if(inputVal->getType()->isPointerTy()) { - // Pointer Input - inputSize = getInValueAt(C, i+1, F_X86, IB); - assert(inputSize->getType()->isIntegerTy() - && "Pointer type input must always be followed by size (integer type)"); - } - else { // Scalar Input - inputSize = ConstantExpr::getSizeOf(inputVal->getType()); - } - - Value* setInputArgs[] = {GraphID, - inputVal, - ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - inputSize - }; - CallInst::Create(llvm_visc_ptx_input_ptr, - ArrayRef<Value*>(setInputArgs, 4), "", IB); - - } - // Setup output - // FIXME: Note - There is a tricky question. In X86 we do not need to care - // about pointer inputs which modify data in memory implicitly (without - // showing it as output). 
There is no extra cost needed to handle such inputs - // For PTX, we need to read back such data from device memory to host memory. - // The cost is huge and hence we need to differentiate between readonly - // pointer inputs vs read/write pointer inputs. Currently supporting only a - // simple model in which all input edges are readonly and output is - // writeonly. - StructType* OutputTy = C->getOutputType(); - for(unsigned i=0; OutputTy->getNumElements(); i++) { - Type* elemTy = OutputTy->getElementType(i); - } - - // Enqueue kernel - // Read Output - // return output - // free data structures - -} - void CodeGenTraversal::codeGen(DFInternalNode* N) { Function* F = N->getFuncPointer(); @@ -653,11 +536,7 @@ void CodeGenTraversal::codeGen(DFInternalNode* N) { continue; // Check if Child Node has PTX tag or X86 tag - invokeChild_PTX(C, F_X86, VMap, RI); - if (C->getTag() == DFNode::PTX) - invokeChild_PTX(C, F_X86, VMap, RI); - else - invokeChild_X86(C, F_X86, VMap, RI); + invokeChild_X86(C, F_X86, VMap, RI); } DEBUG(errs() << "*** Generating epilogue code for the function****\n"); diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index 4b05c8178bfedd860728ebdb97546b398e4fa33c..f753d9ad1d38d3178d3e3a4f27ff4f1006a18987 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -15,6 +15,11 @@ typedef struct { cl_kernel clKernel; } DFNodeContext_PTX; +typedef struct { + cl_mem d_elem; + size_t size; +} OutputTy; + cl_context globalGPUContext; static inline void checkErr(cl_int err, cl_int success, const char * name) { @@ -118,6 +123,18 @@ void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) { cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, h_output, 0, NULL, NULL); checkErr(errcode, CL_SUCCESS, "Failure to read output"); + // Assuming all output is in the format of device pointer followed by size of + // output size format + OutputTy* 
Output = (OutputTy*) h_output; + unsigned numElems = size/sizeof(OutputTy); + for(unsigned i = 0; i < numElems; i++) { + OutputTy& outputElem = Output[i]; + void* h_outputElem = malloc(outputElem.size); + errcode = clEnqueueReadBuffer(Context->clCommandQue, outputElem.d_elem, CL_TRUE, 0, + outputElem.size, h_outputElem, 0, NULL, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to read output"); + Output[i].d_elem = (cl_mem) h_outputElem; + } return h_output; }