diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index d7cdfa2e40cc09daf0987900d4619ac923d6e1dc..425cabd0b485db37c9ea1481e05905f43aef8523 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -443,8 +443,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa DEBUG(errs() << "Initializing commandQ" << "\n"); // Initialize command queue Value* fileStr = getStringPointer(FileName, RI, "Filename"); - errs() << *fileStr << "\n"; - errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n"; + DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); + DEBUG(errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n"); Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName"); Value* LaunchInstArgs[] = {fileStr, kernelStr}; @@ -458,7 +458,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // Iterate over the required input edges of the node and use the visc-rt API // to set inputs DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); - std::vector<OutputPtr> OutputPointers; for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { @@ -469,6 +468,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // type on target machine, but for pointers, the size of data would be the // next integer argument if(inputVal->getType()->isPointerTy()) { + // Pointer Input // CheckAttribute Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False; Value* isInput = ((hasAttribute(CF, i, Attribute::Out)) @@ -482,11 +482,11 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa errs() << *A << " is an INPUT argument\n"; } + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, Type::getInt8PtrTy(M.getContext()), inputVal->getName()+".i8ptr", RI); - // Pointer Input Value* inputSize = getInValueAt(C, i+1, F_X86, RI); assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) && "Pointer type input must always be followed by size (integer type)"); @@ -503,7 +503,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // memory to read device memory later if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); } - else { // Scalar Input + else { + // Scalar Input // Store the scalar value on stack and then pass the pointer to its // location AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI); @@ -525,15 +526,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa } DEBUG(errs() << "Setup output edges of node and insert visc api\n"); - // Setup output - // FIXME: Note - There is a tricky question. In X86 we do not need to care - // about pointer inputs which modify data in memory implicitly (without - // showing it as output). There is no extra cost needed to handle such inputs - // For PTX, we need to read back such data from device memory to host memory. - // The cost is huge and hence we need to differentiate between readonly - // pointer inputs vs read/write pointer inputs. Currently supporting only a - // simple model in which all input edges are readonly and output is - // writeonly. // Set output StructType* OutputTy = C->getOutputType(); @@ -554,8 +546,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // Enqueue kernel // Need work dim, localworksize, globalworksize - // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work - // size and global work size // Allocate size_t[numDims] space on stack. Store the work group sizes and // pass it as an argument to ExecNode @@ -571,11 +561,13 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa "event."+CF->getName(), RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); + // Wait for Kernel to Finish CallInst::Create(llvm_visc_ptx_wait, ArrayRef<Value*>(GraphID), "", RI); + // Read Output Struct Value* GetOutputArgs[] = {GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), @@ -589,7 +581,8 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa // Read each device pointer listed in output struct // Load the output struct CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI); - Value* KernelOutput = new LoadInst(BI, "", RI); + Value* KernelOutput = new LoadInst(BI, "output."+CF->getName(), RI); + OutputMap[C] = KernelOutput; // Read all the pointer arguments which had side effects i.e., had out // attribute @@ -602,37 +595,6 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa ArrayRef<Value*>(GetOutputArgs, 4), "", RI); } - /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) { - Type* elemTy = OutputTy->getElementType(i); - if(elemTy->isPointerTy()) { - // Pointer type - assert(OutputTy->getElementType(i+1) == Type::getInt64Ty(M.getContext()) - && "Every Pointer type must be followed by an integer"); - ExtractValueInst* d_ptr = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i), "", RI); - // Change d_ptr to i8* - CastInst* d_ptr_i8 = BitCastInst::CreatePointerCast(d_ptr, Type::getInt8PtrTy(M.getContext()), "", RI); - ExtractValueInst* len = ExtractValueInst::Create(KernelOutput, ArrayRef<unsigned>(i+1), "", RI); - // GetOutputPtr call - Value* GetOutputArgs[] = {GraphID, - d_ptr_i8, - len}; - CallInst* h_ptr_i8 = CallInst::Create(llvm_visc_ptx_getOutput, - ArrayRef<Value*>(GetOutputArgs, 3), - "", - RI); - // Change h_ptr to correct type - CastInst* h_ptr = CastInst::CreatePointerCast(h_ptr_i8, - cast<StructType>(KernelOutput->getType())->getElementType(i), - "", - RI); - KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI); - - } - }*/ - - // Prepare output - KernelOutput->setName("output."+CF->getName()); - OutputMap[C] = KernelOutput; DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings @@ -681,6 +643,7 @@ void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileNa IdxList.push_back(i); retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI); } + DEBUG(errs() << "Extracted all\n"); retVal->setName("output"); ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);