From 90d19a953063bd66aa8e44f0fc93ea6af5ea29e9 Mon Sep 17 00:00:00 2001 From: Adel Ejjeh <aejjeh@tyler.cs.illinois.edu> Date: Fri, 17 Jan 2020 19:03:24 -0600 Subject: [PATCH] adding modifications to NVPTX pass --- .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp | 1481 ++++++++++------- 1 file changed, 925 insertions(+), 556 deletions(-) diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 0ee18394ba..c85a8a4f2d 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -1130,7 +1130,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { // constant memory, subject to size of course std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); - F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, CONSTANT_ADDRSPACE); + F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE); @@ -1416,350 +1416,720 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) { default: llvm_unreachable("Unknown VISC Intrinsic!"); break; - } + } + + } + else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { + IRBuilder<> Builder(I); + Value *Source = MemCpyI->getSource(); + Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); + Value *Length = MemCpyI->getOperand(2); + DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); + DEBUG(errs() << "Source: " << *Source << "\n"); + DEBUG(errs() << "Destination: " << *Destination << "\n"); + DEBUG(errs() << "Length: " << *Length << "\n"); + + size_t memcpy_length; + unsigned int memcpy_count; + if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) { + if (CI->getBitWidth() <= 64) { + memcpy_length = CI->getSExtValue(); + DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); + Type *Source_Type = Source->getType()->getPointerElementType(); + DEBUG(errs() << "Source Type : " << *Source_Type << "\n"); + memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); + DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); + if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) { + if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) { + Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); + Value *DestPtrOperand = destGEPI->getPointerOperand(); + for(int i = 0; i < memcpy_count; ++i) { + Constant *increment; + LoadInst *newLoadI; + StoreInst *newStoreI; + // First, need to increment the correct index for both source and dest + // This invluves checking to see how many indeces the GEP has + // Assume for now only 1 or 2 are the viable options. + + std::vector<Value*> GEPlIndex; + if (sourceGEPI->getNumIndices() == 1) { + Value *Index = sourceGEPI->getOperand(1); + increment = ConstantInt::get(Index->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPlIndex.push_back(incAdd); + Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex)); + DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); + newLoadI = Builder.CreateLoad(newGEPIl); + DEBUG(errs() << "Load: " << *newLoadI << "\n"); + } else { + llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n"); + } + + + std::vector<Value*> GEPsIndex; + if (destGEPI->getNumIndices() == 1) { + + } else if (destGEPI->getNumIndices() == 2) { + Value *Index0 = destGEPI->getOperand(1); + GEPsIndex.push_back(Index0); + Value *Index1 = destGEPI->getOperand(2); + increment = ConstantInt::get(Index1->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index1, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPsIndex.push_back(incAdd); + Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex)); + DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); + newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile()); + DEBUG(errs() << "Store: " << *newStoreI << "\n"); + } else { + llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n"); + } + } + IItoRemove.push_back(sourceGEPI); + IItoRemove.push_back(destGEPI); + Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); + Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); + IItoRemove.push_back(destBitcastI); + IItoRemove.push_back(sourceBitcastI); + IItoRemove.push_back(MemCpyI); + } + } + + } + } else { + llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); + } + // llvm_unreachable("HERE!"); + } + + else if(CallInst* CI = dyn_cast<CallInst>(I)) { + DEBUG(errs() << "Found a call: " << *CI << "\n"); + Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); + if(calleeF->isDeclaration()) { + // Add the declaration to kernel module + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); + KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) { + // Now handle a few specific intrinsics + // For now, sin and cos are translated to their libclc equivalent + switch(II->getIntrinsicID()) { + case Intrinsic::sin: + case Intrinsic::cos: + { + DEBUG(errs() << "Found sincos: " << *II << "\n"); + // Get the libclc function + // libclc uses mangled name for sin cos + assert(II->getType()->isFloatTy() + && "Only handling sin(float) and cos(float)!"); + std::string name; + if(II->getIntrinsicID() == Intrinsic::sin) + name = "sin"; + else + name = "cos"; + + FunctionType* SinCosFT = FunctionType::get(II->getType(), + Type::getFloatTy(KernelM->getContext()), + false); + FunctionCallee LibclcFunction = KernelM->getOrInsertFunction(name, SinCosFT); + CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II); + + II->replaceAllUsesWith(CI); + IItoRemove.push_back(II); + break; + } + case Intrinsic::floor: + { + DEBUG(errs() << "Found floor intrinsic\n"); + F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f); + FunctionType* FTy = F->getFunctionType(); + DEBUG(errs() << *F << "\n"); + + // Create argument list + std::vector<Value*> args; + assert(CI->getNumArgOperands() == FTy->getNumParams() + && "Number of arguments of call do not match with Intrinsic"); + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + Value* V = CI->getArgOperand(i); + // Either the type should match or both should be of pointer type + assert((V->getType() == FTy->getParamType(i) || + (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())) + && "Dummy function call argument does not match with Intrinsic argument!"); + // If the types do not match, then both must be pointer type and pointer + // cast needs to be performed + if(V->getType() != FTy->getParamType(i)) { + V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); + } + args.push_back(V); + } + // Insert call instruction + CallInst* Inst = CallInst::Create(F, args, + F->getReturnType()->isVoidTy()? "" : CI->getName(), CI); + DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); + CI->replaceAllUsesWith(Inst); + IItoRemove.push_back(II); + break; + } + default: + errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ; + } + } + + } + else { + // Check if the called function has already been cloned before. + Function *NewFunc = CloneAndReplaceCall(CI, calleeF); + // Iterate over the new function to see if it calls any other functions + // in the module. + for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) { + if(auto *Call = dyn_cast<CallInst>(&*i)) { + Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts()); + CloneAndReplaceCall(Call, CalledFunc); + } + } + } + //TODO: how to handle address space qualifiers in load/store + } + + } + // search for pattern where float is being casted to int and loaded/stored and change it. + DEBUG(errs() << "finding pattern for replacement!\n"); + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + bool cont = false; + bool keepGEPI = false; + bool keepGEPI2= false; + Instruction *I = &(*i); + GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I); + if (!GEPI) { + // did nod find pattern start, continue + continue; + } + // may have found pattern, check + DEBUG(errs() << "GEPI " << *GEPI << "\n"); + // print whatever we want for debug + Value* PtrOp = GEPI->getPointerOperand(); + Type *SrcTy = GEPI->getSourceElementType(); + unsigned GEPIaddrspace = GEPI->getAddressSpace(); + + if (SrcTy->isArrayTy()) + DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n"); + else + DEBUG(errs() << *SrcTy << " is not an array type!\n"); + // check that source element type is float + if (SrcTy->isArrayTy()) { + if (!(SrcTy->getArrayElementType()->isFloatTy())) { + DEBUG(errs() << "GEPI type is array but not float!\n"); + continue; + } + } + else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) { + DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); + // does not fit this pattern - no float GEP instruction + continue; + } + // check that addressspace is 1 + // if (GEPIaddrspace != 1) { + // // does not fit this pattern - addrspace of pointer argument is not global + // continue; + // } + if (!(GEPI->hasOneUse())) { + // does not fit this pattern - more than one uses + //continue; + // Keep GEPI around if it has other uses + keepGEPI = true; } - else if(CallInst* CI = dyn_cast<CallInst>(I)) { - DEBUG(errs() << "Found a call: " << *CI << "\n"); - Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); - if(calleeF->isDeclaration()) { - // Add the declaration to kernel module - DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); - KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); - if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) { - // Now handle a few specific intrinsics - // For now, sin and cos are translated to their libclc equivalent - switch(II->getIntrinsicID()) { - case Intrinsic::sin: - case Intrinsic::cos: - { - DEBUG(errs() << "Found sincos: " << *II << "\n"); - // Get the libclc function - // libclc uses mangled name for sin cos - assert(II->getType()->isFloatTy() - && "Only handling sin(float) and cos(float)!"); - std::string name; - if(II->getIntrinsicID() == Intrinsic::sin) - name = "sin"; - else - name = "cos"; - - FunctionType* SinCosFT = FunctionType::get(II->getType(), - Type::getFloatTy(KernelM->getContext()), - false); - FunctionCallee LibclcFunction = KernelM->getOrInsertFunction(name, SinCosFT); - CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II); - - II->replaceAllUsesWith(CI); - IItoRemove.push_back(II); - break; - } - case Intrinsic::floor: - { - DEBUG(errs() << "Found floor intrinsic\n"); - F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f); - FunctionType* FTy = F->getFunctionType(); - DEBUG(errs() << *F << "\n"); - - // Create argument list - std::vector<Value*> args; - assert(CI->getNumArgOperands() == FTy->getNumParams() - && "Number of arguments of call do not match with Intrinsic"); - for(unsigned i=0; i < CI->getNumArgOperands(); i++) { - Value* V = CI->getArgOperand(i); - // Either the type should match or both should be of pointer type - assert((V->getType() == FTy->getParamType(i) || - (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())) - && "Dummy function call argument does not match with Intrinsic argument!"); - // If the types do not match, then both must be pointer type and pointer - // cast needs to be performed - if(V->getType() != FTy->getParamType(i)) { - V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); - } - args.push_back(V); - } - // Insert call instruction - CallInst* Inst = CallInst::Create(F, args, - F->getReturnType()->isVoidTy()? "" : CI->getName(), CI); - DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); - CI->replaceAllUsesWith(Inst); - IItoRemove.push_back(II); - break; - } - default: - errs() << "[WARNING] Found Intrinsic: " << *II << "\n" ; - } + DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); + + // 1st GEPI it has one use + // assert(GEPI->hasOneUse() && "GEPI has a single use"); + + // See if it is a bitcast + BitCastInst *BitCastI; + for (User * U : GEPI->users()) { + if(Instruction *ui = dyn_cast<Instruction> (U)) { + DEBUG(errs() << "--" << *ui << "\n"); + if (isa<BitCastInst>(ui)) { + BitCastI = dyn_cast<BitCastInst>(ui); + DEBUG(errs() << "---Found bitcast as only use of GEP\n"); + break; } - } - else { - // Check if the called function has already been cloned before. - Function *NewFunc = CloneAndReplaceCall(CI, calleeF); - // Iterate over the new function to see if it calls any other functions - // in the module. - for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) { - if(auto *Call = dyn_cast<CallInst>(&*i)) { - Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts()); - CloneAndReplaceCall(Call, CalledFunc); - } + DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); + cont = true; + } + // for (Value::user_iterator ui = GEPI->user_begin(), + // ue = GEPI->user_end(); ui!=ue; ++ui) { + // DEBUG(errs() << "--" << *ui << "\n"); + // if (isa<BitCastInst>(*ui)) { + // BitCastI = dyn_cast<BitCastInst>(*ui); + // DEBUG(errs() << "Found bitcast as only use of GEP\n"); + // } + // } + + if (cont/*!BitCastI*/) { + continue; // not in pattern + } + + // DEBUG(errs() << *BitCastI << "\n"); + // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP. + Value *Op2 = BitCastI->getOperand(0); + DEBUG(errs() << "----" << *Op2 << "\n"); + // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); + // Type *OpTy = cast<Type>(Op2); + Type *OpTy = BitCastI->getDestTy(); + DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); + // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n"); + if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { + // maybe right syntax is (Type::getInt32Ty)->getPointerTo() + continue; // not in pattern + } + + DEBUG(errs() << "----Here!\n"); + // We are in GEP, bitcast. + + // user_iterator, to find the load. + + if (!(BitCastI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + } + DEBUG(errs() << "----Bitcast has one use!\n"); + // it has one use + assert(BitCastI->hasOneUse() && "BitCastI has a single use"); + LoadInst *LoadI; + for (User * U : BitCastI->users()) { + if (Instruction *ui = dyn_cast<Instruction> (U)) { + DEBUG(errs() << "-----" << *ui << "\n"); + if (isa<LoadInst>(ui)) { + LoadI = dyn_cast<LoadInst>(ui); + DEBUG(errs() << "-----Found load as only use of bitcast\n"); + break; } } - //TODO: how to handle address space qualifiers in load/store + DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); + cont = true; + } + // for (Value::user_iterator ui = BitCastI->user_begin(), + // ue = BitCastI->user_end(); ui!=ue; ++ui) { + // if (isa<LoadInst>(*ui)) { + // LoadI = dyn_cast<LoadInst>(*ui); + // errs() << "Found load as only use of bitcast\n"; + // } + // } + + if (cont) { + continue; // not in pattern } - } + DEBUG("HERE!\n"); + // check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from + assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n"); - // We need to do this explicitly: DCE pass will not remove them because we - // have assumed theworst memory behaviour for these function calls - // Traverse the vector backwards, otherwise definitions are deleted while - // their subsequent uses are still around - for (auto *I : reverse(IItoRemove)) { - DEBUG(errs() << "Erasing: " << *I << "\n"); - I->eraseFromParent(); - } + // Copy user_iterator, to find the store. - // Removed the cloned functions from the parent module into the new module - for(auto *F : FuncToBeRemoved) { - F->removeFromParent(); //TODO: MARIA check - KernelM->getFunctionList().push_back(F); - } + if (!(LoadI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + // TODO: generalize: one load can have more than one store users + } + + // it has one use + assert(LoadI->hasOneUse() && "LoadI has a single use"); + Value::user_iterator ui = LoadI->user_begin(); + // skipped loop, because is has a single use + StoreInst *StoreI = dyn_cast<StoreInst>(*ui); + if (!StoreI) { + continue; // not in pattern + } + + // Also check that the store uses the loaded value as the value operand + if (StoreI->getValueOperand() != LoadI) { + continue; + } + + DEBUG(errs() << "-------Found store instruction\n"); + + // Look for its bitcast, which is its pointer operand + Value *StPtrOp = StoreI->getPointerOperand(); + DEBUG(errs() << "-------" << *StPtrOp << "\n"); + BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); + DEBUG(errs() << "-------" << *BitCastI2 << "\n"); + if (!BitCastI2) { + continue; //not in pattern + } + + DEBUG(errs() << "-------- Found Bit Cast of store!\n" ); + // found bitcast. Look for the second GEP, its from operand. + Value *BCFromOp = BitCastI2->getOperand(0); + GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); + DEBUG(errs() << "---------- " << *GEPI2 << "\n"); + if (!GEPI2) { + continue; //not in pattern + } + + if (!(GEPI2->hasOneUse())) { + // does not fit this pattern - more than one uses + //continue; + // Keep GEPI around if it has other uses + keepGEPI2 = true; + } + DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); - addCLMetadata(F_nvptx); - kernel->KernelFunction = F_nvptx; - errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; - DEBUG(errs() << *KernelM); + Value *PtrOp2 = GEPI2->getPointerOperand(); + + // Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above. + + // Assume we found pattern + if (!keepGEPI) { + IItoRemove.push_back(GEPI); + DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); + } else { + DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); + } + IItoRemove.push_back(BitCastI); + DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); + IItoRemove.push_back(LoadI); + DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); + IItoRemove.push_back(GEPI2); + DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); + IItoRemove.push_back(BitCastI2); + DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); + if (!keepGEPI2) { + IItoRemove.push_back(StoreI); + DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); + } else { + + DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n"); + } + + std::vector<Value*> GEPlIndex; + if (GEPI->hasIndices()) { + for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); + GEPlIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); + + std::vector<Value*> GEPsIndex; + if (GEPI2->hasIndices()) { + for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); + GEPsIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); + + + + // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); + GetElementPtrInst* newlGEP = + GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()), + PtrOp, // operand from 1st GEP + ArrayRef<Value*>(GEPlIndex), + Twine(), + StoreI); + DEBUG(errs() << "Adding: " << *newlGEP << "\n"); + // insert load before GEPI + LoadInst *newLoadI = + new LoadInst(Type::getFloatTy(M.getContext()), + newlGEP, // new GEP + Twine(), + LoadI->isVolatile(), + LoadI->getAlignment(), + LoadI->getOrdering(), + LoadI->getSyncScopeID(), + StoreI); + DEBUG(errs() << "Adding: " << *newLoadI << "\n"); + // same for GEP for store, for store operand + GetElementPtrInst* newsGEP = + GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), + PtrOp2, // operand from 2nd GEP + ArrayRef<Value*>(GEPsIndex), + Twine(), + StoreI); + DEBUG(errs() << "Adding: " << *newsGEP << "\n"); + // insert store before GEPI + StoreInst *newStoreI = + new StoreInst(newLoadI, + newsGEP, // new GEP + StoreI->isVolatile(), + StoreI->getAlignment(), + StoreI->getOrdering(), + StoreI->getSyncScopeID(), + StoreI); + DEBUG(errs() << "Adding: " << *newStoreI << "\n"); - return; + } + + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (auto *I : reverse(IItoRemove)) { + DEBUG(errs() << "Erasing: " << *I << "\n"); + I->eraseFromParent(); + } + + // Removed the cloned functions from the parent module into the new module + for(auto *F : FuncToBeRemoved) { + F->removeFromParent(); //TODO: MARIA check + KernelM->getFunctionList().push_back(F); + } + + addCLMetadata(F_nvptx); + kernel->KernelFunction = F_nvptx; + errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; + DEBUG(errs() << *KernelM); + + return; } bool DFG2LLVM_NVPTX::runOnModule(Module &M) { - errs() << "\nDFG2LLVM_NVPTX PASS\n"; + errs() << "\nDFG2LLVM_NVPTX PASS\n"; - // Get the BuildDFG Analysis Results: - // - Dataflow graph - // - Maps from i8* hansles to DFNode and DFEdge - BuildDFG &DFG = getAnalysis<BuildDFG>(); + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); - // DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode*> Roots = DFG.getRoots(); - // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); - // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - // Visitor for Code Generation Graph Traversal - CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); + // Visitor for Code Generation Graph Traversal + CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); - // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode: Roots) { - // Initiate code generation for root DFNode - CGTVisitor->visit(rootNode); - } + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } - CGTVisitor->writeKernelsModule(); + CGTVisitor->writeKernelsModule(); - //TODO: Edit module epilogue to remove the VISC intrinsic declarations - delete CGTVisitor; + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; - return true; + return true; } std::string CGT_NVPTX::getKernelsModuleName(Module &M) { - /*SmallString<128> currentDir; - llvm::sys::fs::current_path(currentDir); - std::string fileName = getFilenameFromModule(M); - Twine output = Twine(currentDir) + "/Output/" + fileName + ""; - return output.str().append(".kernels.ll");*/ - std::string mid = M.getModuleIdentifier(); - return mid.append(".kernels.ll"); + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); } void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) { - assert(isa<PointerType>(V->getType()) - && "Value should be of Pointer Type!"); - PointerType* OldTy = cast<PointerType>(V->getType()); - PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); - V->mutateType(NewTy); - for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { - // Change all uses producing pointer type in same address space to new - // addressspace. - if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { - if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { - fixValueAddrspace(*ui, addrspace); - } - } - } + assert(isa<PointerType>(V->getType()) + && "Value should be of Pointer Type!"); + PointerType* OldTy = cast<PointerType>(V->getType()); + PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. + if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { + if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); + } + } + } } std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) { - std::vector<unsigned> ConstantMemArgs; - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Argument* arg = &*ai; - std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(), - GlobalMemArgs->end(), arg->getArgNo()); - // It has to be a global memory argument to be promotable - if(pos == GlobalMemArgs->end()) - continue; - - // Check if it can/should be promoted - if(canBePromoted(arg, F)) { - errs() << "Promoting << " << arg->getName() << " to constant memory."<< "\n"; - ConstantMemArgs.push_back(arg->getArgNo()); - GlobalMemArgs->erase(pos); - } - } - return ConstantMemArgs; + std::vector<unsigned> ConstantMemArgs; + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Argument* arg = &*ai; + std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(), + GlobalMemArgs->end(), arg->getArgNo()); + // It has to be a global memory argument to be promotable + if(pos == GlobalMemArgs->end()) + continue; + + // Check if it can/should be promoted + if(canBePromoted(arg, F)) { + errs() << "Promoting << " << arg->getName() << " to constant memory."<< "\n"; + ConstantMemArgs.push_back(arg->getArgNo()); + GlobalMemArgs->erase(pos); + } + } + return ConstantMemArgs; } Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { - unsigned idx = 0; - std::vector<Type*> ArgTypes; - for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - Argument *arg = &*ai; - DEBUG(errs() << *arg << "\n"); - unsigned argno = arg->getArgNo(); - if ((idx < Args.size()) && (argno == Args[idx])) { - fixValueAddrspace(arg, addrspace); - idx++; - } - ArgTypes.push_back(arg->getType()); - } - FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); - - //F->mutateType(PTy); - Function* newF = cloneFunction(F, newFT, false); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - - DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); - return newF; + unsigned idx = 0; + std::vector<Type*> ArgTypes; + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Argument *arg = &*ai; + DEBUG(errs() << *arg << "\n"); + unsigned argno = arg->getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(arg, addrspace); + idx++; + } + ArgTypes.push_back(arg->getType()); + } + FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); + + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + + DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); + return newF; } /* Add metadata to module KernelM, for OpenCL kernels */ void CGT_NVPTX::addCLMetadata(Function *F) { - IRBuilder<> Builder(&*F->begin()); + IRBuilder<> Builder(&*F->begin()); - SmallVector<Metadata*,8> KernelMD; - KernelMD.push_back(ValueAsMetadata::get(F)); + SmallVector<Metadata*,8> KernelMD; + KernelMD.push_back(ValueAsMetadata::get(F)); - // TODO: There is additional metadata used by kernel files but we skip them as - // they are not mandatory. In future they might be useful to enable - // optimizations + // TODO: There is additional metadata used by kernel files but we skip them as + // they are not mandatory. In future they might be useful to enable + // optimizations - MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); - MDN_kernels->addOperand(MDKernelNode); + MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); - KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); - // TODO: Replace 1 with the number of the kernel. - // Add when support for multiple launces is added - KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1))); - MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); - MDN_annotations->addOperand(MDNvvmAnnotationsNode); + KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); + // TODO: Replace 1 with the number of the kernel. + // Add when support for multiple launces is added + KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1))); + MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); + MDN_annotations->addOperand(MDNvvmAnnotationsNode); } void CGT_NVPTX::writeKernelsModule() { - // In addition to deleting all other functions, we also want to spiff it - // up a little bit. Do this now. - legacy::PassManager Passes; + // In addition to deleting all other functions, we also want to spiff it + // up a little bit. Do this now. + legacy::PassManager Passes; - errs() << "Writing to File --- "; - errs() << getKernelsModuleName(M).c_str() << "\n"; - std::error_code EC; - ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); - if (EC) { - errs() << EC.message() << '\n'; - } + errs() << "Writing to File --- "; + errs() << getKernelsModuleName(M).c_str() << "\n"; + std::error_code EC; + ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); + if (EC) { + errs() << EC.message() << '\n'; + } - Passes.add( - createPrintModulePass(Out.os())); + Passes.add( + createPrintModulePass(Out.os())); - Passes.run(*KernelM); + Passes.run(*KernelM); - // Declare success. - Out.keep(); + // Declare success. + Out.keep(); } Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { - DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); - // FIXME: Maybe do that using the Node? - StructType* FRetTy = dyn_cast<StructType>(F->getReturnType()); - assert(FRetTy && "Return Type must always be a struct"); - - // Keeps return statements, because we will need to replace them - std::vector<ReturnInst *> RItoRemove; - findReturnInst(F, RItoRemove); - - - // Check for { } return struct, which means that the function returns void - if (FRetTy->isEmptyTy()) { - - DEBUG(errs() << "\tFunction output struct is void\n"); - DEBUG(errs() << "\tNo parameters added\n"); - - // Replacing return statements with others returning void - for (auto *RI : RItoRemove) { - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - } - DEBUG(errs() << "\tChanged return statements to return void\n"); - } - else { - // The struct has return values, thus needs to be converted to parameter - - // Iterate over all element types of return struct and add arguments to the - // function - std::vector<Argument*> Args; - for (unsigned i=0; i<FRetTy->getNumElements(); i++) { - Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); - Args.push_back(RetArg); - DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); - } - - DEBUG(errs() << "\tReplacing Return statements\n"); - // Replace return statements with extractValue and store instructions - for (auto *RI : RItoRemove) { - Value* RetVal = RI->getReturnValue(); - for(unsigned i = 0; i < Args.size(); i++) { - ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), - Args[i]->getName()+".val", RI); - new StoreInst(EI, Args[i], RI); - } - // assert(RetVal && "Return value should not be null at this point"); - // StructType* RetType = cast<StructType>(RetVal->getType()); - // assert(RetType && "Return type is not a struct"); - - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - - } - } - DEBUG(errs() << "\tReplaced return statements\n"); - - // Create the argument type list with the added argument's type - std::vector<Type*> ArgTypes; - for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - - // Adding new arguments to the function argument list, would not change the - // function type. We need to change the type of this function to reflect the - // added arguments - Type* VoidRetType = Type::getVoidTy(F->getContext()); - FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); - - // Change the function type - //F->mutateType(PTy); - Function* newF = cloneFunction(F, newFT, false); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - //F->eraseFromParent(); - return newF; + DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); + // FIXME: Maybe do that using the Node? + StructType* FRetTy = dyn_cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); + + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); + + std::vector<Type *> RetArgTypes; + std::vector<Argument*> RetArgs; + std::vector<Argument*> Args; + // Check for { } return struct, which means that the function returns void + if (FRetTy->isEmptyTy()) { + + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); + + // Replacing return statements with others returning void + for (auto *RI : RItoRemove) { + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + } + DEBUG(errs() << "\tChanged return statements to return void\n"); + } + else { + // The struct has return values, thus needs to be converted to parameter + + // Iterate over all element types of return struct and add arguments to the + // function + for (unsigned i=0; i<FRetTy->getNumElements(); i++) { + Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); + RetArgs.push_back(RetArg); + RetArgTypes.push_back(RetArg->getType()); + DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); + } + + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (auto *RI : RItoRemove) { + Value* RetVal = RI->getReturnValue(); + for(unsigned i = 0; i < RetArgs.size(); i++) { + ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), + RetArgs[i]->getName()+".val", RI); + new StoreInst(EI, RetArgs[i], RI); + } + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + + } + } + DEBUG(errs() << "\tReplaced return statements\n"); + + // Create the argument type list with the added argument's type + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + for(auto *RATy: RetArgTypes) { + ArgTypes.push_back(RATy); + } + + // Creating Args vector to use in cloning! + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Args.push_back(&*ai); + } + for(auto *ai : RetArgs) { + Args.push_back(ai); + } + + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + Type* VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + + // Change the function type + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false, NULL, &Args); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + //F->eraseFromParent(); + return newF; } /****************************************************************************** @@ -1771,334 +2141,333 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { // 2. Loads not dependent on getNodeInstanceID itrinsic static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) { - if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - VisitedList->push_back(V); - for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); - ui != ue; ++ui) { - Instruction* I = dyn_cast<Instruction>(*ui); - if(!I) { - // if use is not an instruction, then skip it - continue; - } - DEBUG(errs() << "\t" << *I << "\n"); - if(isa<LoadInst>(I)) { - DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); - DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); - UseList->push_back(V); - } - else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { - // found a store in use chain - DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); - return true; - } - else if(BuildDFG::isViscIntrinsic(I)) { - // If it is an atomic intrinsic, we found a store - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic") - && "Only visc atomic intrinsics can have an argument as input"); - return true; - } - else { - DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); - if(findLoadStoreUses(I, UseList, VisitedList)) - return true; - } - } - return false; + if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + VisitedList->push_back(V); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); + ui != ue; ++ui) { + Instruction* I = dyn_cast<Instruction>(*ui); + if(!I) { + // if use is not an instruction, then skip it + continue; + } + DEBUG(errs() << "\t" << *I << "\n"); + if(isa<LoadInst>(I)) { + DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); + DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); + UseList->push_back(V); + } + else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { + // found a store in use chain + DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); + return true; + } + else if(BuildDFG::isViscIntrinsic(I)) { + // If it is an atomic intrinsic, we found a store + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic") + && "Only visc atomic intrinsics can have an argument as input"); + return true; + } + else { + DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); + if(findLoadStoreUses(I, UseList, VisitedList)) + return true; + } + } + return false; } static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) { - if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - DependenceList->push_back(V); - // If not an instruction, then not dependent on node instance id - if(!isa<Instruction>(V) || isa<Constant>(V)) { - DEBUG(errs() << "\tStop\n"); - return false; - } - - Instruction* I = cast<Instruction>(V); - for(unsigned i = 0; i < I->getNumOperands(); i++) { - Value* operand = I->getOperand(i); - if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) { - if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x - || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y - || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { - Value* Node = II->getArgOperand(0); - IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node); - assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n"); - if(GN->getIntrinsicID() == Intrinsic::visc_getNode) { - DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n"); - return true; - } - } - } - if(CmpInst* CI = dyn_cast<CmpInst>(operand)) { - DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n"); - continue; - } - DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n"); - if(isDependentOnNodeInstanceID(operand, DependenceList)) { - return true; - } - } - return false; + if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + DependenceList->push_back(V); + // If not an instruction, then not dependent on node instance id + if(!isa<Instruction>(V) || isa<Constant>(V)) { + DEBUG(errs() << "\tStop\n"); + return false; + } + + Instruction* I = cast<Instruction>(V); + for(unsigned i = 0; i < I->getNumOperands(); i++) { + Value* operand = I->getOperand(i); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) { + if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { + Value* Node = II->getArgOperand(0); + IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node); + assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n"); + if(GN->getIntrinsicID() == Intrinsic::visc_getNode) { + DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n"); + return true; + } + } + } + if(CmpInst* CI = dyn_cast<CmpInst>(operand)) { + DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n"); + continue; + } + DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n"); + if(isDependentOnNodeInstanceID(operand, DependenceList)) { + return true; + } + } + return false; } // Function to check if argument arg can be changed to a constant memory pointer static bool canBePromoted(Argument* arg, Function* F) { - DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); - std::vector<Value*> UseList; - std::vector<Value*> VisitedList; - // recursively traverse use chain - // if find a store instruction return false, everything fails, cannot be - // promoted - // if find a load instruction as use, add the GEP instruction to list - bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); - if(foundStore == true) - return false; - // See that the GEP instructions are not dependent on getNodeInstanceID - // intrinsic - DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n"); - std::vector<Value*>DependenceList; - for(auto U: UseList) { - if(isDependentOnNodeInstanceID(U, &DependenceList)) - return false; - } - DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); - return true; + DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); + std::vector<Value*> UseList; + std::vector<Value*> VisitedList; + // recursively traverse use chain + // if find a store instruction return false, everything fails, cannot be + // promoted + // if find a load instruction as use, add the GEP instruction to list + bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); + if(foundStore == true) + return false; + // See that the GEP instructions are not dependent on getNodeInstanceID + // intrinsic + DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n"); + std::vector<Value*>DependenceList; + for(auto U: UseList) { + if(isDependentOnNodeInstanceID(U, &DependenceList)) + return false; + } + DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); + return true; } // Calculate execute node parameters which include, number of diemnsions for // dynamic instances of the kernel, local and global work group sizes. static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value* - &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { - - // Assign number of dimenstions a constant value - workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); - - // If local work group size if null - if(!kernel->hasLocalWG()) { - LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); - } - else { - for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { - if(isa<Argument>(kernel->localWGSize[i])) - kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; - } - LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); - } - - for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { - if(isa<Argument>(kernel->globalWGSize[i])) - kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; - } - - // For OpenCL, global work group size is the total bumber of instances in each - // dimension. So, multiply local and global dim limits. - std::vector<Value*> globalWGSizeInsts; - if(kernel->hasLocalWG()) { - for (unsigned i = 0; i < kernel->gridDim; i++) { - BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); - globalWGSizeInsts.push_back(MulInst); - } - } - else { - globalWGSizeInsts = kernel->globalWGSize; - } - GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); - DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); + &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { + + // Assign number of dimenstions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); + + // If local work group size if null + if(!kernel->hasLocalWG()) { + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); + } + else { + for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if(isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } + LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); + } + + for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if(isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } + + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. + std::vector<Value*> globalWGSizeInsts; + if(kernel->hasLocalWG()) { + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + } + else { + globalWGSizeInsts = kernel->globalWGSize; + } + GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); } // CodeGen for allocating space for Work Group on stack and returning a pointer // to its address static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { - Value* WGPtr; - // Get int64_t and or ease of use - Type* Int64Ty = Type::getInt64Ty(M.getContext()); - - // Work Group type is [#dim x i64] - Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); - // Allocate space of Global work group data on stack and get pointer to - // first element. - AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB); - WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); - Value* nextDim = WGPtr; - DEBUG(errs() << *WGPtr << "\n"); - - // Iterate over the number of dimensions and store the global work group - // size in that dimension - for(unsigned i=0; i < WGSize.size(); i++) { - DEBUG(errs() << *WGSize[i] << "\n"); - assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); - - if(WGSize[i]->getType() != Int64Ty) { - // If number of dimensions are mentioned in any other integer format, - // generate code to extend it to i64. We need to use the mapped value in - // the new generated function, hence the use of VMap - // FIXME: Why are we changing the kernel WGSize vector here? - DEBUG(errs() << "Not i64. Zero extend required.\n"); - DEBUG(errs() << *WGSize[i] << "\n"); - CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); - DEBUG(errs() << "Bitcast done.\n"); - StoreInst* SI = new StoreInst(CI, nextDim, IB); - DEBUG(errs() << "Zero extend done.\n"); - DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); - } else { - // Store the value representing work group size in ith dimension on - // stack - StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); - - DEBUG(errs() << "\t Work group size: " << *SI << "\n"); - } - if(i+1 < WGSize.size()) { - // Move to next dimension - GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, - ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), - WG->getName()+"."+Twine(i+1), - IB); - DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); - nextDim = GEP; - } - } - return WGPtr; + Value* WGPtr; + // Get int64_t and or ease of use + Type* Int64Ty = Type::getInt64Ty(M.getContext()); + + // Work Group type is [#dim x i64] + Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); + // Allocate space of Global work group data on stack and get pointer to + // first element. + AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB); + WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); + Value* nextDim = WGPtr; + DEBUG(errs() << *WGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for(unsigned i=0; i < WGSize.size(); i++) { + DEBUG(errs() << *WGSize[i] << "\n"); + assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + + if(WGSize[i]->getType() != Int64Ty) { + // If number of dimensions are mentioned in any other integer format, + // generate code to extend it to i64. We need to use the mapped value in + // the new generated function, hence the use of VMap + // FIXME: Why are we changing the kernel WGSize vector here? + DEBUG(errs() << "Not i64. Zero extend required.\n"); + DEBUG(errs() << *WGSize[i] << "\n"); + CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); + DEBUG(errs() << "Bitcast done.\n"); + StoreInst* SI = new StoreInst(CI, nextDim, IB); + DEBUG(errs() << "Zero extend done.\n"); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); + + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if(i+1 < WGSize.size()) { + // Move to next dimension + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, + ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), + WG->getName()+"."+Twine(i+1), + IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } + } + return WGPtr; } // Get generated PTX binary name static std::string getPTXFilename(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - moduleID.append(".kernels.cl"); - return moduleID; + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".kernels.cl"); + return moduleID; } // Get the name of the input file from module ID static std::string getFilenameFromModule(const Module& M) { - std::string moduleID = M.getModuleIdentifier(); - return moduleID.substr(moduleID.find_last_of("/")+1); + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/")+1); } // Changes the data layout of the Module to be compiled with NVPTX backend // TODO: Figure out when to call it, probably after duplicating the modules static void changeDataLayout(Module &M) { - std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; - std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; - if (TARGET_PTX == 32) - M.setDataLayout(StringRef(nvptx32_layoutStr)); - else if (TARGET_PTX == 64) - M.setDataLayout(StringRef(nvptx64_layoutStr)); - else assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setDataLayout(StringRef(nvptx32_layoutStr)); + else if (TARGET_PTX == 64) + M.setDataLayout(StringRef(nvptx64_layoutStr)); + else assert(false && "Invalid PTX target"); - return; + return; } static void changeTargetTriple(Module &M) { - std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; - std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; + std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; + std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; - if (TARGET_PTX == 32) - M.setTargetTriple(StringRef(nvptx32_TargetTriple)); - else if (TARGET_PTX == 64) - M.setTargetTriple(StringRef(nvptx64_TargetTriple)); - else assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setTargetTriple(StringRef(nvptx32_TargetTriple)); + else if (TARGET_PTX == 64) + M.setTargetTriple(StringRef(nvptx64_TargetTriple)); + else assert(false && "Invalid PTX target"); - return; + return; } // Helper function, populate a vector with all return statements in a function static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { - for (auto &BB : *F) { - if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) - ReturnInstVec.push_back(RI); - } + for (auto &BB : *F) { + if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) + ReturnInstVec.push_back(RI); + } } // Helper function, populate a vector with all IntrinsicID intrinsics in a function static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) { - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - Instruction *I = &(*i); - IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); - if (II && II->getIntrinsicID() == IntrinsicID) { - IntrinsicInstVec.push_back(II); - } - } + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == IntrinsicID) { + IntrinsicInstVec.push_back(II); + } + } } // Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) { - switch(ID) { - case Intrinsic::visc_atomic_add: - return AtomicRMWInst::Add; - case Intrinsic::visc_atomic_sub: - return AtomicRMWInst::Sub; - case Intrinsic::visc_atomic_min: - return AtomicRMWInst::Min; - case Intrinsic::visc_atomic_umin: - return AtomicRMWInst::UMin; - case Intrinsic::visc_atomic_max: - return AtomicRMWInst::Max; - case Intrinsic::visc_atomic_umax: - return AtomicRMWInst::UMax; - //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc; - //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec; - case Intrinsic::visc_atomic_xchg: - return AtomicRMWInst::Xchg; - case Intrinsic::visc_atomic_and: - return AtomicRMWInst::And; - case Intrinsic::visc_atomic_or: - return AtomicRMWInst::Or; - case Intrinsic::visc_atomic_xor: - return AtomicRMWInst::Xor; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch(ID) { + case Intrinsic::visc_atomic_add: + return AtomicRMWInst::Add; + case Intrinsic::visc_atomic_sub: + return AtomicRMWInst::Sub; + case Intrinsic::visc_atomic_min: + return AtomicRMWInst::Min; + case Intrinsic::visc_atomic_umin: + return AtomicRMWInst::UMin; + case Intrinsic::visc_atomic_max: + return AtomicRMWInst::Max; + case Intrinsic::visc_atomic_umax: + return AtomicRMWInst::UMax; + //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc; + //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec; + case Intrinsic::visc_atomic_xchg: + return AtomicRMWInst::Xchg; + case Intrinsic::visc_atomic_and: + return AtomicRMWInst::And; + case Intrinsic::visc_atomic_or: + return AtomicRMWInst::Or; + case Intrinsic::visc_atomic_xor: + return AtomicRMWInst::Xor; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } // Helper funtion, returns the OpenCL function name, corresponding to atomic op static std::string getAtomicOpName(Intrinsic::ID ID) { - switch(ID) { - case Intrinsic::visc_atomic_cmpxchg: - return "atom_cmpxchg"; - case Intrinsic::visc_atomic_add: - return "atom_add"; - case Intrinsic::visc_atomic_sub: - return "atom_sub"; - case Intrinsic::visc_atomic_min: - return "atom_min"; - case Intrinsic::visc_atomic_max: - return "atom_max"; - case Intrinsic::visc_atomic_inc: - return "atom_inc"; - case Intrinsic::visc_atomic_dec: - return "atom_dec"; - case Intrinsic::visc_atomic_xchg: - return "atom_xchg"; - case Intrinsic::visc_atomic_and: - return "atom_and"; - case Intrinsic::visc_atomic_or: - return "atom_or"; - case Intrinsic::visc_atomic_xor: - return "atom_xor"; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; + switch(ID) { + case Intrinsic::visc_atomic_cmpxchg: + return "atom_cmpxchg"; + case Intrinsic::visc_atomic_add: + return "atom_add"; + case Intrinsic::visc_atomic_sub: + return "atom_sub"; + case Intrinsic::visc_atomic_min: + return "atom_min"; + case Intrinsic::visc_atomic_max: + return "atom_max"; + case Intrinsic::visc_atomic_inc: + return "atom_inc"; + case Intrinsic::visc_atomic_dec: + return "atom_dec"; + case Intrinsic::visc_atomic_xchg: + return "atom_xchg"; + case Intrinsic::visc_atomic_and: + return "atom_and"; + case Intrinsic::visc_atomic_or: + return "atom_or"; + case Intrinsic::visc_atomic_xor: + return "atom_xor"; + default: + llvm_unreachable("Unsupported atomic intrinsic!"); + }; } } // End of namespace char DFG2LLVM_NVPTX::ID = 0; static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx", - "Dataflow Graph to LLVM for NVPTX Pass", - false /* does not modify the CFG */, - true /* transformation, * - * not just analysis */); - + "Dataflow Graph to LLVM for NVPTX Pass", + false /* does not modify the CFG */, + true /* transformation, * + * not just analysis */); -- GitLab