From 9b7e9cf320731dea06919412a3120bae6d7fe09b Mon Sep 17 00:00:00 2001 From: Akash Kothari <akashk4@tyler.cs.illinois.edu> Date: Wed, 22 Jan 2020 11:59:53 -0600 Subject: [PATCH] Removed support for unused intrinsics --- .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp | 2906 ++++++++--------- hpvm/lib/Transforms/GenVISC/GenVISC.cpp | 818 +++-- 2 files changed, 1771 insertions(+), 1953 deletions(-) diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 5f861eaf1e..84d17b2657 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -15,28 +15,29 @@ #define SHARED_ADDRSPACE 3 #define DEBUG_TYPE "DFG2LLVM_NVPTX" -#include "SupportVISC/DFG2LLVM.h" -#include "SupportVISC/VISCTimer.h" -#include "SupportVISC/VISCUtils.h" -#include "llvm-c/Core.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Linker/Linker.h" #include "llvm/Pass.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" +#include "llvm-c/Core.h" +#include "SupportVISC/VISCTimer.h" +#include "SupportVISC/DFG2LLVM.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/UseListOrder.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/IR/UseListOrder.h" + #include <sstream> @@ -46,8 +47,8 @@ using namespace dfg2llvm; using namespace viscUtils; // VISC Command line option to use timer or not -static cl::opt<bool> VISCTimer_NVPTX("visc-timers-ptx", - cl::desc("Enable visc timers")); +static cl::opt<bool> +VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers")); namespace { // Helper class declarations @@ -56,88 +57,94 @@ namespace { // in bytes. Would have preferred to use tuple but support not yet available class OutputPtr { public: - OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes) - : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} - Value *h_ptr; - Value *d_ptr; - Value *bytes; + Value* h_ptr; + Value* d_ptr; + Value* bytes; }; // Class to maintain important kernel info required for generating runtime // calls class Kernel { public: - Kernel( - Function *_KF, DFLeafNode *_KLeafNode, - std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(), - std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap = - std::map<unsigned, std::pair<Value *, unsigned>>(), - std::vector<unsigned> _outArgMap = std::vector<unsigned>(), - unsigned _gridDim = 0, - std::vector<Value *> _globalWGSize = std::vector<Value *>(), - unsigned _blockDim = 0, - std::vector<Value *> _localWGSize = std::vector<Value *>()) - : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), - sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), - gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim), - localWGSize(_localWGSize) { - - assert(gridDim == globalWGSize.size() && - "gridDim should be same as the size of vector globalWGSize"); - assert(blockDim == localWGSize.size() && - "blockDim should be same as the size of vector localWGSize"); + Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap = + std::map<unsigned, unsigned>(), + std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap = + std::map<unsigned, std::pair<Value*, unsigned> >(), + std::vector<unsigned> _outArgMap = std::vector<unsigned>(), + unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(), + unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>()) + : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap), + sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim), + globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) { + + assert(gridDim == globalWGSize.size() + && "gridDim should be same as the size of vector globalWGSize"); + assert(blockDim == localWGSize.size() + && "blockDim should be same as the size of vector localWGSize"); } - Function *KernelFunction; - DFLeafNode *KernelLeafNode; + Function* KernelFunction; + DFLeafNode* KernelLeafNode; std::map<unsigned, unsigned> inArgMap; // Map for shared memory arguments - std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap; + std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap; // Fields for (potential) allocation node - DFLeafNode *AllocationNode; - Function *AllocationFunction; + DFLeafNode* AllocationNode; + Function* AllocationFunction; std::map<unsigned, unsigned> allocInArgMap; std::vector<unsigned> outArgMap; unsigned gridDim; - std::vector<Value *> globalWGSize; + std::vector<Value*> globalWGSize; unsigned blockDim; - std::vector<Value *> localWGSize; + std::vector<Value*> localWGSize; std::vector<int> localDimMap; - std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; } - void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; } + std::map<unsigned, unsigned> &getInArgMap() { + return inArgMap; + } + void setInArgMap(std::map<unsigned, unsigned> map) { + inArgMap = map; + } - std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() { + std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() { return sharedInArgMap; } - void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) { + void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) { sharedInArgMap = map; } - std::vector<unsigned> &getOutArgMap() { return outArgMap; } - void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; } + std::vector<unsigned> &getOutArgMap() { + return outArgMap; + } + void setOutArgMap(std::vector<unsigned> map) { + outArgMap = map; + } - void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; } + void setLocalWGSize(std::vector<Value*> V) { + localWGSize = V; + } - bool hasLocalWG() const { return blockDim != 0; } + bool hasLocalWG() const { + return blockDim != 0; + } }; // Helper function declarations -static bool canBePromoted(Argument *arg, Function *F); -static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&, - Kernel *, ValueToValueMapTy &, Instruction *); -static Value *genWorkGroupPtr(Module &M, std::vector<Value *>, - ValueToValueMapTy &, Instruction *, - const Twine &WGName = "WGSize"); -static std::string getPTXFilename(const Module &); -static std::string getFilenameFromModule(const Module &M); +static bool canBePromoted(Argument* arg, Function* F); +static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*, + ValueToValueMapTy&, Instruction*); +static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&, + Instruction*, const Twine& WGName = "WGSize"); +static std::string getPTXFilename(const Module&); +static std::string getFilenameFromModule(const Module& M); static void changeDataLayout(Module &); static void changeTargetTriple(Module &); static void findReturnInst(Function *, std::vector<ReturnInst *> &); -static void findIntrinsicInst(Function *, Intrinsic::ID, - std::vector<IntrinsicInst *> &); +static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &); static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID); static std::string getAtomicOpName(Intrinsic::ID); @@ -147,6 +154,7 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM { DFG2LLVM_NVPTX() : DFG2LLVM(ID) {} private: + public: bool runOnModule(Module &M); }; @@ -155,10 +163,10 @@ public: class CGT_NVPTX : public CodeGenTraversal { private: - // Member variables + //Member variables std::unique_ptr<Module> KernelM; - DFNode *KernelLaunchNode = NULL; - Kernel *kernel; + DFNode* KernelLaunchNode = NULL; + Kernel* kernel; // VISC Runtime API FunctionCallee llvm_visc_ocl_launch; @@ -173,16 +181,14 @@ private: FunctionCallee llvm_visc_ocl_getOutput; FunctionCallee llvm_visc_ocl_executeNode; - // Functions + //Functions std::string getKernelsModuleName(Module &M); - void fixValueAddrspace(Value *V, unsigned addrspace); - std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *, - Function *); - Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags, - unsigned i); - void addCLMetadata(Function *F); - Function *transformFunctionToVoid(Function *F); - void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName); + void fixValueAddrspace(Value* V, unsigned addrspace); + std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*); + Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i); + void addCLMetadata(Function* F); + Function* transformFunctionToVoid(Function* F); + void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName); // Virtual Functions void init() { @@ -190,25 +196,24 @@ private: TargetName = "NVPTX"; } void initRuntimeAPI(); - void codeGen(DFInternalNode *N); - void codeGen(DFLeafNode *N); + void codeGen(DFInternalNode* N); + void codeGen(DFLeafNode* N); public: + // Constructor - CGT_NVPTX(Module &_M, BuildDFG &_DFG) - : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { + CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) { init(); initRuntimeAPI(); - DEBUG(errs() << "Old module pointer: " << &_M << "\n"); - DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n"); + errs() << "Old module pointer: " << &_M << "\n"; + errs() << "New module pointer: " << KernelM.get() << "\n"; - // Copying instead of creating new, in order to preserve required info - // (metadata) Remove functions, global variables and aliases - std::vector<GlobalVariable *> GVVect; + // Copying instead of creating new, in order to preserve required info (metadata) + // Remove functions, global variables and aliases + std::vector<GlobalVariable*> GVVect; for (Module::global_iterator mi = KernelM->global_begin(), - me = KernelM->global_end(); - (mi != me); ++mi) { - GlobalVariable *GV = &*mi; + me = KernelM->global_end(); (mi != me); ++mi) { + GlobalVariable* GV = &*mi; GVVect.push_back(GV); } for (auto *GV : GVVect) { @@ -216,10 +221,10 @@ public: GV->eraseFromParent(); } - std::vector<Function *> FuncVect; - for (Module::iterator mi = KernelM->begin(), me = KernelM->end(); - (mi != me); ++mi) { - Function *F = &*mi; + std::vector<Function*> FuncVect; + for (Module::iterator mi = KernelM->begin(), + me = KernelM->end(); (mi != me); ++mi) { + Function* F = &*mi; FuncVect.push_back(F); } for (auto *F : FuncVect) { @@ -227,11 +232,10 @@ public: F->eraseFromParent(); } - std::vector<GlobalAlias *> GAVect; + std::vector<GlobalAlias*> GAVect; for (Module::alias_iterator mi = KernelM->alias_begin(), - me = KernelM->alias_end(); - (mi != me); ++mi) { - GlobalAlias *GA = &*mi; + me = KernelM->alias_end(); (mi != me); ++mi) { + GlobalAlias* GA = &*mi; GAVect.push_back(GA); } for (auto *GA : GAVect) { @@ -242,7 +246,9 @@ public: changeDataLayout(*KernelM); changeTargetTriple(*KernelM); + DEBUG(errs() << *KernelM); + } void writeKernelsModule(); @@ -254,14 +260,14 @@ void CGT_NVPTX::initRuntimeAPI() { // Load Runtime API Module SMDiagnostic Err; - char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll"; runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - if (runtimeModule == nullptr) + if(runtimeModule == nullptr) DEBUG(errs() << Err.getMessage()); else DEBUG(errs() << "Successfully loaded visc-rt API module\n"); @@ -284,25 +290,27 @@ void CGT_NVPTX::initRuntimeAPI() { // Insert init context in main DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n"); - Function *VI = M.getFunction("llvm.visc.init"); + Function* VI = M.getFunction("llvm.visc.init"); assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); InitCall = cast<Instruction>(*VI->user_begin()); initializeTimerSet(InitCall); switchToTimer(visc_TimerID_INIT_CTX, InitCall); CallInst::Create(llvm_visc_ocl_initContext, - ArrayRef<Value *>(getTargetID(M, visc::GPU_TARGET)), "", - InitCall); + ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)), + "", InitCall); switchToTimer(visc_TimerID_NONE, InitCall); // Insert print instruction at visc exit DEBUG(errs() << "Gen Code to print NVPTX Timer\n"); - Function *VC = M.getFunction("llvm.visc.cleanup"); + Function* VC = M.getFunction("llvm.visc.cleanup"); DEBUG(errs() << *VC << "\n"); assert(VC->getNumUses() == 1 && "__visc__clear should only be used once"); CleanupCall = cast<Instruction>(*VC->user_begin()); printTimerSet(CleanupCall); + + } // Generate Code to call the kernel @@ -310,37 +318,36 @@ void CGT_NVPTX::initRuntimeAPI() { // used to generate a function to associate with this leaf node. The function // is responsible for all the memory allocation/transfer and invoking the // kernel call on the device -void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, - const Twine &FileName) { +void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) { // Check if clone already exists. If it does, it means we have visited this // function before. - // assert(N->getGenFunc() == NULL && "Code already generated for this node"); +// assert(N->getGenFunc() == NULL && "Code already generated for this node"); assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL && "Code already generated for this node"); // Useful values - Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); - Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); // If kernel struct has not been initialized with kernel function, then fail assert(K != NULL && "No kernel found!!"); DEBUG(errs() << "Generating kernel call code\n"); - Function *F = N->getFuncPointer(); + Function* F = N->getFuncPointer(); + // Create of clone of F with no instructions. Only the type is the same as F // without the extra arguments. - Function *F_X86; + Function* F_X86; // Clone the function, if we are seeing this function for the first time. We // only need a clone in terms of type. ValueToValueMapTy VMap; // Create new function with the same type - F_X86 = - Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); + F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M); // Loop over the arguments, copying the names of arguments over. Function::arg_iterator dest_iterator = F_X86->arg_begin(); @@ -353,25 +360,26 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, // Add a basic block to this empty function BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86); - ReturnInst *RI = ReturnInst::Create( - M.getContext(), UndefValue::get(F_X86->getReturnType()), BB); + ReturnInst* RI = ReturnInst::Create(M.getContext(), + UndefValue::get(F_X86->getReturnType()), BB); // FIXME: Adding Index and Dim arguments are probably not required except // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do // have those arguments) // Add Index and Dim arguments except for the root node - if (!N->isRoot() && !N->getParent()->isChildGraphStreaming()) + if(!N->isRoot() && !N->getParent()->isChildGraphStreaming()) F_X86 = addIdxDimArgs(F_X86); BB = &*F_X86->begin(); RI = cast<ReturnInst>(BB->getTerminator()); - // Add the generated function info to DFNode - // N->setGenFunc(F_X86, visc::CPU_TARGET); + //Add the generated function info to DFNode +// N->setGenFunc(F_X86, visc::CPU_TARGET); N->addGenFunc(F_X86, visc::GPU_TARGET, true); - DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " - << N->getFuncPointer()->getName() << "\n"); + errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node " + << N->getFuncPointer()->getName() << "\n"; + // Loop over the arguments, to create the VMap dest_iterator = F_X86->arg_begin(); @@ -404,53 +412,51 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, break; } - assert(C->isDummyNode() == false && "Internal Node only contains dummy - nodes!"); + assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!"); Function* CF = C->getFuncPointer(); */ - Function *KF = K->KernelLeafNode->getFuncPointer(); + Function* KF = K->KernelLeafNode->getFuncPointer(); // Initialize context - // DEBUG(errs() << "Initializing context" << "\n"); - // CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); + //DEBUG(errs() << "Initializing context" << "\n"); + //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI); - DEBUG(errs() << "Initializing commandQ" - << "\n"); + DEBUG(errs() << "Initializing commandQ" << "\n"); // Initialize command queue switchToTimer(visc_TimerID_SETUP, InitCall); - Value *fileStr = getStringPointer(FileName, InitCall, "Filename"); + Value* fileStr = getStringPointer(FileName, InitCall, "Filename"); DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n"); - DEBUG(errs() << "Generating code for kernel - " - << K->KernelFunction->getName() << "\n"); - Value *kernelStr = - getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName"); - - Value *LaunchInstArgs[] = {fileStr, kernelStr}; - - DEBUG(errs() << "Inserting launch call" - << "\n"); - CallInst *NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, - ArrayRef<Value *>(LaunchInstArgs, 2), - "graph" + KF->getName(), InitCall); + DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n"); + Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName"); + + Value* LaunchInstArgs[] = {fileStr, kernelStr}; + + DEBUG(errs() << "Inserting launch call" << "\n"); + CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch, + ArrayRef<Value*>(LaunchInstArgs, 2), + "graph"+KF->getName(), + InitCall); DEBUG(errs() << *NVPTX_Ctx << "\n"); - GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false, + GraphIDAddr = new GlobalVariable(M, + NVPTX_Ctx->getType(), + false, GlobalValue::CommonLinkage, Constant::getNullValue(NVPTX_Ctx->getType()), - "graph" + KF->getName() + ".addr"); + "graph"+KF->getName()+".addr"); DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n"); - StoreInst *SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); + StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall); DEBUG(errs() << *SI << "\n"); switchToTimer(visc_TimerID_NONE, InitCall); switchToTimer(visc_TimerID_SETUP, RI); - Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI); + Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI); // Iterate over the required input edges of the node and use the visc-rt API // to set inputs DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); std::vector<OutputPtr> OutputPointers; - // Vector to hold the device memory object that need to be cleared before we - // release context - std::vector<Value *> DevicePointers; + // Vector to hold the device memory object that need to be cleared before we release + // context + std::vector<Value*> DevicePointers; std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap(); /* @@ -462,134 +468,133 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, */ - for (auto &InArgMapPair : kernelInArgMap) { + for(auto &InArgMapPair : kernelInArgMap) { unsigned i = InArgMapPair.first; - Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second); - DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n"); + Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second); + DEBUG(errs() << "\tArgument "<< i<< " = " << *inputVal << "\n"); // input value has been obtained. // Check if input is a scalar value or a pointer operand // For scalar values such as int, float, etc. the size is simply the size of // type on target machine, but for pointers, the size of data would be the // next integer argument - if (inputVal->getType()->isPointerTy()) { + if(inputVal->getType()->isPointerTy()) { switchToTimer(visc_TimerID_COPY_PTR, RI); // Pointer Input // CheckAttribute - Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False; - Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) && - !(hasAttribute(KF, i, Attribute::In))) - ? False - : True; - - Argument *A = getArgumentAt(KF, i); - if (isOutput == True) { + Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False; + Value* isInput = ((hasAttribute(KF, i, Attribute::Out)) + && !(hasAttribute(KF, i, Attribute::In)))? False : True; + + Argument* A = getArgumentAt(KF, i); + if(isOutput == True) { DEBUG(errs() << *A << " is an OUTPUT argument\n"); } - if (isInput == True) { + if(isInput == True) { DEBUG(errs() << *A << " is an INPUT argument\n"); } - Value *inputValI8Ptr = CastInst::CreatePointerCast( - inputVal, Type::getInt8PtrTy(M.getContext()), - inputVal->getName() + ".i8ptr", RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); // Assert that the pointer argument size (next argument) is in the map - assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end()); - - Value *inputSize = getArgumentAt(F_X86, kernelInArgMap[i + 1]); - assert( - inputSize->getType() == Type::getInt64Ty(M.getContext()) && - "Pointer type input must always be followed by size (integer type)"); - Value *setInputArgs[] = { - GraphID, - inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), i), - inputSize, - isInput, - isOutput}; - Value *d_ptr = - CallInst::Create(llvm_visc_ocl_argument_ptr, - ArrayRef<Value *>(setInputArgs, 6), "", RI); + assert(kernelInArgMap.find(i+1) != kernelInArgMap.end()); + + Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]); + assert(inputSize->getType() == Type::getInt64Ty(M.getContext()) + && "Pointer type input must always be followed by size (integer type)"); + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + inputSize, + isInput, + isOutput + }; + Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr, + ArrayRef<Value*>(setInputArgs, 6), "", RI); DevicePointers.push_back(d_ptr); // If this has out attribute, store the returned device pointer in // memory to read device memory later - if (isOutput == True) - OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); - } else { + if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); + } + else { switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Scalar Input // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst *inputValPtr = new AllocaInst( - inputVal->getType(), 0, inputVal->getName() + ".ptr", RI); - StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI); - - Value *inputValI8Ptr = CastInst::CreatePointerCast( - inputValPtr, Type::getInt8PtrTy(M.getContext()), - inputVal->getName() + ".i8ptr", RI); - - Value *setInputArgs[] = { - GraphID, inputValI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), i), - ConstantExpr::getSizeOf(inputVal->getType())}; + AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI); + StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI); + + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr, + Type::getInt8PtrTy(M.getContext()), + inputVal->getName()+".i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + inputValI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),i), + ConstantExpr::getSizeOf(inputVal->getType()) + }; CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value *>(setInputArgs, 4), "", RI); + ArrayRef<Value*>(setInputArgs, 4), "", RI); } } - DEBUG( - errs() << "Setup shared memory arguments of node and insert visc api\n"); + DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n"); // Check to see if all the allocation sizes are constant (determined // statically) bool constSizes = true; - for (auto &e : K->getSharedInArgMap()) { + for (auto& e: K->getSharedInArgMap()) { constSizes &= isa<Constant>(e.second.first); } // If the sizes are all constant if (constSizes) { - for (auto &e : K->getSharedInArgMap()) { + for (auto& e: K->getSharedInArgMap()) { unsigned argNum = e.first; - Value *allocSize = e.second.first; + Value* allocSize = e.second.first; - DEBUG(errs() << "\tLocal Memory at " << argNum - << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position switchToTimer(visc_TimerID_COPY_SCALAR, RI); - assert(isa<Constant>(allocSize) && - "Constant shared memory size is expected"); + assert(isa<Constant>(allocSize) && "Constant shared memory size is expected"); - Value *setInputArgs[] = { - GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - allocSize}; + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; CallInst::Create(llvm_visc_ocl_argument_shared, - ArrayRef<Value *>(setInputArgs, 3), "", RI); - } else { + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { // Sharem memory size argument - scalar at address position switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst *allocSizePtr = - new AllocaInst(allocSize->getType(), 0, - allocSize->getName() + ".sharedMem.ptr", RI); - StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value *allocSizeI8Ptr = CastInst::CreatePointerCast( - allocSizePtr, Type::getInt8PtrTy(M.getContext()), - allocSize->getName() + ".sharedMem.i8ptr", RI); - - Value *setInputArgs[] = { - GraphID, allocSizeI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - ConstantExpr::getSizeOf(allocSize->getType())}; + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value *>(setInputArgs, 4), "", RI); + ArrayRef<Value*>(setInputArgs, 4), "", RI); } } } else { @@ -610,64 +615,68 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, ExtractValueInstVec.push_back(EI); } - for (auto &e : K->getSharedInArgMap()) { + for (auto& e: K->getSharedInArgMap()) { unsigned argNum = e.first; - Value *allocSize = ExtractValueInstVec[e.second.second / 2]; + Value* allocSize = ExtractValueInstVec[e.second.second/2]; - DEBUG(errs() << "\tLocal Memory at " << argNum - << ", size = " << *allocSize << "\n"); + DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = " << *allocSize << "\n"); if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) { // Shared memory ptr argument - scalar at size position switchToTimer(visc_TimerID_COPY_SCALAR, RI); - Value *setInputArgs[] = { - GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - allocSize}; + Value* setInputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + allocSize + }; CallInst::Create(llvm_visc_ocl_argument_shared, - ArrayRef<Value *>(setInputArgs, 3), "", RI); - } else { + ArrayRef<Value*>(setInputArgs, 3), "", RI); + } + else { // Sharem memory size argument - scalar at address position switchToTimer(visc_TimerID_COPY_SCALAR, RI); // Store the scalar value on stack and then pass the pointer to its // location - AllocaInst *allocSizePtr = - new AllocaInst(allocSize->getType(), 0, - allocSize->getName() + ".sharedMem.ptr", RI); - StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI); - - Value *allocSizeI8Ptr = CastInst::CreatePointerCast( - allocSizePtr, Type::getInt8PtrTy(M.getContext()), - allocSize->getName() + ".sharedMem.i8ptr", RI); - - Value *setInputArgs[] = { - GraphID, allocSizeI8Ptr, - ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum), - ConstantExpr::getSizeOf(allocSize->getType())}; + AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, + allocSize->getName()+".sharedMem.ptr", RI); + StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI); + + Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr, + Type::getInt8PtrTy(M.getContext()), + allocSize->getName()+".sharedMem.i8ptr", + RI); + + Value* setInputArgs[] = {GraphID, + allocSizeI8Ptr, + ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum), + ConstantExpr::getSizeOf(allocSize->getType()) + }; CallInst::Create(llvm_visc_ocl_argument_scalar, - ArrayRef<Value *>(setInputArgs, 4), "", RI); + ArrayRef<Value*>(setInputArgs, 4), "", RI); } } } + DEBUG(errs() << "Setup output edges of node and insert visc api\n"); // Set output if struct is not an empty struct - StructType *OutputTy = K->KernelLeafNode->getOutputType(); - std::vector<Value *> d_Outputs; - if (!OutputTy->isEmptyTy()) { + StructType* OutputTy = K->KernelLeafNode->getOutputType(); + std::vector<Value*> d_Outputs; + if(!OutputTy->isEmptyTy()) { switchToTimer(visc_TimerID_COPY_PTR, RI); // Not an empty struct // Iterate over all elements of the struct and put them in - for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { - unsigned outputIndex = KF->getFunctionType()->getNumParams() + i; - Value *setOutputArgs[] = { - GraphID, - ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex), - ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; - - CallInst *d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, - ArrayRef<Value *>(setOutputArgs, 3), - "d_output." + KF->getName(), RI); + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + unsigned outputIndex = KF->getFunctionType()->getNumParams()+i; + Value* setOutputArgs[] = {GraphID, + ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), + ConstantExpr::getSizeOf(OutputTy->getElementType(i)) + }; + + CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr, + ArrayRef<Value*>(setOutputArgs, 3), + "d_output."+KF->getName(), + RI); d_Outputs.push_back(d_Output); } } @@ -681,37 +690,46 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, Value *workDim, *LocalWGPtr, *GlobalWGPtr; getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI); switchToTimer(visc_TimerID_KERNEL, RI); - Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr}; - CallInst *Event = CallInst::Create(llvm_visc_ocl_executeNode, - ArrayRef<Value *>(ExecNodeArgs, 4), - "event." + KF->getName(), RI); + Value* ExecNodeArgs[] = {GraphID, + workDim, + LocalWGPtr, + GlobalWGPtr + }; + CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode, + ArrayRef<Value*>(ExecNodeArgs, 4), + "event."+KF->getName(), + RI); DEBUG(errs() << "Execute Node Call: " << *Event << "\n"); // Wait for Kernel to Finish - CallInst::Create(llvm_visc_ocl_wait, ArrayRef<Value *>(GraphID), "", RI); + CallInst::Create(llvm_visc_ocl_wait, + ArrayRef<Value*>(GraphID), + "", + RI); switchToTimer(visc_TimerID_READ_OUTPUT, RI); // Read Output Struct if not empty - if (!OutputTy->isEmptyTy()) { - std::vector<Value *> h_Outputs; - Value *KernelOutput = UndefValue::get(OutputTy); - for (unsigned i = 0; i < OutputTy->getNumElements(); i++) { - Value *GetOutputArgs[] = { - GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), - d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))}; - CallInst *h_Output = CallInst::Create( - llvm_visc_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4), - "h_output." + KF->getName() + ".addr", RI); + if(!OutputTy->isEmptyTy()) { + std::vector<Value*>h_Outputs; + Value* KernelOutput = UndefValue::get(OutputTy); + for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + Value* GetOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), + d_Outputs[i], + ConstantExpr::getSizeOf(OutputTy->getElementType(i)) + }; + CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "h_output."+KF->getName()+".addr", + RI); // Read each device pointer listed in output struct // Load the output struct - CastInst *BI = BitCastInst::CreatePointerCast( - h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr", - RI); - - Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI); - KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, - ArrayRef<unsigned>(i), - KF->getName() + "output", RI); + CastInst* BI = BitCastInst::CreatePointerCast(h_Output, + OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI); + + Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI); + KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i), + KF->getName()+"output", RI); } OutputMap[K->KernelLeafNode] = KernelOutput; } @@ -726,76 +744,75 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n"); DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n"); - Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, - output.bytes}; CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; + CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput, ArrayRef<Value*>(GetOutputArgs, 4), "", RI); }*/ switchToTimer(visc_TimerID_MEM_FREE, RI); // Clear Context and free device memory - DEBUG(errs() << "Clearing context" - << "\n"); + DEBUG(errs() << "Clearing context" << "\n"); // Free Device Memory - for (auto d_ptr : DevicePointers) { - CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value *>(d_ptr), "", RI); + for(auto d_ptr: DevicePointers) { + CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI); } switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall); // Clear Context - LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall); - CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value *>(LI), "", - CleanupCall); + LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall); + CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall); switchToTimer(visc_TimerID_NONE, CleanupCall); switchToTimer(visc_TimerID_MISC, RI); DEBUG(errs() << "*** Generating epilogue code for the function****\n"); // Generate code for output bindings // Get Exit node - DFNode *C = N->getChildGraph()->getExit(); + DFNode* C = N->getChildGraph()->getExit(); // Get OutputType of this node - StructType *OutTy = N->getOutputType(); + StructType* OutTy = N->getOutputType(); Value *retVal = UndefValue::get(F_X86->getReturnType()); // Find the kernel's output arg map, to use instead of the bindings std::vector<unsigned> outArgMap = kernel->getOutArgMap(); // Find all the input edges to exit node - for (unsigned i = 0; i < OutTy->getNumElements(); i++) { + for (unsigned i=0; i < OutTy->getNumElements(); i++) { DEBUG(errs() << "Output Edge " << i << "\n"); // Find the incoming edge at the requested input port - DFEdge *E = C->getInDFEdgeAt(i); + DFEdge* E = C->getInDFEdgeAt(i); assert(E && "No Binding for output element!"); // Find the Source DFNode associated with the incoming edge - DFNode *SrcDF = E->getSourceDF(); + DFNode* SrcDF = E->getSourceDF(); - DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() - << "\n"); + DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName() << "\n"); // If Source DFNode is a dummyNode, edge is from parent. Get the // argument from argument list of this internal node - Value *inputVal; - if (SrcDF->isEntryNode()) { + Value* inputVal; + if(SrcDF->isEntryNode()) { inputVal = getArgumentAt(F_X86, i); - DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n"); - } else { + DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); + } + else { // edge is from a internal node // Check - code should already be generated for this source dfnode // FIXME: Since the 2-level kernel code gen has aspecific structure, we // can assume the SrcDF is same as Kernel Leaf node. // Use outArgMap to get correct mapping SrcDF = K->KernelLeafNode; - assert(OutputMap.count(SrcDF) && - "Source node call not found. Dependency violation!"); + assert(OutputMap.count(SrcDF) + && "Source node call not found. Dependency violation!"); // Find Output Value associated with the Source DFNode using OutputMap - Value *CI = OutputMap[SrcDF]; + Value* CI = OutputMap[SrcDF]; // Extract element at source position from this call instruction std::vector<unsigned> IndexList; // i is the destination of DFEdge E // Use the mapping instead of the bindings - // IndexList.push_back(E->getSourcePosition()); +// IndexList.push_back(E->getSourcePosition()); IndexList.push_back(outArgMap[i]); - DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n"); - ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI); + DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n"); + ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList, + "",RI); inputVal = EI; } std::vector<unsigned> IdxList; @@ -806,31 +823,29 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K, DEBUG(errs() << "Extracted all\n"); switchToTimer(visc_TimerID_NONE, RI); retVal->setName("output"); - ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal); + ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal); ReplaceInstWithInst(RI, newRI); } + // Right now, only targeting the one level case. In general, device functions // can return values so we don't need to change them -void CGT_NVPTX::codeGen(DFInternalNode *N) { - DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName() - << "\n"); - if (KernelLaunchNode == NULL) - DEBUG(errs() << "No kernel launch node\n"); +void CGT_NVPTX::codeGen(DFInternalNode* N) { + errs () << "Inside internal node: " << N->getFuncPointer()->getName() << "\n"; + if(KernelLaunchNode == NULL) + errs () << "No kernel launch node\n"; else { - DEBUG(errs() << "KernelLaunchNode: " - << KernelLaunchNode->getFuncPointer()->getName() << "\n"); + errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n"; } if (!KernelLaunchNode) { - DEBUG(errs() - << "No code generated (host code for kernel launch complete).\n"); + DEBUG(errs() << "No code generated (host code for kernel launch complete).\n"); return; } if (N == KernelLaunchNode) { DEBUG(errs() << "Found kernel launch node. Generating host code.\n"); - // TODO + //TODO // Now the remaining nodes to be visited should be ignored KernelLaunchNode = NULL; @@ -845,8 +860,7 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { // TODO: Structure assumed: one thread node, one allocation node (at most), // TB node std::map<unsigned, unsigned> inmapFinal; - for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), - ie = inmap2.end(); + for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end(); ib != ie; ++ib) { inmapFinal[ib->first] = inmap1[ib->second]; } @@ -863,9 +877,8 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { // 0 ... outmap2.size()-1 // The limit is the size of outmap2, because this is the number of kernel // output arguments for which the mapping matters - // For now, it reasonable to assume that all the kernel arguments are - // returned, maybe plys some others from other nodes, thus outmap2.size() <= - // outmap1.size() + // For now, it reasonable to assume that all the kernel arguments are returned, + // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size() for (unsigned i = 0; i < outmap2.size(); i++) { outmap1[i] = outmap2[outmap1[i]]; } @@ -873,14 +886,15 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { // Track the source of local dimlimits for the kernel // Dimension limit can either be a constant or an argument of parent - // function. Since Internal node would no longer exist, we need to insert - // the localWGSize with values from the parent of N. - std::vector<Value *> localWGSizeMapped; + // function. Since Internal node would no longer exist, we need to insert the + // localWGSize with values from the parent of N. + std::vector<Value*> localWGSizeMapped; for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { if (isa<Constant>(kernel->localWGSize[i])) { // if constant, use as it is localWGSizeMapped.push_back(kernel->localWGSize[i]); - } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { + } + else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) { // if argument, find the argument location in N. Use InArgMap of N to // find the source location in Parent of N. Retrieve the argument from // parent to insert in the vector. @@ -890,49 +904,46 @@ void CGT_NVPTX::codeGen(DFInternalNode *N) { assert(N->getInArgMap().find(argNum) != N->getInArgMap().end()); unsigned parentArgNum = N->getInArgMap()[argNum]; - Argument *A = - getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); + Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum); localWGSizeMapped.push_back(A); - } else { - assert( - false && - "LocalWGsize using value which is neither argument nor constant!"); + } + else { + assert(false && "LocalWGsize using value which is neither argument nor constant!"); } } // Update localWGSize vector of kernel kernel->setLocalWGSize(localWGSizeMapped); } + } -void CGT_NVPTX::codeGen(DFLeafNode *N) { - DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName() - << "\n"); +void CGT_NVPTX::codeGen(DFLeafNode* N) { + errs () << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n"; // Skip code generation if it is a dummy node - if (N->isDummyNode()) { + if(N->isDummyNode()) { DEBUG(errs() << "Skipping dummy node\n"); return; } // Skip code generation if it is an allocation node - if (N->isAllocationNode()) { + if(N->isAllocationNode()) { DEBUG(errs() << "Skipping allocation node\n"); return; } // Generate code only if it has the right hint - // if(!checkPreferredTarget(N, visc::GPU_TARGET)) { - // errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; - // return; - // } - if (!preferredTargetIncludes(N, visc::GPU_TARGET)) { - DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName() - << "\n"); +// if(!checkPreferredTarget(N, visc::GPU_TARGET)) { +// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; +// return; +// } + if(!preferredTargetIncludes(N, visc::GPU_TARGET)) { + errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n"; return; } // Checking which node is the kernel launch - DFNode *PNode = N->getParent(); + DFNode* PNode = N->getParent(); int pLevel = PNode->getLevel(); int pReplFactor = PNode->getNumOfDim(); @@ -940,40 +951,42 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // (1) Parent is the top level node i.e., Root of DFG // OR // (2) Parent does not have multiple instances - DEBUG(errs() << "pLevel = " << pLevel << "\n"); - DEBUG(errs() << "pReplFactor = " << pReplFactor << "\n"); + errs() << "pLevel = " << pLevel << "\n"; + errs() << "pReplFactor = " << pReplFactor << "\n"; assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node."); // Only these options are supported - enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy; - if (pLevel == 1 || !pReplFactor) { - DEBUG(errs() - << "*************** Kernel Gen: 1-Level Hierarchy **************\n"); + enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy; + if(pLevel == 1 || !pReplFactor) { + errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n"; SelectedHierarchy = ONE_LEVEL; KernelLaunchNode = PNode; - kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(), - N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits()); - } else { + kernel = new Kernel(NULL, + N, + N->getInArgMap(), + N->getSharedInArgMap(), + N->getOutArgMap(), + N->getNumOfDim(), + N->getDimLimits()); + } + else { // Converting a 2-level DFG to opencl kernel - DEBUG(errs() - << "*************** Kernel Gen: 2-Level Hierarchy **************\n"); - assert((pLevel >= 2) && - "Selected node not nested deep enough to be Kernel Node."); + errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n"; + assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node."); SelectedHierarchy = TWO_LEVEL; KernelLaunchNode = PNode->getParent(); - assert((PNode->getNumOfDim() == N->getNumOfDim()) && - "Dimension number must match"); + assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match"); // Contains the instructions generating the kernel configuration parameters - kernel = new Kernel(NULL, // kernel function - N, // kernel leaf node - N->getInArgMap(), // kenel argument mapping + kernel = new Kernel(NULL, // kernel function + N, // kernel leaf node + N->getInArgMap(), // kenel argument mapping N->getSharedInArgMap(), - N->getOutArgMap(), // kernel output mapping from the - // leaf to the interemediate node - PNode->getNumOfDim(), // gridDim - PNode->getDimLimits(), // grid size - N->getNumOfDim(), // blockDim - N->getDimLimits()); // block size + N->getOutArgMap(), // kernel output mapping from the leaf to the interemediate node + PNode->getNumOfDim(), // gridDim + PNode->getDimLimits(),// grid size + N->getNumOfDim(), // blockDim + N->getDimLimits()); // block size + } std::vector<Instruction *> IItoRemove; @@ -985,62 +998,58 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // Look up if we have visited this function before. If we have, then just // get the cloned function pointer from DFNode. Otherwise, create the cloned // function and add it to the DFNode GenFunc. - // Function *F_nvptx = N->getGenFunc(); +// Function *F_nvptx = N->getGenFunc(); Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET); - assert(F_nvptx == NULL && - "Error: Visiting a node for which code already generated"); + assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated"); // Clone the function ValueToValueMapTy VMap; - // F_nvptx->setName(FName+"_nvptx"); + //F_nvptx->setName(FName+"_nvptx"); Twine FName = F->getName(); StringRef fStr = FName.getSingleStringRef(); - Twine newFName = Twine(fStr, "_nvptx"); + Twine newFName = Twine(fStr, "_nvptx"); F_nvptx = CloneFunction(F, VMap); F_nvptx->setName(newFName); + // errs() << "Old Function Name: " << F->getName() << "\n"; // errs() << "New Function Name: " << F_nvptx->getName() << "\n"; F_nvptx->removeFromParent(); + // Insert the cloned function into the kernels module KernelM->getFunctionList().push_back(F_nvptx); - // TODO: Iterate over all the instructions of F_nvptx and identify the - // callees and clone them into this module. + + //TODO: Iterate over all the instructions of F_nvptx and identify the + //callees and clone them into this module. DEBUG(errs() << *F_nvptx->getType()); DEBUG(errs() << *F_nvptx); // Transform the function to void and remove all target dependent attributes // from the function F_nvptx = transformFunctionToVoid(F_nvptx); - - // Add generated function info to DFNode - // N->setGenFunc(F_nvptx, visc::GPU_TARGET); + + //Add generated function info to DFNode +// N->setGenFunc(F_nvptx, visc::GPU_TARGET); N->addGenFunc(F_nvptx, visc::GPU_TARGET, false); - DEBUG( - errs() - << "Removing all attributes from Kernel Function and adding nounwind\n"); - F_nvptx->removeAttributes(AttributeList::FunctionIndex, - F_nvptx->getAttributes().getFnAttributes()); + DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n"); + F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes()); F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind); - // FIXME: For now, assume only one allocation node + //FIXME: For now, assume only one allocation node kernel->AllocationNode = NULL; - for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), - iee = N->indfedge_end(); + for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end(); ieb != iee; ++ieb) { DFNode *SrcDFNode = (*ieb)->getSourceDF(); - DEBUG(errs() << "Found edge from node: " - << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); + DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n"); DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode() - << "\n"); + DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n"); if (!SrcDFNode->isDummyNode()) { assert(SrcDFNode->isAllocationNode()); kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode); @@ -1055,11 +1064,10 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // If no allocation node was found, SharedMemArgs is empty if (kernel->AllocationNode) { ValueToValueMapTy VMap; - Function *F_alloc = - CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); - // F_alloc->removeFromParent(); + Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap); + //F_alloc->removeFromParent(); // Insert the cloned function into the kernels module - // M.getFunctionList().push_back(F_alloc); + //M.getFunctionList().push_back(F_alloc); std::vector<IntrinsicInst *> ViscMallocInstVec; findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec); @@ -1067,8 +1075,7 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) { IntrinsicInst *II = ViscMallocInstVec[i]; assert(II->hasOneUse() && "visc_malloc result is used more than once"); - II->replaceAllUsesWith( - ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); + II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))); II->eraseFromParent(); } kernel->AllocationFunction = F_alloc; @@ -1083,19 +1090,15 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { assert(RetStructTy && "Allocation node does not return a struct type"); unsigned numFields = RetStructTy->getNumElements(); */ - std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap = - kernel->getSharedInArgMap(); - AllocationNodeProperty *APN = - (AllocationNodeProperty *)kernel->AllocationNode->getProperty( - DFNode::Allocation); - for (auto &AllocPair : APN->getAllocationList()) { + std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap(); + AllocationNodeProperty* APN = + (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation); + for (auto& AllocPair: APN->getAllocationList()) { unsigned destPos = AllocPair.first->getDestPosition(); unsigned srcPos = AllocPair.first->getSourcePosition(); SharedMemArgs.push_back(destPos); - sharedInMap[destPos] = - std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); - sharedInMap[destPos + 1] = - std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1); + sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); + sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1); } kernel->setSharedInArgMap(sharedInMap); } @@ -1105,14 +1108,12 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // global address space unsigned argIndex = 0; std::vector<unsigned> GlobalMemArgs; - for (Function::arg_iterator ai = F_nvptx->arg_begin(), - ae = F_nvptx->arg_end(); - ai != ae; ++ai) { - if (ai->getType()->isPointerTy()) { - // If the arguement is already chosen for shared memory arguemnt list, - // skip. Else put it in Global memory arguement list - if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == - 0) { + for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end(); + ai != ae; ++ai) { + if (ai->getType()->isPointerTy()) { + // If the arguement is already chosen for shared memory arguemnt list, skip. + // Else put it in Global memory arguement list + if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) { GlobalMemArgs.push_back(argIndex); } } @@ -1126,21 +1127,20 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // Optimization: Gloabl memory arguments, which are not modified and whose // loads are not dependent on node id of current node, should be moved to // constant memory, subject to size of course - std::vector<unsigned> ConstantMemArgs = - globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); + std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx); F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE); F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE); - // Function to replace call instructions to functions in the kernel +// Function to replace call instructions to functions in the kernel std::map<Function *, Function *> OrgToClonedFuncMap; std::vector<Function *> FuncToBeRemoved; - auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) { - Function *NewFunc; + auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) { + Function* NewFunc; // Check if the called function has already been cloned before. auto It = OrgToClonedFuncMap.find(OrgFunc); - if (It == OrgToClonedFuncMap.end()) { + if(It == OrgToClonedFuncMap.end()) { ValueToValueMapTy VMap; NewFunc = CloneFunction(OrgFunc, VMap); OrgToClonedFuncMap[OrgFunc] = NewFunc; @@ -1149,47 +1149,42 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { NewFunc = (*It).second; } // Replace the calls to this function - std::vector<Value *> args; - for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { + std::vector<Value*> args; + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { args.push_back(CI->getArgOperand(i)); } - CallInst *Inst = CallInst::Create( - NewFunc, args, - OrgFunc->getReturnType()->isVoidTy() ? "" : CI->getName(), CI); + CallInst* Inst = CallInst::Create(NewFunc, args, + OrgFunc->getReturnType()->isVoidTy()? "" : CI->getName(), CI); CI->replaceAllUsesWith(Inst); IItoRemove.push_back(CI); return NewFunc; }; + // Go through all the instructions - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; - ++i) { + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { Instruction *I = &(*i); // Leaf nodes should not contain VISC graph intrinsics or launch - assert(!BuildDFG::isViscLaunchIntrinsic(I) && - "Launch intrinsic within a dataflow graph!"); - assert(!BuildDFG::isViscGraphIntrinsic(I) && - "VISC graph intrinsic within a leaf dataflow node!"); + assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!"); + assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!"); if (BuildDFG::isViscIntrinsic(I)) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - IntrinsicInst *ArgII; - DFNode *ArgDFNode; + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + IntrinsicInst* ArgII; + DFNode* ArgDFNode; - /************************ Handle VISC Query intrinsics - * ************************/ + /************************ Handle VISC Query intrinsics ************************/ switch (II->getIntrinsicID()) { - /**************************** llvm.visc.getNode() - * *****************************/ + /**************************** llvm.visc.getNode() *****************************/ case Intrinsic::visc_getNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n"); // add mapping <intrinsic, this node> to the node-specific map Leaf_HandleToDFNodeMap[II] = N; IItoRemove.push_back(II); - } break; - /************************* llvm.visc.getParentNode() - * **************************/ + } + break; + /************************* llvm.visc.getParentNode() **************************/ case Intrinsic::visc_getParentNode: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n"); // get the parent node of the arg node @@ -1203,9 +1198,9 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent(); IItoRemove.push_back(II); - } break; - /*************************** llvm.visc.getNumDims() - * ***************************/ + } + break; + /*************************** llvm.visc.getNumDims() ***************************/ case Intrinsic::visc_getNumDims: { DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n"); // get node from map @@ -1214,48 +1209,47 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; int numOfDim = ArgDFNode->getNumOfDim(); DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n"); - IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext()); - ConstantInt *numOfDimConstant = - ConstantInt::getSigned(IntTy, (int64_t)numOfDim); + IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext()); + ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim); // Replace the result of the intrinsic with the computed value II->replaceAllUsesWith(numOfDimConstant); IItoRemove.push_back(II); - } break; - /*********************** llvm.visc.getNodeInstanceID() - * ************************/ + } + break; + /*********************** llvm.visc.getNodeInstanceID() ************************/ case Intrinsic::visc_getNodeInstanceID_x: case Intrinsic::visc_getNodeInstanceID_y: case Intrinsic::visc_getNodeInstanceID_z: { - DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" - << "\t: " << *II << "\n"); + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; assert(ArgDFNode && "Arg node is NULL"); // A leaf node always has a parent - DFNode *ParentDFNode = ArgDFNode->getParent(); + DFNode* ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required dimension // FIXME: The order is important! // These three intrinsics need to be consecutive x,y,z - uint64_t dim = - II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x; + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNodeInstanceID_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt *DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - // ArrayRef<Value *> Args(DimConstant); + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function *OpenCLFunction; + Function * OpenCLFunction; - FunctionType *FT = - FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), false); + FunctionType* FT = + FunctionType::get(Type::getInt64Ty(KernelM->getContext()), + Type::getInt32Ty(KernelM->getContext()), + false); if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) { // We only have one level in the hierarchy or the parent node is not // replicated. This indicates that the parent node is the kernel @@ -1264,46 +1258,43 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // itself DEBUG(errs() << "Substitute with get_global_id()\n"); DEBUG(errs() << *II << "\n"); - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { - // DEBUG(errs() << "Here inside cond 2\n"); + //DEBUG(errs() << "Here inside cond 2\n"); // We are asking for this node's id with respect to its parent // this is a local id call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)) - .getCallee()); - // DEBUG(errs() << "exiting condition 2\n"); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee()); + //DEBUG(errs() << "exiting condition 2\n"); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's id with respect to its // parent: this is a group id call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee()); } else { - DEBUG(errs() << N->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n"); - DEBUG(errs() << *II << "\n"); + errs() << N->getFuncPointer()->getName() << "\n"; + errs() << N->getParent()->getFuncPointer()->getName() << "\n"; + errs() << *II << "\n"; assert(false && "Unable to translate getNodeInstanceID intrinsic"); } - // DEBUG(errs() << "Create call instruction, insert it before the - // instrinsic\n"); DEBUG(errs() << "Function: " << *OpenCLFunction << - // "\n"); DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); - // DEBUG(errs() << "Argument: " << Args[0] << "\n"); - // DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); + //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n"); + //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n"); + //DEBUG(errs() << "Arguments size: " << Args.size() << "\n"); + //DEBUG(errs() << "Argument: " << Args[0] << "\n"); + //DEBUG(errs() << "Arguments: " << *DimConstant << "\n"); // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); - // DEBUG(errs() << "Replace uses\n"); + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + //DEBUG(errs() << "Replace uses\n"); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } break; - /********************** llvm.visc.getNumNodeInstances() - * ***********************/ + } + break; + /********************** llvm.visc.getNumNodeInstances() ***********************/ case Intrinsic::visc_getNumNodeInstances_x: case Intrinsic::visc_getNumNodeInstances_y: case Intrinsic::visc_getNumNodeInstances_z: { @@ -1312,89 +1303,78 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { // then, why do we need to keep that info in the graph? (only for the // kernel configuration during the call) - DEBUG(errs() << F_nvptx->getName() - << "\t: Handling getNumNodeInstances\n"); + DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n"); ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts()); ArgDFNode = Leaf_HandleToDFNodeMap[ArgII]; // A leaf node always has a parent - DFNode *ParentDFNode = ArgDFNode->getParent(); + DFNode* ParentDFNode = ArgDFNode->getParent(); assert(ParentDFNode && "Parent node of a leaf is NULL"); // Get the number associated with the required dimension // FIXME: The order is important! // These three intrinsics need to be consecutive x,y,z - uint64_t dim = - II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x; + uint64_t dim = II->getIntrinsicID() - + Intrinsic::visc_getNumNodeInstances_x; assert((dim < 3) && "Invalid dimension argument"); DEBUG(errs() << "\t dimension = " << dim << "\n"); // Argument of the function to be called - ConstantInt *DimConstant = - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); - // ArrayRef<Value *> Args(DimConstant); + ConstantInt * DimConstant = + ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim); + //ArrayRef<Value *> Args(DimConstant); // The following is to find which function to call - Function *OpenCLFunction; - FunctionType *FT = + Function * OpenCLFunction; + FunctionType* FT = FunctionType::get(Type::getInt64Ty(KernelM->getContext()), - Type::getInt32Ty(KernelM->getContext()), false); + Type::getInt32Ty(KernelM->getContext()), + false); if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) { // We only have one level in the hierarchy or the parent node is not // replicated. This indicates that the parent node is the kernel // launch, so the instances are global_size (gridDim x blockDim) - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N) { // We are asking for this node's instances // this is a local size (block dim) call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee()); } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) { // We are asking for this node's parent's instances // this is a (global_size/local_size) (grid dim) call - OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)) - .getCallee()); + OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee()); } else { assert(false && "Unable to translate getNumNodeInstances intrinsic"); } // Create call instruction, insert it before the intrinsic and // replace the uses of the previous instruction with the new one - CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); + CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); - } break; - case Intrinsic::visc_barrier: { + } + break; + case Intrinsic::visc_barrier: + { DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n"); DEBUG(errs() << "Substitute with barrier()\n"); DEBUG(errs() << *II << "\n"); - FunctionType *FT = FunctionType::get( - Type::getVoidTy(KernelM->getContext()), - std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())), - false); - Function *OpenCLFunction = cast<Function>( - (KernelM->getOrInsertFunction(StringRef("barrier"), FT)) - .getCallee()); - CallInst *CI = - CallInst::Create(OpenCLFunction, - ArrayRef<Value *>(ConstantInt::get( - Type::getInt32Ty(KernelM->getContext()), 1)), - "", II); + FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()), + std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())), + false); + Function* OpenCLFunction = cast<Function> + ((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee()); + CallInst* CI = CallInst::Create(OpenCLFunction, + ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)), + "", II); II->replaceAllUsesWith(CI); IItoRemove.push_back(II); -<<<<<<< HEAD } break; -======= - } break; - case Intrinsic::visc_atomic_cmpxchg: - break; ->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f case Intrinsic::visc_atomic_add: case Intrinsic::visc_atomic_sub: case Intrinsic::visc_atomic_xchg: @@ -1403,7 +1383,6 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { case Intrinsic::visc_atomic_and: case Intrinsic::visc_atomic_or: case Intrinsic::visc_atomic_xor: -<<<<<<< HEAD { DEBUG(errs() << *II << "\n"); // Only have support for i32 atomic intrinsics @@ -1420,735 +1399,697 @@ void CGT_NVPTX::codeGen(DFLeafNode *N) { if (PtrTy != TargetTy) { Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); PtrTy = TargetTy; -======= - // case Intrinsic::visc_atomic_inc: - // case Intrinsic::visc_atomic_dec: - { - DEBUG(errs() << *II << "\n"); - // Only have support for i32 atomic intrinsics - assert(II->getType() == Type::getInt32Ty(II->getContext()) && - "Only support i32 atomic intrinsics for now"); - // Substitute with atomicrmw instruction - assert(II->getNumArgOperands() == 2 && - "Expecting 2 operands for these atomics"); - Value *Ptr = II->getArgOperand(0); - Value *Val = II->getArgOperand(1); - assert(Ptr->getType()->isPointerTy() && - "First argument of supported atomics is expected to be a " - "pointer"); - PointerType *PtrTy = cast<PointerType>(Ptr->getType()); - PointerType *TargetTy = - Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()); - if (PtrTy != TargetTy) { - Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II); - PtrTy = TargetTy; - } - - std::string name; - if (II->getIntrinsicID() == Intrinsic::visc_atomic_add) - name = "atomic_add"; - else if (II->getIntrinsicID() == Intrinsic::visc_atomic_sub) - name = "atomic_sub"; - else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xchg) - name = "atomic_xchg"; - else if (II->getIntrinsicID() == Intrinsic::visc_atomic_min) - name = "atomic_min"; - else if (II->getIntrinsicID() == Intrinsic::visc_atomic_max) - name = "atomic_max"; - else if (II->getIntrinsicID() == Intrinsic::visc_atomic_and) - name = "atomic_and"; - else if (II->getIntrinsicID() == Intrinsic::visc_atomic_or) - name = "atomic_or"; - else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xor) - name = "atomic_xor"; - Type *paramTypes[] = {PtrTy, Val->getType()}; - FunctionType *AtomFuncT = FunctionType::get( - II->getType(), ArrayRef<Type *>(paramTypes, 2), false); - FunctionCallee AtomFunc = - KernelM->getOrInsertFunction(name, AtomFuncT); - - Value *Params[] = {Ptr, Val}; - CallInst *AtomCI = CallInst::Create( - AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II); - DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); - II->replaceAllUsesWith(AtomCI); - IItoRemove.push_back(II); ->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f - } - break; - default: - llvm_unreachable("Unknown VISC Intrinsic!"); - break; - } - - } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { - IRBuilder<> Builder(I); - Value *Source = MemCpyI->getSource(); - Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); - Value *Length = MemCpyI->getOperand(2); - DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); - DEBUG(errs() << "Source: " << *Source << "\n"); - DEBUG(errs() << "Destination: " << *Destination << "\n"); - DEBUG(errs() << "Length: " << *Length << "\n"); - - size_t memcpy_length; - unsigned int memcpy_count; - if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) { - if (CI->getBitWidth() <= 64) { - memcpy_length = CI->getSExtValue(); - DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); - Type *Source_Type = Source->getType()->getPointerElementType(); - DEBUG(errs() << "Source Type : " << *Source_Type << "\n"); - memcpy_count = - memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); - DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); - if (GetElementPtrInst *sourceGEPI = - dyn_cast<GetElementPtrInst>(Source)) { - if (GetElementPtrInst *destGEPI = - dyn_cast<GetElementPtrInst>(Destination)) { - Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); - Value *DestPtrOperand = destGEPI->getPointerOperand(); - for (int i = 0; i < memcpy_count; ++i) { - Constant *increment; - LoadInst *newLoadI; - StoreInst *newStoreI; - // First, need to increment the correct index for both source - // and dest This invluves checking to see how many indeces the - // GEP has Assume for now only 1 or 2 are the viable options. - - std::vector<Value *> GEPlIndex; - if (sourceGEPI->getNumIndices() == 1) { - Value *Index = sourceGEPI->getOperand(1); - increment = ConstantInt::get(Index->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPlIndex.push_back(incAdd); - Value *newGEPIl = Builder.CreateGEP( - SourcePtrOperand, ArrayRef<Value *>(GEPlIndex)); - DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); - newLoadI = Builder.CreateLoad(newGEPIl); - DEBUG(errs() << "Load: " << *newLoadI << "\n"); - } else { - llvm_unreachable("Unhandled case where source GEPI has more " - "than 1 indices!\n"); - } - - std::vector<Value *> GEPsIndex; - if (destGEPI->getNumIndices() == 1) { - - } else if (destGEPI->getNumIndices() == 2) { - Value *Index0 = destGEPI->getOperand(1); - GEPsIndex.push_back(Index0); - Value *Index1 = destGEPI->getOperand(2); - increment = ConstantInt::get(Index1->getType(), i, false); - Value *incAdd = Builder.CreateAdd(Index1, increment); - DEBUG(errs() << "Add: " << *incAdd << "\n"); - GEPsIndex.push_back(incAdd); - Value *newGEPIs = Builder.CreateGEP( - DestPtrOperand, ArrayRef<Value *>(GEPsIndex)); - DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); - newStoreI = Builder.CreateStore(newLoadI, newGEPIs, - MemCpyI->isVolatile()); - DEBUG(errs() << "Store: " << *newStoreI << "\n"); - } else { - llvm_unreachable("Unhandled case where dest GEPI has more " - "than 2 indices!\n"); - } - } - IItoRemove.push_back(sourceGEPI); - IItoRemove.push_back(destGEPI); - Instruction *destBitcastI = - dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); - Instruction *sourceBitcastI = - dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); - IItoRemove.push_back(destBitcastI); - IItoRemove.push_back(sourceBitcastI); - IItoRemove.push_back(MemCpyI); - } - } } - } else { - llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); - } - // llvm_unreachable("HERE!"); - } - - else if (CallInst *CI = dyn_cast<CallInst>(I)) { - DEBUG(errs() << "Found a call: " << *CI << "\n"); - Function *calleeF = - cast<Function>(CI->getCalledValue()->stripPointerCasts()); - if (calleeF->isDeclaration()) { - // Add the declaration to kernel module - if (calleeF->getName() == "sqrtf") { - calleeF->setName(Twine("sqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } else if (calleeF->getName() == "rsqrtf") { - calleeF->setName(Twine("rsqrt")); - DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); - DEBUG(errs() << "CI: " << *CI << "\n"); - } - DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF - << "\n"); - KernelM->getOrInsertFunction(calleeF->getName(), - calleeF->getFunctionType()); - } else { - // Check if the called function has already been cloned before. - Function *NewFunc = CloneAndReplaceCall(CI, calleeF); - // Iterate over the new function to see if it calls any other functions - // in the module. - for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); - i != e; ++i) { - if (auto *Call = dyn_cast<CallInst>(&*i)) { - Function *CalledFunc = - cast<Function>(Call->getCalledValue()->stripPointerCasts()); - CloneAndReplaceCall(Call, CalledFunc); - } - } - } - // TODO: how to handle address space qualifiers in load/store - } - } - // search for pattern where float is being casted to int and loaded/stored and - // change it. - DEBUG(errs() << "finding pattern for replacement!\n"); - for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; - ++i) { - bool cont = false; - bool keepGEPI = false; - bool keepGEPI2 = false; - Instruction *I = &(*i); - GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I); - if (!GEPI) { - // did nod find pattern start, continue - continue; - } - // may have found pattern, check - DEBUG(errs() << "GEPI " << *GEPI << "\n"); - // print whatever we want for debug - Value *PtrOp = GEPI->getPointerOperand(); - Type *SrcTy = GEPI->getSourceElementType(); - unsigned GEPIaddrspace = GEPI->getAddressSpace(); - - if (SrcTy->isArrayTy()) - DEBUG(errs() << *SrcTy << " is an array type! " - << *(SrcTy->getArrayElementType()) << "\n"); - else - DEBUG(errs() << *SrcTy << " is not an array type!\n"); - // check that source element type is float - if (SrcTy->isArrayTy()) { - if (!(SrcTy->getArrayElementType()->isFloatTy())) { - DEBUG(errs() << "GEPI type is array but not float!\n"); - continue; - } - } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) { - DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); - // does not fit this pattern - no float GEP instruction - continue; - } - // check that addressspace is 1 - // if (GEPIaddrspace != 1) { - // // does not fit this pattern - addrspace of pointer argument is - //not global continue; - // } - if (!(GEPI->hasOneUse())) { - // does not fit this pattern - more than one uses - // continue; - // Keep GEPI around if it has other uses - keepGEPI = true; - } - DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); - - // 1st GEPI it has one use - // assert(GEPI->hasOneUse() && "GEPI has a single use"); - - // See if it is a bitcast - BitCastInst *BitCastI; - for (User *U : GEPI->users()) { - if (Instruction *ui = dyn_cast<Instruction>(U)) { - DEBUG(errs() << "--" << *ui << "\n"); - if (isa<BitCastInst>(ui)) { - BitCastI = dyn_cast<BitCastInst>(ui); - DEBUG(errs() << "---Found bitcast as only use of GEP\n"); - break; - } - } - DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); - cont = true; - } - // for (Value::user_iterator ui = GEPI->user_begin(), - // ue = GEPI->user_end(); ui!=ue; ++ui) { - // DEBUG(errs() << "--" << *ui << "\n"); - // if (isa<BitCastInst>(*ui)) { - // BitCastI = dyn_cast<BitCastInst>(*ui); - // DEBUG(errs() << "Found bitcast as only use of GEP\n"); - // } - // } - - if (cont /*!BitCastI*/) { - continue; // not in pattern - } - - // DEBUG(errs() << *BitCastI << "\n"); - // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand - // has to be the GEP, since this is a use of the GEP. - Value *Op2 = BitCastI->getOperand(0); - DEBUG(errs() << "----" << *Op2 << "\n"); - // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); - // Type *OpTy = cast<Type>(Op2); - Type *OpTy = BitCastI->getDestTy(); - DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); - // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << - // "\n"); - if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { - // maybe right syntax is (Type::getInt32Ty)->getPointerTo() - continue; // not in pattern - } - - DEBUG(errs() << "----Here!\n"); - // We are in GEP, bitcast. - - // user_iterator, to find the load. - - if (!(BitCastI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - } - DEBUG(errs() << "----Bitcast has one use!\n"); - // it has one use - assert(BitCastI->hasOneUse() && "BitCastI has a single use"); - LoadInst *LoadI; - for (User *U : BitCastI->users()) { - if (Instruction *ui = dyn_cast<Instruction>(U)) { - DEBUG(errs() << "-----" << *ui << "\n"); - if (isa<LoadInst>(ui)) { - LoadI = dyn_cast<LoadInst>(ui); - DEBUG(errs() << "-----Found load as only use of bitcast\n"); - break; - } - } - DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); - cont = true; - } - // for (Value::user_iterator ui = BitCastI->user_begin(), - // ue = BitCastI->user_end(); ui!=ue; ++ui) { - // if (isa<LoadInst>(*ui)) { - // LoadI = dyn_cast<LoadInst>(*ui); - // errs() << "Found load as only use of bitcast\n"; - // } - // } - - if (cont) { - continue; // not in pattern - } - - DEBUG("HERE!\n"); - // check that we load from pointer we got from bitcast - assert - the unique - // argument must be the use we found it from - assert(LoadI->getPointerOperand() == BitCastI && - "Unexpected Load Instruction Operand\n"); - - // Copy user_iterator, to find the store. - - if (!(LoadI->hasOneUse())) { - // does not fit this pattern - more than one uses - continue; - // TODO: generalize: one load can have more than one store users - } - - // it has one use - assert(LoadI->hasOneUse() && "LoadI has a single use"); - Value::user_iterator ui = LoadI->user_begin(); - // skipped loop, because is has a single use - StoreInst *StoreI = dyn_cast<StoreInst>(*ui); - if (!StoreI) { - continue; // not in pattern - } - - // Also check that the store uses the loaded value as the value operand - if (StoreI->getValueOperand() != LoadI) { - continue; - } - - DEBUG(errs() << "-------Found store instruction\n"); - - // Look for its bitcast, which is its pointer operand - Value *StPtrOp = StoreI->getPointerOperand(); - DEBUG(errs() << "-------" << *StPtrOp << "\n"); - BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); - DEBUG(errs() << "-------" << *BitCastI2 << "\n"); - if (!BitCastI2) { - continue; // not in pattern - } - - DEBUG(errs() << "-------- Found Bit Cast of store!\n"); - // found bitcast. Look for the second GEP, its from operand. - Value *BCFromOp = BitCastI2->getOperand(0); - GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); - DEBUG(errs() << "---------- " << *GEPI2 << "\n"); - if (!GEPI2) { - continue; // not in pattern - } - - if (!(GEPI2->hasOneUse())) { - // does not fit this pattern - more than one uses - // continue; - // Keep GEPI around if it has other uses - keepGEPI2 = true; - } - DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); - - Value *PtrOp2 = GEPI2->getPointerOperand(); - - // Found GEPI2. TODO: kind of confused as o what checks I need to add here, - // let's add them together- all the code for int-float type checks is - // already above. - - // Assume we found pattern - if (!keepGEPI) { - IItoRemove.push_back(GEPI); - DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); - } else { - DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); - } - IItoRemove.push_back(BitCastI); - DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); - IItoRemove.push_back(LoadI); - DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); - IItoRemove.push_back(GEPI2); - DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); - IItoRemove.push_back(BitCastI2); - DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); - if (!keepGEPI2) { - IItoRemove.push_back(StoreI); - DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); - } else { - - DEBUG(errs() << "Keeping " << *StoreI - << " since it has multiple uses!\n"); - } - - std::vector<Value *> GEPlIndex; - if (GEPI->hasIndices()) { - for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); - GEPlIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); - - std::vector<Value *> GEPsIndex; - if (GEPI2->hasIndices()) { - for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { - Value *Index = dyn_cast<Value>(&*ii); - DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); - GEPsIndex.push_back(Index); - } - } - // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); - - // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); - GetElementPtrInst *newlGEP = GetElementPtrInst::Create( - GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()), - PtrOp, // operand from 1st GEP - ArrayRef<Value *>(GEPlIndex), Twine(), StoreI); - DEBUG(errs() << "Adding: " << *newlGEP << "\n"); - // insert load before GEPI - LoadInst *newLoadI = - new LoadInst(Type::getFloatTy(M.getContext()), - newlGEP, // new GEP - Twine(), LoadI->isVolatile(), LoadI->getAlignment(), - LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI); - DEBUG(errs() << "Adding: " << *newLoadI << "\n"); - // same for GEP for store, for store operand - GetElementPtrInst *newsGEP = GetElementPtrInst::Create( - GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), - PtrOp2, // operand from 2nd GEP - ArrayRef<Value *>(GEPsIndex), Twine(), StoreI); - DEBUG(errs() << "Adding: " << *newsGEP << "\n"); - // insert store before GEPI - StoreInst *newStoreI = - new StoreInst(newLoadI, - newsGEP, // new GEP - StoreI->isVolatile(), StoreI->getAlignment(), - StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI); - DEBUG(errs() << "Adding: " << *newStoreI << "\n"); - } + std::string name; + if(II->getIntrinsicID() == Intrinsic::visc_atomic_add) + name = "atomic_add"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub) + name = "atomic_sub"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg) + name = "atomic_xchg"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min) + name = "atomic_min"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max) + name = "atomic_max"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and) + name = "atomic_and"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or) + name = "atomic_or"; + else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor) + name = "atomic_xor"; + Type* paramTypes[] = {PtrTy, Val->getType()}; + FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false); + FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT); + + Value* Params[] = {Ptr, Val}; + CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II); + DEBUG(errs() << "Substitute with: " << *AtomCI << "\n"); + II->replaceAllUsesWith(AtomCI); + IItoRemove.push_back(II); + } + break; + default: + llvm_unreachable("Unknown VISC Intrinsic!"); + break; + } + + } + else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) { + IRBuilder<> Builder(I); + Value *Source = MemCpyI->getSource(); + Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts(); + Value *Length = MemCpyI->getOperand(2); + DEBUG(errs() << "Found memcpy instruction: " << *I << "\n"); + DEBUG(errs() << "Source: " << *Source << "\n"); + DEBUG(errs() << "Destination: " << *Destination << "\n"); + DEBUG(errs() << "Length: " << *Length << "\n"); + + size_t memcpy_length; + unsigned int memcpy_count; + if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) { + if (CI->getBitWidth() <= 64) { + memcpy_length = CI->getSExtValue(); + DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n"); + Type *Source_Type = Source->getType()->getPointerElementType(); + DEBUG(errs() << "Source Type : " << *Source_Type << "\n"); + memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8); + DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n"); + if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) { + if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) { + Value *SourcePtrOperand = sourceGEPI->getPointerOperand(); + Value *DestPtrOperand = destGEPI->getPointerOperand(); + for(int i = 0; i < memcpy_count; ++i) { + Constant *increment; + LoadInst *newLoadI; + StoreInst *newStoreI; + // First, need to increment the correct index for both source and dest + // This invluves checking to see how many indeces the GEP has + // Assume for now only 1 or 2 are the viable options. + + std::vector<Value*> GEPlIndex; + if (sourceGEPI->getNumIndices() == 1) { + Value *Index = sourceGEPI->getOperand(1); + increment = ConstantInt::get(Index->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPlIndex.push_back(incAdd); + Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex)); + DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n"); + newLoadI = Builder.CreateLoad(newGEPIl); + DEBUG(errs() << "Load: " << *newLoadI << "\n"); + } else { + llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n"); + } + + + std::vector<Value*> GEPsIndex; + if (destGEPI->getNumIndices() == 1) { + + } else if (destGEPI->getNumIndices() == 2) { + Value *Index0 = destGEPI->getOperand(1); + GEPsIndex.push_back(Index0); + Value *Index1 = destGEPI->getOperand(2); + increment = ConstantInt::get(Index1->getType(), i, false); + Value *incAdd = Builder.CreateAdd(Index1, increment); + DEBUG(errs() << "Add: " << *incAdd << "\n"); + GEPsIndex.push_back(incAdd); + Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex)); + DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n"); + newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile()); + DEBUG(errs() << "Store: " << *newStoreI << "\n"); + } else { + llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n"); + } + } + IItoRemove.push_back(sourceGEPI); + IItoRemove.push_back(destGEPI); + Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0)); + Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1)); + IItoRemove.push_back(destBitcastI); + IItoRemove.push_back(sourceBitcastI); + IItoRemove.push_back(MemCpyI); + } + } + + } + } else { + llvm_unreachable("MEMCPY length is not a constant, not handled!\n"); + } + // llvm_unreachable("HERE!"); + } + + else if(CallInst* CI = dyn_cast<CallInst>(I)) { + DEBUG(errs() << "Found a call: " << *CI << "\n"); + Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts()); + if(calleeF->isDeclaration()) { + // Add the declaration to kernel module + if (calleeF->getName() == "sqrtf") { + calleeF->setName(Twine("sqrt")); + DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } else if (calleeF->getName() == "rsqrtf") { + calleeF->setName(Twine("rsqrt")); + DEBUG(errs() << "CaleeF: " << *calleeF << "\n"); + DEBUG(errs() << "CI: " << *CI << "\n"); + } + DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n"); + KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType()); + } + else { + // Check if the called function has already been cloned before. + Function *NewFunc = CloneAndReplaceCall(CI, calleeF); + // Iterate over the new function to see if it calls any other functions + // in the module. + for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) { + if(auto *Call = dyn_cast<CallInst>(&*i)) { + Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts()); + CloneAndReplaceCall(Call, CalledFunc); + } + } + } + //TODO: how to handle address space qualifiers in load/store + } + + } + // search for pattern where float is being casted to int and loaded/stored and change it. + DEBUG(errs() << "finding pattern for replacement!\n"); + for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) { + bool cont = false; + bool keepGEPI = false; + bool keepGEPI2= false; + Instruction *I = &(*i); + GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I); + + if (!GEPI) { + // did nod find pattern start, continue + continue; + } + // may have found pattern, check + DEBUG(errs() << "GEPI " << *GEPI << "\n"); + // print whatever we want for debug + Value* PtrOp = GEPI->getPointerOperand(); + Type *SrcTy = GEPI->getSourceElementType(); + unsigned GEPIaddrspace = GEPI->getAddressSpace(); + + if (SrcTy->isArrayTy()) + DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n"); + else + DEBUG(errs() << *SrcTy << " is not an array type!\n"); + // check that source element type is float + if (SrcTy->isArrayTy()) { + if (!(SrcTy->getArrayElementType()->isFloatTy())) { + DEBUG(errs() << "GEPI type is array but not float!\n"); + continue; + } + } + else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) { + DEBUG(errs() << "GEPI type is " << *SrcTy << "\n"); + // does not fit this pattern - no float GEP instruction + continue; + } + // check that addressspace is 1 + // if (GEPIaddrspace != 1) { + // // does not fit this pattern - addrspace of pointer argument is not global + // continue; + // } + if (!(GEPI->hasOneUse())) { + // does not fit this pattern - more than one uses + //continue; + // Keep GEPI around if it has other uses + keepGEPI = true; + } + DEBUG(errs() << "Found GEPI " << *GEPI << "\n"); + + // 1st GEPI it has one use + // assert(GEPI->hasOneUse() && "GEPI has a single use"); + + // See if it is a bitcast + BitCastInst *BitCastI; + for (User * U : GEPI->users()) { + if(Instruction *ui = dyn_cast<Instruction> (U)) { + DEBUG(errs() << "--" << *ui << "\n"); + if (isa<BitCastInst>(ui)) { + BitCastI = dyn_cast<BitCastInst>(ui); + DEBUG(errs() << "---Found bitcast as only use of GEP\n"); + break; + } + } + DEBUG(errs() << "GEPI does not have a bitcast user, continue\n"); + cont = true; + } + // for (Value::user_iterator ui = GEPI->user_begin(), + // ue = GEPI->user_end(); ui!=ue; ++ui) { + // DEBUG(errs() << "--" << *ui << "\n"); + // if (isa<BitCastInst>(*ui)) { + // BitCastI = dyn_cast<BitCastInst>(*ui); + // DEBUG(errs() << "Found bitcast as only use of GEP\n"); + // } + // } + + if (cont/*!BitCastI*/) { + continue; // not in pattern + } + + // DEBUG(errs() << *BitCastI << "\n"); + // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP. + Value *Op2 = BitCastI->getOperand(0); + DEBUG(errs() << "----" << *Op2 << "\n"); + // assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n"); + // Type *OpTy = cast<Type>(Op2); + Type *OpTy = BitCastI->getDestTy(); + DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n"); + // DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n"); + if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) { + // maybe right syntax is (Type::getInt32Ty)->getPointerTo() + continue; // not in pattern + } + + DEBUG(errs() << "----Here!\n"); + // We are in GEP, bitcast. + + // user_iterator, to find the load. + + if (!(BitCastI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + } + DEBUG(errs() << "----Bitcast has one use!\n"); + // it has one use + assert(BitCastI->hasOneUse() && "BitCastI has a single use"); + LoadInst *LoadI; + for (User * U : BitCastI->users()) { + if (Instruction *ui = dyn_cast<Instruction> (U)) { + DEBUG(errs() << "-----" << *ui << "\n"); + if (isa<LoadInst>(ui)) { + LoadI = dyn_cast<LoadInst>(ui); + DEBUG(errs() << "-----Found load as only use of bitcast\n"); + break; + } + } + DEBUG(errs() << "Bitcast does not have a load user, continue!\n"); + cont = true; + } + // for (Value::user_iterator ui = BitCastI->user_begin(), + // ue = BitCastI->user_end(); ui!=ue; ++ui) { + // if (isa<LoadInst>(*ui)) { + // LoadI = dyn_cast<LoadInst>(*ui); + // errs() << "Found load as only use of bitcast\n"; + // } + // } + + if (cont) { + continue; // not in pattern + } + + DEBUG("HERE!\n"); + // check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from + assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n"); + + // Copy user_iterator, to find the store. + + if (!(LoadI->hasOneUse())) { + // does not fit this pattern - more than one uses + continue; + // TODO: generalize: one load can have more than one store users + } + + // it has one use + assert(LoadI->hasOneUse() && "LoadI has a single use"); + Value::user_iterator ui = LoadI->user_begin(); + // skipped loop, because is has a single use + StoreInst *StoreI = dyn_cast<StoreInst>(*ui); + if (!StoreI) { + continue; // not in pattern + } + + // Also check that the store uses the loaded value as the value operand + if (StoreI->getValueOperand() != LoadI) { + continue; + } + + DEBUG(errs() << "-------Found store instruction\n"); + + // Look for its bitcast, which is its pointer operand + Value *StPtrOp = StoreI->getPointerOperand(); + DEBUG(errs() << "-------" << *StPtrOp << "\n"); + BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp); + DEBUG(errs() << "-------" << *BitCastI2 << "\n"); + if (!BitCastI2) { + continue; //not in pattern + } + + DEBUG(errs() << "-------- Found Bit Cast of store!\n" ); + // found bitcast. Look for the second GEP, its from operand. + Value *BCFromOp = BitCastI2->getOperand(0); + GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp); + DEBUG(errs() << "---------- " << *GEPI2 << "\n"); + if (!GEPI2) { + continue; //not in pattern + } + + if (!(GEPI2->hasOneUse())) { + // does not fit this pattern - more than one uses + //continue; + // Keep GEPI around if it has other uses + keepGEPI2 = true; + } + DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); + + Value *PtrOp2 = GEPI2->getPointerOperand(); + + // Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above. + + // Assume we found pattern + if (!keepGEPI) { + IItoRemove.push_back(GEPI); + DEBUG(errs() << "Pushing " << *GEPI << " for removal\n"); + } else { + DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n"); + } + IItoRemove.push_back(BitCastI); + DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n"); + IItoRemove.push_back(LoadI); + DEBUG(errs() << "Pushing " << *LoadI << " for removal\n"); + IItoRemove.push_back(GEPI2); + DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n"); + IItoRemove.push_back(BitCastI2); + DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n"); + if (!keepGEPI2) { + IItoRemove.push_back(StoreI); + DEBUG(errs() << "Pushing " << *StoreI << " for removal\n"); + } else { + + DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n"); + } + + std::vector<Value*> GEPlIndex; + if (GEPI->hasIndices()) { + for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-1 Index: " << *Index << "\n"); + GEPlIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPlArrayRef(GEPlIndex); + + std::vector<Value*> GEPsIndex; + if (GEPI2->hasIndices()) { + for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) { + Value *Index = dyn_cast<Value>(&*ii); + DEBUG(errs() << "GEP-2 Index: " << *Index << "\n"); + GEPsIndex.push_back(Index); + } + } + // ArrayRef<Value*> GEPsArrayRef(GEPlIndex); + + + + // ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end()); + GetElementPtrInst* newlGEP = + GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()), + PtrOp, // operand from 1st GEP + ArrayRef<Value*>(GEPlIndex), + Twine(), + StoreI); + DEBUG(errs() << "Adding: " << *newlGEP << "\n"); + // insert load before GEPI + LoadInst *newLoadI = + new LoadInst(Type::getFloatTy(M.getContext()), + newlGEP, // new GEP + Twine(), + LoadI->isVolatile(), + LoadI->getAlignment(), + LoadI->getOrdering(), + LoadI->getSyncScopeID(), + StoreI); + DEBUG(errs() << "Adding: " << *newLoadI << "\n"); + // same for GEP for store, for store operand + GetElementPtrInst* newsGEP = + GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()), + PtrOp2, // operand from 2nd GEP + ArrayRef<Value*>(GEPsIndex), + Twine(), + StoreI); + DEBUG(errs() << "Adding: " << *newsGEP << "\n"); + // insert store before GEPI + StoreInst *newStoreI = + new StoreInst(newLoadI, + newsGEP, // new GEP + StoreI->isVolatile(), + StoreI->getAlignment(), + StoreI->getOrdering(), + StoreI->getSyncScopeID(), + StoreI); + DEBUG(errs() << "Adding: " << *newStoreI << "\n"); + + } + + // We need to do this explicitly: DCE pass will not remove them because we + // have assumed theworst memory behaviour for these function calls + // Traverse the vector backwards, otherwise definitions are deleted while + // their subsequent uses are still around + for (auto *I : reverse(IItoRemove)) { + DEBUG(errs() << "Erasing: " << *I << "\n"); + I->eraseFromParent(); + } + + // Removed the cloned functions from the parent module into the new module + for(auto *F : FuncToBeRemoved) { + F->removeFromParent(); //TODO: MARIA check + KernelM->getFunctionList().push_back(F); + } + + addCLMetadata(F_nvptx); + kernel->KernelFunction = F_nvptx; + errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n"; + DEBUG(errs() << *KernelM); + + return; +} - // We need to do this explicitly: DCE pass will not remove them because we - // have assumed theworst memory behaviour for these function calls - // Traverse the vector backwards, otherwise definitions are deleted while - // their subsequent uses are still around - for (auto *I : reverse(IItoRemove)) { - DEBUG(errs() << "Erasing: " << *I << "\n"); - I->eraseFromParent(); - } +bool DFG2LLVM_NVPTX::runOnModule(Module &M) { + errs() << "\nDFG2LLVM_NVPTX PASS\n"; - // Removed the cloned functions from the parent module into the new module - for (auto *F : FuncToBeRemoved) { - F->removeFromParent(); // TODO: MARIA check - KernelM->getFunctionList().push_back(F); - } + // Get the BuildDFG Analysis Results: + // - Dataflow graph + // - Maps from i8* hansles to DFNode and DFEdge + BuildDFG &DFG = getAnalysis<BuildDFG>(); - addCLMetadata(F_nvptx); - kernel->KernelFunction = F_nvptx; - DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName() - << "\n"); - DEBUG(errs() << *KernelM); + // DFInternalNode *Root = DFG.getRoot(); + std::vector<DFInternalNode*> Roots = DFG.getRoots(); + // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap(); + // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap(); - return; -} + // Visitor for Code Generation Graph Traversal + CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); -bool DFG2LLVM_NVPTX::runOnModule(Module &M) { - DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n"); - - // Get the BuildDFG Analysis Results: - // - Dataflow graph - // - Maps from i8* hansles to DFNode and DFEdge - BuildDFG &DFG = getAnalysis<BuildDFG>(); - - // DFInternalNode *Root = DFG.getRoot(); - std::vector<DFInternalNode *> Roots = DFG.getRoots(); - // BuildDFG::HandleToDFNode &HandleToDFNodeMap = - // DFG.getHandleToDFNodeMap(); BuildDFG::HandleToDFEdge &HandleToDFEdgeMap - // = DFG.getHandleToDFEdgeMap(); - - // Visitor for Code Generation Graph Traversal - CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG); - - // Iterate over all the DFGs and produce code for each one of them - for (auto rootNode : Roots) { - // Initiate code generation for root DFNode - CGTVisitor->visit(rootNode); - } + // Iterate over all the DFGs and produce code for each one of them + for (auto rootNode: Roots) { + // Initiate code generation for root DFNode + CGTVisitor->visit(rootNode); + } - CGTVisitor->writeKernelsModule(); + CGTVisitor->writeKernelsModule(); - // TODO: Edit module epilogue to remove the VISC intrinsic declarations - delete CGTVisitor; + //TODO: Edit module epilogue to remove the VISC intrinsic declarations + delete CGTVisitor; - return true; + return true; } std::string CGT_NVPTX::getKernelsModuleName(Module &M) { - /*SmallString<128> currentDir; - llvm::sys::fs::current_path(currentDir); - std::string fileName = getFilenameFromModule(M); - Twine output = Twine(currentDir) + "/Output/" + fileName + ""; - return output.str().append(".kernels.ll");*/ - std::string mid = M.getModuleIdentifier(); - return mid.append(".kernels.ll"); + /*SmallString<128> currentDir; + llvm::sys::fs::current_path(currentDir); + std::string fileName = getFilenameFromModule(M); + Twine output = Twine(currentDir) + "/Output/" + fileName + ""; + return output.str().append(".kernels.ll");*/ + std::string mid = M.getModuleIdentifier(); + return mid.append(".kernels.ll"); } -void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) { - assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!"); - PointerType *OldTy = cast<PointerType>(V->getType()); - PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace); - V->mutateType(NewTy); - for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; - ui++) { - // Change all uses producing pointer type in same address space to new - // addressspace. - if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) { - if (PTy->getAddressSpace() == OldTy->getAddressSpace()) { - fixValueAddrspace(*ui, addrspace); - } - } - } +void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) { + assert(isa<PointerType>(V->getType()) + && "Value should be of Pointer Type!"); + PointerType* OldTy = cast<PointerType>(V->getType()); + PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace); + V->mutateType(NewTy); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) { + // Change all uses producing pointer type in same address space to new + // addressspace. + if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) { + if(PTy->getAddressSpace() == OldTy->getAddressSpace()) { + fixValueAddrspace(*ui, addrspace); + } + } + } } -std::vector<unsigned> -CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs, - Function *F) { - std::vector<unsigned> ConstantMemArgs; - for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; - ++ai) { - Argument *arg = &*ai; - std::vector<unsigned>::iterator pos = std::find( - GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo()); - // It has to be a global memory argument to be promotable - if (pos == GlobalMemArgs->end()) - continue; - - // Check if it can/should be promoted - if (canBePromoted(arg, F)) { - DEBUG(errs() << "Promoting << " << arg->getName() - << " to constant memory." - << "\n"); - ConstantMemArgs.push_back(arg->getArgNo()); - GlobalMemArgs->erase(pos); - } - } - return ConstantMemArgs; -} - -Function *CGT_NVPTX::changeArgAddrspace(Function *F, - std::vector<unsigned> &Args, - unsigned addrspace) { - unsigned idx = 0; - std::vector<Type *> ArgTypes; - for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; - ++ai) { - Argument *arg = &*ai; - DEBUG(errs() << *arg << "\n"); - unsigned argno = arg->getArgNo(); - if ((idx < Args.size()) && (argno == Args[idx])) { - fixValueAddrspace(arg, addrspace); - idx++; - } - ArgTypes.push_back(arg->getType()); - } - FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); - // F->mutateType(PTy); - Function *newF = cloneFunction(F, newFT, false); - replaceNodeFunctionInIR(*F->getParent(), F, newF); +std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) { + std::vector<unsigned> ConstantMemArgs; + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Argument* arg = &*ai; + std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(), + GlobalMemArgs->end(), arg->getArgNo()); + // It has to be a global memory argument to be promotable + if(pos == GlobalMemArgs->end()) + continue; + + // Check if it can/should be promoted + if(canBePromoted(arg, F)) { + errs() << "Promoting << " << arg->getName() << " to constant memory."<< "\n"; + ConstantMemArgs.push_back(arg->getArgNo()); + GlobalMemArgs->erase(pos); + } + } + return ConstantMemArgs; +} - DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n"); - return newF; +Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) { + unsigned idx = 0; + std::vector<Type*> ArgTypes; + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Argument *arg = &*ai; + DEBUG(errs() << *arg << "\n"); + unsigned argno = arg->getArgNo(); + if ((idx < Args.size()) && (argno == Args[idx])) { + fixValueAddrspace(arg, addrspace); + idx++; + } + ArgTypes.push_back(arg->getType()); + } + FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false); + + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + + DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n"); + return newF; } /* Add metadata to module KernelM, for OpenCL kernels */ void CGT_NVPTX::addCLMetadata(Function *F) { - IRBuilder<> Builder(&*F->begin()); - - SmallVector<Metadata *, 8> KernelMD; - KernelMD.push_back(ValueAsMetadata::get(F)); - - // TODO: There is additional metadata used by kernel files but we skip them as - // they are not mandatory. In future they might be useful to enable - // optimizations - - MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_kernels = - KernelM->getOrInsertNamedMetadata("opencl.kernels"); - MDN_kernels->addOperand(MDKernelNode); - - KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); - // TODO: Replace 1 with the number of the kernel. - // Add when support for multiple launces is added - KernelMD.push_back(ValueAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1))); - MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); - NamedMDNode *MDN_annotations = - KernelM->getOrInsertNamedMetadata("nvvm.annotations"); - MDN_annotations->addOperand(MDNvvmAnnotationsNode); -} + IRBuilder<> Builder(&*F->begin()); -void CGT_NVPTX::writeKernelsModule() { + SmallVector<Metadata*,8> KernelMD; + KernelMD.push_back(ValueAsMetadata::get(F)); - // In addition to deleting all other functions, we also want to spiff it - // up a little bit. Do this now. - legacy::PassManager Passes; + // TODO: There is additional metadata used by kernel files but we skip them as + // they are not mandatory. In future they might be useful to enable + // optimizations - DEBUG(errs() << "Writing to File --- "); - DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n"); - std::error_code EC; - ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); - if (EC) { - DEBUG(errs() << EC.message() << '\n'); - } - - Passes.add(createPrintModulePass(Out.os())); + MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels"); + MDN_kernels->addOperand(MDKernelNode); - Passes.run(*KernelM); + KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel")); + // TODO: Replace 1 with the number of the kernel. + // Add when support for multiple launces is added + KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1))); + MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD); + NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations"); + MDN_annotations->addOperand(MDNvvmAnnotationsNode); - // Declare success. - Out.keep(); } -Function *CGT_NVPTX::transformFunctionToVoid(Function *F) { - - DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); - // FIXME: Maybe do that using the Node? - StructType *FRetTy = dyn_cast<StructType>(F->getReturnType()); - assert(FRetTy && "Return Type must always be a struct"); - - // Keeps return statements, because we will need to replace them - std::vector<ReturnInst *> RItoRemove; - findReturnInst(F, RItoRemove); - - std::vector<Type *> RetArgTypes; - std::vector<Argument *> RetArgs; - std::vector<Argument *> Args; - // Check for { } return struct, which means that the function returns void - if (FRetTy->isEmptyTy()) { - - DEBUG(errs() << "\tFunction output struct is void\n"); - DEBUG(errs() << "\tNo parameters added\n"); +void CGT_NVPTX::writeKernelsModule() { - // Replacing return statements with others returning void - for (auto *RI : RItoRemove) { - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - } - DEBUG(errs() << "\tChanged return statements to return void\n"); - } else { - // The struct has return values, thus needs to be converted to parameter - - // Iterate over all element types of return struct and add arguments to the - // function - for (unsigned i = 0; i < FRetTy->getNumElements(); i++) { - Argument *RetArg = - new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); - RetArgs.push_back(RetArg); - RetArgTypes.push_back(RetArg->getType()); - DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); - } + // In addition to deleting all other functions, we also want to spiff it + // up a little bit. Do this now. + legacy::PassManager Passes; - DEBUG(errs() << "\tReplacing Return statements\n"); - // Replace return statements with extractValue and store instructions - for (auto *RI : RItoRemove) { - Value *RetVal = RI->getReturnValue(); - for (unsigned i = 0; i < RetArgs.size(); i++) { - ExtractValueInst *EI = ExtractValueInst::Create( - RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI); - new StoreInst(EI, RetArgs[i], RI); - } - // assert(RetVal && "Return value should not be null at this point"); - // StructType* RetType = cast<StructType>(RetVal->getType()); - // assert(RetType && "Return type is not a struct"); + errs() << "Writing to File --- "; + errs() << getKernelsModuleName(M).c_str() << "\n"; + std::error_code EC; + ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None); + if (EC) { + errs() << EC.message() << '\n'; + } - ReturnInst::Create((F->getContext()), 0, RI); - RI->eraseFromParent(); - } - } - DEBUG(errs() << "\tReplaced return statements\n"); + Passes.add( + createPrintModulePass(Out.os())); - // Create the argument type list with the added argument's type - std::vector<Type *> ArgTypes; - for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { - ArgTypes.push_back(ai->getType()); - } - for (auto *RATy : RetArgTypes) { - ArgTypes.push_back(RATy); - } + Passes.run(*KernelM); - // Creating Args vector to use in cloning! - for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae; - ++ai) { - Args.push_back(&*ai); - } - for (auto *ai : RetArgs) { - Args.push_back(ai); - } + // Declare success. + Out.keep(); +} - // Adding new arguments to the function argument list, would not change the - // function type. We need to change the type of this function to reflect the - // added arguments - Type *VoidRetType = Type::getVoidTy(F->getContext()); - FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); - - // Change the function type - // F->mutateType(PTy); - Function *newF = cloneFunction(F, newFT, false, NULL, &Args); - replaceNodeFunctionInIR(*F->getParent(), F, newF); - // F->eraseFromParent(); - return newF; +Function* CGT_NVPTX::transformFunctionToVoid(Function* F) { + + DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n"); + // FIXME: Maybe do that using the Node? + StructType* FRetTy = dyn_cast<StructType>(F->getReturnType()); + assert(FRetTy && "Return Type must always be a struct"); + + // Keeps return statements, because we will need to replace them + std::vector<ReturnInst *> RItoRemove; + findReturnInst(F, RItoRemove); + + std::vector<Type *> RetArgTypes; + std::vector<Argument*> RetArgs; + std::vector<Argument*> Args; + // Check for { } return struct, which means that the function returns void + if (FRetTy->isEmptyTy()) { + + DEBUG(errs() << "\tFunction output struct is void\n"); + DEBUG(errs() << "\tNo parameters added\n"); + + // Replacing return statements with others returning void + for (auto *RI : RItoRemove) { + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + } + DEBUG(errs() << "\tChanged return statements to return void\n"); + } + else { + // The struct has return values, thus needs to be converted to parameter + + // Iterate over all element types of return struct and add arguments to the + // function + for (unsigned i=0; i<FRetTy->getNumElements(); i++) { + Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F); + RetArgs.push_back(RetArg); + RetArgTypes.push_back(RetArg->getType()); + DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n"); + } + + DEBUG(errs() << "\tReplacing Return statements\n"); + // Replace return statements with extractValue and store instructions + for (auto *RI : RItoRemove) { + Value* RetVal = RI->getReturnValue(); + for(unsigned i = 0; i < RetArgs.size(); i++) { + ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i), + RetArgs[i]->getName()+".val", RI); + new StoreInst(EI, RetArgs[i], RI); + } + // assert(RetVal && "Return value should not be null at this point"); + // StructType* RetType = cast<StructType>(RetVal->getType()); + // assert(RetType && "Return type is not a struct"); + + ReturnInst::Create((F->getContext()), 0, RI); + RI->eraseFromParent(); + + } + } + DEBUG(errs() << "\tReplaced return statements\n"); + + // Create the argument type list with the added argument's type + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + ArgTypes.push_back(ai->getType()); + } + for(auto *RATy: RetArgTypes) { + ArgTypes.push_back(RATy); + } + + // Creating Args vector to use in cloning! + for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { + Args.push_back(&*ai); + } + for(auto *ai : RetArgs) { + Args.push_back(ai); + } + + // Adding new arguments to the function argument list, would not change the + // function type. We need to change the type of this function to reflect the + // added arguments + Type* VoidRetType = Type::getVoidTy(F->getContext()); + FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg()); + + // Change the function type + //F->mutateType(PTy); + Function* newF = cloneFunction(F, newFT, false, NULL, &Args); + replaceNodeFunctionInIR(*F->getParent(), F, newF); + //F->eraseFromParent(); + return newF; } /****************************************************************************** @@ -2159,289 +2100,269 @@ Function *CGT_NVPTX::transformFunctionToVoid(Function *F) { // 1. No stores // 2. Loads not dependent on getNodeInstanceID itrinsic -static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList, - std::vector<Value *> *VisitedList) { - if (std::find(VisitedList->begin(), VisitedList->end(), V) != - VisitedList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - VisitedList->push_back(V); - for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; - ++ui) { - Instruction *I = dyn_cast<Instruction>(*ui); - if (!I) { - // if use is not an instruction, then skip it - continue; - } - DEBUG(errs() << "\t" << *I << "\n"); - if (isa<LoadInst>(I)) { - DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); - DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); - UseList->push_back(V); - } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { - // found a store in use chain - DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); - return true; - } else if (BuildDFG::isViscIntrinsic(I)) { - // If it is an atomic intrinsic, we found a store - IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - assert(II && - II->getCalledValue()->getName().startswith("llvm.visc.atomic") && - "Only visc atomic intrinsics can have an argument as input"); - return true; - } else { - DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); - if (findLoadStoreUses(I, UseList, VisitedList)) - return true; - } - } - return false; +static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) { + if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + VisitedList->push_back(V); + for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); + ui != ue; ++ui) { + Instruction* I = dyn_cast<Instruction>(*ui); + if(!I) { + // if use is not an instruction, then skip it + continue; + } + DEBUG(errs() << "\t" << *I << "\n"); + if(isa<LoadInst>(I)) { + DEBUG(errs() << "\tFound load instruction: " << *I << "\n"); + DEBUG(errs() << "\tAdd to use list: " << *V << "\n"); + UseList->push_back(V); + } + else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) { + // found a store in use chain + DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n"); + return true; + } + else if(BuildDFG::isViscIntrinsic(I)) { + // If it is an atomic intrinsic, we found a store + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic") + && "Only visc atomic intrinsics can have an argument as input"); + return true; + } + else { + DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n"); + if(findLoadStoreUses(I, UseList, VisitedList)) + return true; + } + } + return false; } -static bool isDependentOnNodeInstanceID(Value *V, - std::vector<Value *> *DependenceList) { - if (std::find(DependenceList->begin(), DependenceList->end(), V) != - DependenceList->end()) { - DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); - return false; - } - DependenceList->push_back(V); - // If not an instruction, then not dependent on node instance id - if (!isa<Instruction>(V) || isa<Constant>(V)) { - DEBUG(errs() << "\tStop\n"); - return false; - } - - Instruction *I = cast<Instruction>(V); - for (unsigned i = 0; i < I->getNumOperands(); i++) { - Value *operand = I->getOperand(i); - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) { - if ((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x || - II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y || - II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { - Value *Node = II->getArgOperand(0); - IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node); - assert( - GN && - "NodeInstanceID operande should be node/parent node intrinsic\n"); - if (GN->getIntrinsicID() == Intrinsic::visc_getNode) { - DEBUG(errs() << "\tDependency found on Node instance ID: " << *II - << "\n"); - return true; - } - } - } - if (CmpInst *CI = dyn_cast<CmpInst>(operand)) { - DEBUG(errs() << "Found compare instruction: " << *CI - << "\nNot following its dependency list\n"); - continue; - } - DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n"); - if (isDependentOnNodeInstanceID(operand, DependenceList)) { - return true; - } - } - return false; +static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) { + if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) { + DEBUG(errs() << "\tAlready visited value: " << *V << "\n"); + return false; + } + DependenceList->push_back(V); + // If not an instruction, then not dependent on node instance id + if(!isa<Instruction>(V) || isa<Constant>(V)) { + DEBUG(errs() << "\tStop\n"); + return false; + } + + Instruction* I = cast<Instruction>(V); + for(unsigned i = 0; i < I->getNumOperands(); i++) { + Value* operand = I->getOperand(i); + if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) { + if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y + || II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) { + Value* Node = II->getArgOperand(0); + IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node); + assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n"); + if(GN->getIntrinsicID() == Intrinsic::visc_getNode) { + DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n"); + return true; + } + } + } + if(CmpInst* CI = dyn_cast<CmpInst>(operand)) { + DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n"); + continue; + } + DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n"); + if(isDependentOnNodeInstanceID(operand, DependenceList)) { + return true; + } + } + return false; } // Function to check if argument arg can be changed to a constant memory pointer -static bool canBePromoted(Argument *arg, Function *F) { - DEBUG(errs() << "OPT: Check if Argument " << *arg - << " can be changed to constant memory\n"); - std::vector<Value *> UseList; - std::vector<Value *> VisitedList; - // recursively traverse use chain - // if find a store instruction return false, everything fails, cannot be - // promoted - // if find a load instruction as use, add the GEP instruction to list - bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); - if (foundStore == true) - return false; - // See that the GEP instructions are not dependent on getNodeInstanceID - // intrinsic - DEBUG(errs() << foundStore - << "\tNo Store Instruction found. Check dependence on node " - "instance ID\n"); - std::vector<Value *> DependenceList; - for (auto U : UseList) { - if (isDependentOnNodeInstanceID(U, &DependenceList)) - return false; - } - DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); - return true; +static bool canBePromoted(Argument* arg, Function* F) { + DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n"); + std::vector<Value*> UseList; + std::vector<Value*> VisitedList; + // recursively traverse use chain + // if find a store instruction return false, everything fails, cannot be + // promoted + // if find a load instruction as use, add the GEP instruction to list + bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList); + if(foundStore == true) + return false; + // See that the GEP instructions are not dependent on getNodeInstanceID + // intrinsic + DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n"); + std::vector<Value*>DependenceList; + for(auto U: UseList) { + if(isDependentOnNodeInstanceID(U, &DependenceList)) + return false; + } + DEBUG(errs() << "\tYes, Promotable to Constant Memory\n"); + return true; } + // Calculate execute node parameters which include, number of diemnsions for // dynamic instances of the kernel, local and global work group sizes. -static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr, - Value *&GlobalWGPtr, Kernel *kernel, - ValueToValueMapTy &VMap, Instruction *IB) { - - // Assign number of dimenstions a constant value - workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); - - // If local work group size if null - if (!kernel->hasLocalWG()) { - LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); - } else { - for (unsigned i = 0; i < kernel->localWGSize.size(); i++) { - if (isa<Argument>(kernel->localWGSize[i])) - kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; - } - LocalWGPtr = - genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); - } - - for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) { - if (isa<Argument>(kernel->globalWGSize[i])) - kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; - } - - // For OpenCL, global work group size is the total bumber of instances in each - // dimension. So, multiply local and global dim limits. - std::vector<Value *> globalWGSizeInsts; - if (kernel->hasLocalWG()) { - for (unsigned i = 0; i < kernel->gridDim; i++) { - BinaryOperator *MulInst = - BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], - kernel->localWGSize[i], "", IB); - globalWGSizeInsts.push_back(MulInst); - } - } else { - globalWGSizeInsts = kernel->globalWGSize; - } - GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); - DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); +static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value* + &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) { + + // Assign number of dimenstions a constant value + workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim); + + // If local work group size if null + if(!kernel->hasLocalWG()) { + LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext())); + } + else { + for(unsigned i = 0; i < kernel->localWGSize.size(); i++) { + if(isa<Argument>(kernel->localWGSize[i])) + kernel->localWGSize[i] = VMap[kernel->localWGSize[i]]; + } + LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize"); + } + + for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) { + if(isa<Argument>(kernel->globalWGSize[i])) + kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]]; + } + + // For OpenCL, global work group size is the total bumber of instances in each + // dimension. So, multiply local and global dim limits. + std::vector<Value*> globalWGSizeInsts; + if(kernel->hasLocalWG()) { + for (unsigned i = 0; i < kernel->gridDim; i++) { + BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB); + globalWGSizeInsts.push_back(MulInst); + } + } + else { + globalWGSizeInsts = kernel->globalWGSize; + } + GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize"); + DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n"); } // CodeGen for allocating space for Work Group on stack and returning a pointer // to its address -static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize, - ValueToValueMapTy &VMap, Instruction *IB, - const Twine &WGName) { - Value *WGPtr; - // Get int64_t and or ease of use - Type *Int64Ty = Type::getInt64Ty(M.getContext()); - - // Work Group type is [#dim x i64] - Type *WGTy = ArrayType::get(Int64Ty, WGSize.size()); - // Allocate space of Global work group data on stack and get pointer to - // first element. - AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB); - WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), - WG->getName() + ".0", IB); - Value *nextDim = WGPtr; - DEBUG(errs() << *WGPtr << "\n"); - - // Iterate over the number of dimensions and store the global work group - // size in that dimension - for (unsigned i = 0; i < WGSize.size(); i++) { - DEBUG(errs() << *WGSize[i] << "\n"); - assert(WGSize[i]->getType()->isIntegerTy() && - "Dimension not an integer type!"); - - if (WGSize[i]->getType() != Int64Ty) { - // If number of dimensions are mentioned in any other integer format, - // generate code to extend it to i64. We need to use the mapped value in - // the new generated function, hence the use of VMap - // FIXME: Why are we changing the kernel WGSize vector here? - DEBUG(errs() << "Not i64. Zero extend required.\n"); - DEBUG(errs() << *WGSize[i] << "\n"); - CastInst *CI = - BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); - DEBUG(errs() << "Bitcast done.\n"); - StoreInst *SI = new StoreInst(CI, nextDim, IB); - DEBUG(errs() << "Zero extend done.\n"); - DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); - } else { - // Store the value representing work group size in ith dimension on - // stack - StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB); +static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) { + Value* WGPtr; + // Get int64_t and or ease of use + Type* Int64Ty = Type::getInt64Ty(M.getContext()); + + // Work Group type is [#dim x i64] + Type* WGTy = ArrayType::get(Int64Ty, WGSize.size()); + // Allocate space of Global work group data on stack and get pointer to + // first element. + AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB); + WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB); + Value* nextDim = WGPtr; + DEBUG(errs() << *WGPtr << "\n"); + + // Iterate over the number of dimensions and store the global work group + // size in that dimension + for(unsigned i=0; i < WGSize.size(); i++) { + DEBUG(errs() << *WGSize[i] << "\n"); + assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!"); + + if(WGSize[i]->getType() != Int64Ty) { + // If number of dimensions are mentioned in any other integer format, + // generate code to extend it to i64. We need to use the mapped value in + // the new generated function, hence the use of VMap + // FIXME: Why are we changing the kernel WGSize vector here? + DEBUG(errs() << "Not i64. Zero extend required.\n"); + DEBUG(errs() << *WGSize[i] << "\n"); + CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB); + DEBUG(errs() << "Bitcast done.\n"); + StoreInst* SI = new StoreInst(CI, nextDim, IB); + DEBUG(errs() << "Zero extend done.\n"); + DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n"); + } else { + // Store the value representing work group size in ith dimension on + // stack + StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB); + + DEBUG(errs() << "\t Work group size: " << *SI << "\n"); + } + if(i+1 < WGSize.size()) { + // Move to next dimension + GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim, + ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)), + WG->getName()+"."+Twine(i+1), + IB); + DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); + nextDim = GEP; + } + } + return WGPtr; - DEBUG(errs() << "\t Work group size: " << *SI << "\n"); - } - if (i + 1 < WGSize.size()) { - // Move to next dimension - GetElementPtrInst *GEP = GetElementPtrInst::Create( - nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)), - WG->getName() + "." + Twine(i + 1), IB); - DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n"); - nextDim = GEP; - } - } - return WGPtr; } // Get generated PTX binary name -static std::string getPTXFilename(const Module &M) { - std::string moduleID = M.getModuleIdentifier(); - moduleID.append(".kernels.cl"); - return moduleID; +static std::string getPTXFilename(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + moduleID.append(".kernels.cl"); + return moduleID; } // Get the name of the input file from module ID -static std::string getFilenameFromModule(const Module &M) { - std::string moduleID = M.getModuleIdentifier(); - return moduleID.substr(moduleID.find_last_of("/") + 1); +static std::string getFilenameFromModule(const Module& M) { + std::string moduleID = M.getModuleIdentifier(); + return moduleID.substr(moduleID.find_last_of("/")+1); } // Changes the data layout of the Module to be compiled with NVPTX backend // TODO: Figure out when to call it, probably after duplicating the modules static void changeDataLayout(Module &M) { - std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; - std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"; + std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64"; - if (TARGET_PTX == 32) - M.setDataLayout(StringRef(nvptx32_layoutStr)); - else if (TARGET_PTX == 64) - M.setDataLayout(StringRef(nvptx64_layoutStr)); - else - assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setDataLayout(StringRef(nvptx32_layoutStr)); + else if (TARGET_PTX == 64) + M.setDataLayout(StringRef(nvptx64_layoutStr)); + else assert(false && "Invalid PTX target"); - return; + return; } static void changeTargetTriple(Module &M) { - std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; - std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; + std::string nvptx32_TargetTriple = "nvptx--nvidiacl"; + std::string nvptx64_TargetTriple = "nvptx64--nvidiacl"; - if (TARGET_PTX == 32) - M.setTargetTriple(StringRef(nvptx32_TargetTriple)); - else if (TARGET_PTX == 64) - M.setTargetTriple(StringRef(nvptx64_TargetTriple)); - else - assert(false && "Invalid PTX target"); + if (TARGET_PTX == 32) + M.setTargetTriple(StringRef(nvptx32_TargetTriple)); + else if (TARGET_PTX == 64) + M.setTargetTriple(StringRef(nvptx64_TargetTriple)); + else assert(false && "Invalid PTX target"); - return; + return; } // Helper function, populate a vector with all return statements in a function -static void findReturnInst(Function *F, - std::vector<ReturnInst *> &ReturnInstVec) { - for (auto &BB : *F) { - if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) - ReturnInstVec.push_back(RI); - } +static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) { + for (auto &BB : *F) { + if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) + ReturnInstVec.push_back(RI); + } } -// Helper function, populate a vector with all IntrinsicID intrinsics in a -// function -static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID, - std::vector<IntrinsicInst *> &IntrinsicInstVec) { - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { - Instruction *I = &(*i); - IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); - if (II && II->getIntrinsicID() == IntrinsicID) { - IntrinsicInstVec.push_back(II); - } - } +// Helper function, populate a vector with all IntrinsicID intrinsics in a function +static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) { + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &(*i); + IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (II && II->getIntrinsicID() == IntrinsicID) { + IntrinsicInstVec.push_back(II); + } + } } -// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic -// op +// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) { -<<<<<<< HEAD switch(ID) { case Intrinsic::visc_atomic_add: return AtomicRMWInst::Add; @@ -2462,39 +2383,11 @@ static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) { default: llvm_unreachable("Unsupported atomic intrinsic!"); }; -======= - switch (ID) { - case Intrinsic::visc_atomic_add: - return AtomicRMWInst::Add; - case Intrinsic::visc_atomic_sub: - return AtomicRMWInst::Sub; - case Intrinsic::visc_atomic_min: - return AtomicRMWInst::Min; - case Intrinsic::visc_atomic_umin: - return AtomicRMWInst::UMin; - case Intrinsic::visc_atomic_max: - return AtomicRMWInst::Max; - case Intrinsic::visc_atomic_umax: - return AtomicRMWInst::UMax; - // case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc; - // case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec; - case Intrinsic::visc_atomic_xchg: - return AtomicRMWInst::Xchg; - case Intrinsic::visc_atomic_and: - return AtomicRMWInst::And; - case Intrinsic::visc_atomic_or: - return AtomicRMWInst::Or; - case Intrinsic::visc_atomic_xor: - return AtomicRMWInst::Xor; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; ->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f } + // Helper funtion, returns the OpenCL function name, corresponding to atomic op static std::string getAtomicOpName(Intrinsic::ID ID) { -<<<<<<< HEAD switch(ID) { case Intrinsic::visc_atomic_add: return "atom_add"; @@ -2515,34 +2408,6 @@ static std::string getAtomicOpName(Intrinsic::ID ID) { default: llvm_unreachable("Unsupported atomic intrinsic!"); }; -======= - switch (ID) { - case Intrinsic::visc_atomic_cmpxchg: - return "atom_cmpxchg"; - case Intrinsic::visc_atomic_add: - return "atom_add"; - case Intrinsic::visc_atomic_sub: - return "atom_sub"; - case Intrinsic::visc_atomic_min: - return "atom_min"; - case Intrinsic::visc_atomic_max: - return "atom_max"; - case Intrinsic::visc_atomic_inc: - return "atom_inc"; - case Intrinsic::visc_atomic_dec: - return "atom_dec"; - case Intrinsic::visc_atomic_xchg: - return "atom_xchg"; - case Intrinsic::visc_atomic_and: - return "atom_and"; - case Intrinsic::visc_atomic_or: - return "atom_or"; - case Intrinsic::visc_atomic_xor: - return "atom_xor"; - default: - llvm_unreachable("Unsupported atomic intrinsic!"); - }; ->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f } } // End of namespace @@ -2553,3 +2418,4 @@ static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx", false /* does not modify the CFG */, true /* transformation, * * not just analysis */); + diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp index f6a958f79e..d39003a294 100644 --- a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp +++ b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp @@ -10,118 +10,112 @@ #define DEBUG_TYPE "genvisc" #include "GenVISC/GenVISC.h" -#include "SupportVISC/VISCHint.h" -#include "SupportVISC/VISCUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CallSite.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/IR/DerivedTypes.h" +#include "SupportVISC/VISCHint.h" +#include "SupportVISC/VISCUtils.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "SupportVISC/VISCUtils.h" + -#define TIMER(X) \ - do { \ - if (VISCTimer) { \ - X; \ - } \ - } while (0) +#define TIMER(X) do { if (VISCTimer) { X; } } while (0) using namespace llvm; using namespace viscUtils; + // VISC Command line option to use timer or not -static cl::opt<bool> VISCTimer("visc-timers-gen", - cl::desc("Enable GenVISC timer")); +static cl::opt<bool> +VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer")); namespace genvisc { // Helper Functions -static inline ConstantInt *getTimerID(Module &, enum visc_TimerID); -static Function *transformReturnTypeToStruct(Function *F); -static Type *getReturnTypeFromReturnInst(Function *F); +static inline ConstantInt* getTimerID(Module&, enum visc_TimerID); +static Function* transformReturnTypeToStruct(Function* F); +static Type* getReturnTypeFromReturnInst(Function* F); // Check if the dummy function call is a __visc__node call -#define IS_VISC_CALL(callName) \ - static bool isVISCCall_##callName(Instruction *I) { \ - if (!isa<CallInst>(I)) \ - return false; \ - CallInst *CI = cast<CallInst>(I); \ - return (CI->getCalledValue()->stripPointerCasts()->getName()) \ - .equals("__visc__" #callName); \ +#define IS_VISC_CALL(callName) \ + static bool isVISCCall_##callName(Instruction* I) { \ + if(!isa<CallInst>(I)) \ + return false; \ + CallInst* CI = cast<CallInst>(I); \ + return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \ } -static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID, - std::vector<Instruction *> *Erase) { +static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) { // Check if the instruction is Call Instruction assert(isa<CallInst>(I) && "Expecting CallInst"); - CallInst *CI = cast<CallInst>(I); + CallInst* CI = cast<CallInst>(I); DEBUG(errs() << "Found call: " << *CI << "\n"); // Find the correct intrinsic call - Module *M = CI->getParent()->getParent()->getParent(); - Function *F; - std::vector<Type *> ArgTypes; - std::vector<Value *> args; - if (Intrinsic::isOverloaded(IntrinsicID)) { + Module* M = CI->getParent()->getParent()->getParent(); + Function* F; + std::vector<Type*> ArgTypes; + std::vector<Value*> args; + if(Intrinsic::isOverloaded(IntrinsicID)) { // This is an overloaded intrinsic. The types must exactly match. Get the // argument types - for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { ArgTypes.push_back(CI->getArgOperand(i)->getType()); args.push_back(CI->getArgOperand(i)); } F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes); DEBUG(errs() << *F << "\n"); - } else { // Non-overloaded intrinsic + } + else { // Non-overloaded intrinsic F = Intrinsic::getDeclaration(M, IntrinsicID); - FunctionType *FTy = F->getFunctionType(); + FunctionType* FTy = F->getFunctionType(); DEBUG(errs() << *F << "\n"); // Create argument list - assert(CI->getNumArgOperands() == FTy->getNumParams() && - "Number of arguments of call do not match with Intrinsic"); - for (unsigned i = 0; i < CI->getNumArgOperands(); i++) { - Value *V = CI->getArgOperand(i); + assert(CI->getNumArgOperands() == FTy->getNumParams() + && "Number of arguments of call do not match with Intrinsic"); + for(unsigned i=0; i < CI->getNumArgOperands(); i++) { + Value* V = CI->getArgOperand(i); // Either the type should match or both should be of pointer type assert((V->getType() == FTy->getParamType(i) || - (V->getType()->isPointerTy() && - FTy->getParamType(i)->isPointerTy())) && - "Dummy function call argument does not match with Intrinsic " - "argument!"); + (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy())) + && "Dummy function call argument does not match with Intrinsic argument!"); // If the types do not match, then both must be pointer type and pointer // cast needs to be performed - if (V->getType() != FTy->getParamType(i)) { + if(V->getType() != FTy->getParamType(i)) { V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI); } args.push_back(V); } } // Insert call instruction - CallInst *Inst = CallInst::Create( - F, args, F->getReturnType()->isVoidTy() ? "" : CI->getName(), CI); + CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? "" : CI->getName(), CI); DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n"); CI->replaceAllUsesWith(Inst); // If the previous instruction needs to be erased, insert it in the vector // Erased - if (Erase != NULL) + if(Erase != NULL) Erase->push_back(CI); } IS_VISC_CALL(launch) /* Exists but not required */ -IS_VISC_CALL(edge) /* Exists but not required */ +IS_VISC_CALL(edge) /* Exists but not required */ IS_VISC_CALL(createNodeND) -// IS_VISC_CALL(createNode) -// IS_VISC_CALL(createNode1D) -// IS_VISC_CALL(createNode2D) -// IS_VISC_CALL(createNode3D) +//IS_VISC_CALL(createNode) +//IS_VISC_CALL(createNode1D) +//IS_VISC_CALL(createNode2D) +//IS_VISC_CALL(createNode3D) IS_VISC_CALL(bindIn) IS_VISC_CALL(bindOut) IS_VISC_CALL(push) @@ -130,7 +124,7 @@ IS_VISC_CALL(getNode) IS_VISC_CALL(getParentNode) IS_VISC_CALL(barrier) IS_VISC_CALL(malloc) -IS_VISC_CALL(return ) +IS_VISC_CALL(return) IS_VISC_CALL(getNodeInstanceID_x) IS_VISC_CALL(getNodeInstanceID_y) IS_VISC_CALL(getNodeInstanceID_z) @@ -158,6 +152,7 @@ IS_VISC_CALL(sqrt) IS_VISC_CALL(sin) IS_VISC_CALL(cos) + IS_VISC_CALL(init) IS_VISC_CALL(cleanup) IS_VISC_CALL(wait) @@ -168,137 +163,136 @@ IS_VISC_CALL(attributes) IS_VISC_CALL(hint) // Return the constant integer represented by value V -static unsigned getNumericValue(Value *V) { - assert( - isa<ConstantInt>(V) && - "Value indicating the number of arguments should be a constant integer"); +static unsigned getNumericValue(Value* V) { + assert(isa<ConstantInt>(V) + && "Value indicating the number of arguments should be a constant integer"); return cast<ConstantInt>(V)->getZExtValue(); } // Take the __visc__return instruction and generate code for combining the // values being returned into a struct and returning it. // The first operand is the number of returned values -static Value *genCodeForReturn(CallInst *CI) { - LLVMContext &Ctx = CI->getContext(); - assert(isVISCCall_return(CI) && "__visc__return instruction expected!"); +static Value* genCodeForReturn(CallInst* CI) { + LLVMContext& Ctx = CI->getContext(); + assert(isVISCCall_return(CI) + && "__visc__return instruction expected!"); // Parse the dummy function call here - assert(CI->getNumArgOperands() > 0 && - "Too few arguments for __visc_return call!\n"); + assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n"); unsigned numRetVals = getNumericValue(CI->getArgOperand(0)); - assert(CI->getNumArgOperands() - 1 == numRetVals && + assert(CI->getNumArgOperands()-1 == numRetVals && "Too few arguments for __visc_return call!\n"); DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n"); - std::vector<Type *> ArgTypes; - for (unsigned i = 1; i < CI->getNumArgOperands(); i++) { + std::vector<Type*> ArgTypes; + for(unsigned i=1; i < CI->getNumArgOperands(); i++) { ArgTypes.push_back(CI->getArgOperand(i)->getType()); } Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName(); - StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); + StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true); - InsertValueInst *IV = InsertValueInst::Create( - UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI); + InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy), + CI->getArgOperand(1), + 0, + "returnStruct", + CI); DEBUG(errs() << "Code generation for return:\n"); DEBUG(errs() << *IV << "\n"); - for (unsigned i = 2; i < CI->getNumArgOperands(); i++) { - IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(), + for(unsigned i=2; i < CI->getNumArgOperands(); i++) { + IV = InsertValueInst::Create(IV, + CI->getArgOperand(i), + i-1, + IV->getName(), CI); DEBUG(errs() << *IV << "\n"); } - + return IV; } // Analyse the attribute call for this function. Add the in and out // attributes to pointer parameters. -static void handleVISCAttributes(Function *F, CallInst *CI) { - DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" - << *F << "\n"); +static void handleVISCAttributes(Function* F, CallInst* CI) { + DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n"); // Parse the dummy function call here unsigned offset = 0; // Find number of In pointers - assert(CI->getNumArgOperands() > offset && - "Too few arguments for __visc__attributes call!"); + assert(CI->getNumArgOperands() > offset + && "Too few arguments for __visc__attributes call!"); unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset)); DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n"); - for (unsigned i = offset + 1; i < offset + 1 + numInPtrs; i++) { - Value *V = CI->getArgOperand(i); - if (Argument *arg = dyn_cast<Argument>(V)) { - F->addAttribute(1 + arg->getArgNo(), Attribute::In); - } else { - DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n"); - llvm_unreachable( - "Only pointer arguments can be passed to __visc__attributes call"); + for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) { + Value* V = CI->getArgOperand(i); + if(Argument* arg = dyn_cast<Argument>(V)) { + F->addAttribute(1+arg->getArgNo(), Attribute::In); + } + else { + errs() << "Invalid argument to __visc__attribute: " << *V << "\n"; + llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); } } // Find number of Out Pointers offset += 1 + numInPtrs; - assert(CI->getNumArgOperands() > offset && - "Too few arguments for __visc__attributes call!"); + assert(CI->getNumArgOperands() > offset + && "Too few arguments for __visc__attributes call!"); unsigned numOutPtrs = getNumericValue(CI->getOperand(offset)); DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n"); - for (unsigned i = offset + 1; i < offset + 1 + numOutPtrs; i++) { - Value *V = CI->getArgOperand(i); - if (Argument *arg = dyn_cast<Argument>(V)) { - F->addAttribute(1 + arg->getArgNo(), Attribute::Out); - } else { - DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n"); - llvm_unreachable( - "Only pointer arguments can be passed to __visc__attributes call"); + for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) { + Value* V = CI->getArgOperand(i); + if(Argument* arg = dyn_cast<Argument>(V)) { + F->addAttribute(1+arg->getArgNo(), Attribute::Out); + } + else { + errs() << "Invalid argument to __visc__attribute: " << *V << "\n"; + llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call"); } } - DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" - << *F << "\n"); + DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n"); } // Public Functions of GenVISC pass bool GenVISC::runOnModule(Module &M) { - DEBUG(errs() << "\nGENVISC PASS\n"); + errs() << "\nGENVISC PASS\n"; this->M = &M; // Load Runtime API Module SMDiagnostic Err; - char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); - assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!"); + char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT"); + assert(LLVM_SRC_ROOT != NULL && + "Define LLVM_SRC_ROOT environment variable!"); Twine llvmSrcRoot = LLVM_SRC_ROOT; Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll"; - DEBUG(errs() << llvmSrcRoot << "\n"); + errs() << llvmSrcRoot << "\n"; - std::unique_ptr<Module> runtimeModule = - parseIRFile(runtimeAPI.str(), Err, M.getContext()); + std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext()); - if (runtimeModule == NULL) - DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n"); + if(runtimeModule == NULL) + DEBUG(errs() << Err.getMessage()); else DEBUG(errs() << "Successfully loaded visc-rt API module\n"); - llvm_visc_initializeTimerSet = M.getOrInsertFunction( - "llvm_visc_initializeTimerSet", - runtimeModule->getFunction("llvm_visc_initializeTimerSet") - ->getFunctionType()); - // DEBUG(errs() << *llvm_visc_initializeTimerSet); + llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet", + runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType()); + //DEBUG(errs() << *llvm_visc_initializeTimerSet); - llvm_visc_switchToTimer = M.getOrInsertFunction( - "llvm_visc_switchToTimer", - runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType()); - // DEBUG(errs() << *llvm_visc_switchToTimer); + llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer", + runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType()); + // DEBUG(errs() << *llvm_visc_switchToTimer); - llvm_visc_printTimerSet = M.getOrInsertFunction( - "llvm_visc_printTimerSet", - runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType()); - // DEBUG(errs() << *llvm_visc_printTimerSet); + llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet", + runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType()); + //DEBUG(errs() << *llvm_visc_printTimerSet); // Insert init context in main DEBUG(errs() << "Locate __visc__init()\n"); - Function *VI = M.getFunction("__visc__init"); + Function* VI = M.getFunction("__visc__init"); assert(VI->getNumUses() == 1 && "__visc__init should only be used once"); - Instruction *I = cast<Instruction>(*VI->user_begin()); + Instruction* I = cast<Instruction>(*VI->user_begin()); DEBUG(errs() << "Initialize Timer Set\n"); initializeTimerSet(I); @@ -306,17 +300,18 @@ bool GenVISC::runOnModule(Module &M) { // Insert print instruction at visc exit DEBUG(errs() << "Locate __visc__cleanup()\n"); - Function *VC = M.getFunction("__visc__cleanup"); + Function* VC = M.getFunction("__visc__cleanup"); assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once"); I = cast<Instruction>(*VC->user_begin()); printTimerSet(I); + DEBUG(errs() << "-------- Searching for launch sites ----------\n"); - std::vector<Instruction *> toBeErased; - std::vector<Function *> functions; + std::vector<Instruction*> toBeErased; + std::vector<Function*> functions; - for (auto &F : M) + for (auto &F : M) functions.push_back(&F); // Iterate over all functions in the module @@ -324,7 +319,7 @@ bool GenVISC::runOnModule(Module &M) { DEBUG(errs() << "Function: " << f->getName() << "\n"); // List with the required additions in the function's return type - std::vector<Type *> FRetTypes; + std::vector<Type*> FRetTypes; enum mutateTypeCause { mtc_None, @@ -335,106 +330,98 @@ bool GenVISC::runOnModule(Module &M) { bind = mutateTypeCause::mtc_None; // Iterate over all the instructions in this function - for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) { - Instruction *I = &*i; // Grab pointer to Instruction + for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) { + Instruction* I = &*i; // Grab pointer to Instruction // If not a call instruction, move to next instruction - if (!isa<CallInst>(I)) + if(!isa<CallInst>(I)) continue; - CallInst *CI = cast<CallInst>(I); - LLVMContext &Ctx = CI->getContext(); + CallInst* CI = cast<CallInst>(I); + LLVMContext& Ctx = CI->getContext(); - if (isVISCCall_init(I)) { + if(isVISCCall_init(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased); } - if (isVISCCall_cleanup(I)) { + if(isVISCCall_cleanup(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased); } - if (isVISCCall_wait(I)) { + if(isVISCCall_wait(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased); } - if (isVISCCall_trackMemory(I)) { + if(isVISCCall_trackMemory(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased); } - if (isVISCCall_untrackMemory(I)) { + if(isVISCCall_untrackMemory(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased); } - if (isVISCCall_requestMemory(I)) { + if(isVISCCall_requestMemory(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased); } - if (isVISCCall_hint(I)) { - assert(isa<ConstantInt>(CI->getArgOperand(0)) && - "Argument to hint must be constant integer!"); - ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0)); + if(isVISCCall_hint(I)) { + assert(isa<ConstantInt>(CI->getArgOperand(0)) + && "Argument to hint must be constant integer!"); + ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0)); - visc::Target t = (visc::Target)hint->getZExtValue(); + visc::Target t = (visc::Target) hint->getZExtValue(); addHint(CI->getParent()->getParent(), t); DEBUG(errs() << "Found visc hint call: " << *CI << "\n"); toBeErased.push_back(CI); } - if (isVISCCall_launch(I)) { - Function *LaunchF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_launch); + if(isVISCCall_launch(I)) { + Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch); DEBUG(errs() << *LaunchF << "\n"); // Get i8* cast to function pointer - Function *graphFunc = cast<Function>(CI->getArgOperand(1)); + Function* graphFunc = cast<Function>(CI->getArgOperand(1)); graphFunc = transformReturnTypeToStruct(graphFunc); - Constant *F = - ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - assert( - F && - "Function invoked by VISC launch has to be define and constant."); - - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0)); - assert(Op && "VISC launch's streaming argument is a constant value."); - Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - + Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + assert(F && "Function invoked by VISC launch has to be define and constant."); + + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0)); + assert(Op && "VISC launch's streaming argument is a constant value."); + Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType()); assert(ArgTy && "VISC launch argument should be pointer type."); Value *Arg = CI->getArgOperand(2); - if (!ArgTy->getElementType()->isIntegerTy(8)) - Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), - Type::getInt8PtrTy(Ctx), "", CI); - Value *LaunchArgs[] = {F, Arg, isStreaming}; - CallInst *LaunchInst = CallInst::Create( - LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI); + if(!ArgTy->getElementType()->isIntegerTy(8)) + Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI); + Value* LaunchArgs[] = {F, Arg, isStreaming}; + CallInst* LaunchInst = CallInst::Create(LaunchF, + ArrayRef<Value*>(LaunchArgs, 3), + "graphID", CI); DEBUG(errs() << "Found visc launch call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n"); CI->replaceAllUsesWith(LaunchInst); toBeErased.push_back(CI); } - if (isVISCCall_push(I)) { + if(isVISCCall_push(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased); } - if (isVISCCall_pop(I)) { + if(isVISCCall_pop(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased); } - if (isVISCCall_createNodeND(I)) { + if(isVISCCall_createNodeND(I)) { assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc__createNodeND call"); unsigned numDims = getNumericValue(CI->getArgOperand(0)); // We need as meny dimension argments are there are dimensions - assert(CI->getNumArgOperands() - 2 == numDims && - "Too few arguments for __visc_createNodeND call!\n"); + assert(CI->getNumArgOperands()-2 == numDims && + "Too few arguments for __visc_createNodeND call!\n"); - Function *CreateNodeF; + Function* CreateNodeF; switch (numDims) { case 0: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode); + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode); break; case 1: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D); + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D); break; case 2: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D); + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D); break; case 3: - CreateNodeF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D); + CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D); break; default: llvm_unreachable("Unsupported number of dimensions\n"); @@ -442,57 +429,63 @@ bool GenVISC::runOnModule(Module &M) { } DEBUG(errs() << *CreateNodeF << "\n"); DEBUG(errs() << *I << "\n"); - DEBUG(errs() << "in " << I->getParent()->getParent()->getName() - << "\n"); + DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n"); // Get i8* cast to function pointer - Function *graphFunc = cast<Function>(CI->getArgOperand(1)); + Function* graphFunc = cast<Function>(CI->getArgOperand(1)); graphFunc = transformReturnTypeToStruct(graphFunc); - Constant *F = - ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); + Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx)); - CallInst *CreateNodeInst; + CallInst* CreateNodeInst; switch (numDims) { case 0: - CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F), - graphFunc->getName() + ".node", CI); + CreateNodeInst = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(F), + graphFunc->getName()+".node", CI); break; - case 1: { + case 1: + { assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 2, expected to be i64\n"); - Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)}; - CreateNodeInst = CallInst::Create( - CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2), - graphFunc->getName() + ".node", CI); - } break; - case 2: { + Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)}; + CreateNodeInst = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(CreateNodeArgs, 2), + graphFunc->getName()+".node", CI); + } + break; + case 2: + { assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 2, expected to be i64\n"); assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 3, expected to be i64\n"); - Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), + Value* CreateNodeArgs[] = {F, + CI->getArgOperand(2), CI->getArgOperand(3)}; - CreateNodeInst = CallInst::Create( - CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3), - graphFunc->getName() + ".node", CI); - } break; - case 3: { + CreateNodeInst = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(CreateNodeArgs, 3), + graphFunc->getName()+".node", CI); + } + break; + case 3: + { assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 2, expected to be i64\n"); assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 3, expected to be i64\n"); assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) && "CreateNodeND dimension argument, 4, expected to be i64\n"); - Value *CreateNodeArgs[] = {F, CI->getArgOperand(2), + Value* CreateNodeArgs[] = {F, + CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4)}; - CreateNodeInst = CallInst::Create( - CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4), - graphFunc->getName() + ".node", CI); - } break; + CreateNodeInst = CallInst::Create(CreateNodeF, + ArrayRef<Value*>(CreateNodeArgs, 4), + graphFunc->getName()+".node", CI); + } + break; default: - llvm_unreachable( - "Impossible path: number of dimensions is 0, 1, 2, 3\n"); + llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n"); break; } @@ -502,104 +495,99 @@ bool GenVISC::runOnModule(Module &M) { toBeErased.push_back(CI); } - if (isVISCCall_edge(I)) { - Function *EdgeF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); + if(isVISCCall_edge(I)) { + Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge); DEBUG(errs() << *EdgeF << "\n"); - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(5)); - ConstantInt *EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); - assert(Op && EdgeTypeOp && - "Arguments of CreateEdge are not constant integers."); - Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *isAllToAll = EdgeTypeOp->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - isAllToAll, CI->getArgOperand(3), - CI->getArgOperand(4), isStreaming}; - CallInst *EdgeInst = CallInst::Create( - EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI); + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5)); + ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2)); + assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers."); + Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4), + isStreaming + }; + CallInst* EdgeInst = CallInst::Create(EdgeF, + ArrayRef<Value*>(EdgeArgs, 6), + "output", CI); DEBUG(errs() << "Found visc edge call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n"); CI->replaceAllUsesWith(EdgeInst); toBeErased.push_back(CI); } - if (isVISCCall_bindIn(I)) { - Function *BindInF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); + if(isVISCCall_bindIn(I)) { + Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input); DEBUG(errs() << *BindInF << "\n"); // Check if this is a streaming bind or not - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind in intrinsic should be a " - "constant integer."); - Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming}; - CallInst *BindInInst = - CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI); + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); + assert(Op && "Streaming argument for bind in intrinsic should be a constant integer."); + Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming + }; + CallInst* BindInInst = CallInst::Create(BindInF, + ArrayRef<Value*>(BindInArgs, 4), + "", CI); DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n"); CI->replaceAllUsesWith(BindInInst); toBeErased.push_back(CI); } - if (isVISCCall_bindOut(I)) { - Function *BindOutF = - Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); + if(isVISCCall_bindOut(I)) { + Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output); DEBUG(errs() << *BindOutF << "\n"); // Check if this is a streaming bind or not - ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3)); - assert(Op && "Streaming argument for bind out intrinsic should be a " - "constant integer."); - Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx) - : ConstantInt::getTrue(Ctx); - Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), isStreaming}; - CallInst *BindOutInst = CallInst::Create( - BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI); + ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3)); + assert(Op && "Streaming argument for bind out intrinsic should be a constant integer."); + Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx) + : ConstantInt::getTrue(Ctx); + Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), isStreaming + }; + CallInst* BindOutInst = CallInst::Create(BindOutF, + ArrayRef<Value*>(BindOutArgs, 4), + "", CI); DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n"); DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n"); DEBUG(errs() << "Fixing the return type of the function\n"); // FIXME: What if the child node function has not been visited already. // i.e., it's return type has not been fixed. - Function *F = I->getParent()->getParent(); + Function* F = I->getParent()->getParent(); DEBUG(errs() << F->getName() << "\n";); - IntrinsicInst *NodeIntrinsic = - cast<IntrinsicInst>(CI->getArgOperand(0)); - assert(NodeIntrinsic && - "Instruction value in bind out is not a create node intrinsic."); + IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0)); + assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic."); DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n"); - assert( - (NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D || - NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) && - "Instruction value in bind out is not a create node intrinsic."); - Function *ChildF = cast<Function>( - NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); + assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode || + NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D || + NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D || + NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) && + "Instruction value in bind out is not a create node intrinsic."); + Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts()); DEBUG(errs() << ChildF->getName() << "\n";); int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue(); int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue(); - StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType()); + StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType()); - Type *ReturnType = F->getReturnType(); + Type* ReturnType = F->getReturnType(); DEBUG(errs() << *ReturnType << "\n";); - assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) && - "Return type should either be a struct or void type!"); + assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) + && "Return type should either be a struct or void type!"); - FRetTypes.insert(FRetTypes.begin() + destpos, - ChildReturnTy->getElementType(srcpos)); + FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos)); assert(((bind == mutateTypeCause::mtc_BIND) || (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and visc_return detected"); + "Both bind_out and visc_return detected"); bind = mutateTypeCause::mtc_BIND; CI->replaceAllUsesWith(BindOutInst); toBeErased.push_back(CI); } - if (isVISCCall_attributes(I)) { - Function *F = CI->getParent()->getParent(); + if(isVISCCall_attributes(I)) { + Function* F = CI->getParent()->getParent(); handleVISCAttributes(F, CI); toBeErased.push_back(CI); } @@ -616,80 +604,65 @@ bool GenVISC::runOnModule(Module &M) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased); } if (isVISCCall_return(I)) { - DEBUG(errs() << "Function before visc return processing\n" - << *I->getParent()->getParent() << "\n"); + DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n"); // The operands to this call are the values to be returned by the node - Value *ReturnVal = genCodeForReturn(CI); + Value* ReturnVal = genCodeForReturn(CI); DEBUG(errs() << *ReturnVal << "\n"); - Type *ReturnType = ReturnVal->getType(); - assert(isa<StructType>(ReturnType) && - "Return type should be a struct type!"); + Type* ReturnType = ReturnVal->getType(); + assert(isa<StructType>(ReturnType) + && "Return type should be a struct type!"); assert(((bind == mutateTypeCause::mtc_RETURN) || (bind == mutateTypeCause::mtc_None)) && - "Both bind_out and visc_return detected"); + "Both bind_out and visc_return detected"); if (bind == mutateTypeCause::mtc_None) { // If this is None, this is the first __visc__return // instruction we have come upon. Place the return type of the // function in the return type vector bind = mutateTypeCause::mtc_RETURN; - StructType *ReturnStructTy = cast<StructType>(ReturnType); + StructType* ReturnStructTy = cast<StructType>(ReturnType); for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++) FRetTypes.push_back(ReturnStructTy->getElementType(i)); } else { // bind == mutateTypeCause::mtc_RETURN // This is not the first __visc__return - // instruction we have come upon. + // instruction we have come upon. // Check that the return types are the same - assert((ReturnType == FRetTypes[0]) && - "Multiple returns with mismatching types"); + assert((ReturnType == FRetTypes[0]) + && "Multiple returns with mismatching types"); } - ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal); + ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal); DEBUG(errs() << "Found visc return call: " << *CI << "\n"); - Instruction *oldReturn = CI->getParent()->getTerminator(); - assert(isa<ReturnInst>(oldReturn) && - "Expecting a return to be the terminator of this BB!"); + Instruction* oldReturn = CI->getParent()->getTerminator(); + assert(isa<ReturnInst>(oldReturn) + && "Expecting a return to be the terminator of this BB!"); DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n"); DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n"); - // CI->replaceAllUsesWith(RetInst); + //CI->replaceAllUsesWith(RetInst); toBeErased.push_back(CI); ReplaceInstWithInst(oldReturn, RetInst); - DEBUG(errs() << "Function after visc return processing\n" - << *I->getParent()->getParent() << "\n"); + DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n"); } if (isVISCCall_getNodeInstanceID_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, - &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased); } if (isVISCCall_getNodeInstanceID_y(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, - &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased); } if (isVISCCall_getNodeInstanceID_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, - &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased); } if (isVISCCall_getNumNodeInstances_x(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, - &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased); } if (isVISCCall_getNumNodeInstances_y(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, - &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased); } if (isVISCCall_getNumNodeInstances_z(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, - &toBeErased); - } -<<<<<<< HEAD -======= - if (isVISCCall_atomic_cmpxchg(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg, - &toBeErased); + ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased); } ->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f if (isVISCCall_atomic_add(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased); } @@ -714,19 +687,6 @@ bool GenVISC::runOnModule(Module &M) { if (isVISCCall_atomic_xor(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased); } -<<<<<<< HEAD -======= - if (isVISCCall_floor(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::floor, &toBeErased); - } - if (isVISCCall_rsqrt(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f, - &toBeErased); - } - if (isVISCCall_sqrt(I)) { - ReplaceCallWithIntrinsic(I, Intrinsic::sqrt, &toBeErased); - } ->>>>>>> 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f if (isVISCCall_sin(I)) { ReplaceCallWithIntrinsic(I, Intrinsic::sin, &toBeErased); } @@ -737,155 +697,148 @@ bool GenVISC::runOnModule(Module &M) { // Erase the __visc__node calls DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n"); - for (auto I : toBeErased) { + for(auto I: toBeErased) { DEBUG(errs() << *I << "\n"); } - while (!toBeErased.empty()) { - Instruction *I = toBeErased.back(); + while(!toBeErased.empty()) { + Instruction* I = toBeErased.back(); DEBUG(errs() << "\tErasing " << *I << "\n"); I->eraseFromParent(); - toBeErased.pop_back(); + toBeErased.pop_back(); } - if (bind == mutateTypeCause::mtc_BIND || - bind == mutateTypeCause::mtc_RETURN) { - DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); - // Argument type list. - std::vector<Type *> FArgTypes; - for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); - ai != ae; ++ai) { - FArgTypes.push_back(ai->getType()); - } - - // Find new return type of function - Type *NewReturnTy; - if (bind == mutateTypeCause::mtc_BIND) { - - std::vector<Type *> TyList; - for (unsigned i = 0; i < FRetTypes.size(); i++) - TyList.push_back(FRetTypes[i]); - - NewReturnTy = - StructType::create(f->getContext(), TyList, - Twine("struct.out." + f->getName()).str(), true); - } else { - NewReturnTy = getReturnTypeFromReturnInst(f); - assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); - } - - FunctionType *FTy = - FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); - - // Change the function type - Function *newF = cloneFunction(f, FTy, false); - DEBUG(errs() << *newF << "\n"); - - if (bind == mutateTypeCause::mtc_BIND) { - // This is certainly an internal node, and hence just one BB with one - // return terminator instruction. Change return statement - ReturnInst *RI = - cast<ReturnInst>(newF->getEntryBlock().getTerminator()); - ReturnInst *newRI = ReturnInst::Create(newF->getContext(), - UndefValue::get(NewReturnTy)); - ReplaceInstWithInst(RI, newRI); - } - if (bind == mutateTypeCause::mtc_RETURN) { - // Nothing - } - replaceNodeFunctionInIR(*f->getParent(), f, newF); - DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); + if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) { + DEBUG(errs() << "Function before fixing return type\n" << *f << "\n"); + // Argument type list. + std::vector<Type*> FArgTypes; + for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end(); + ai != ae; ++ai) { + FArgTypes.push_back(ai->getType()); + } + + // Find new return type of function + Type* NewReturnTy; + if(bind == mutateTypeCause::mtc_BIND) { + + std::vector<Type*> TyList; + for (unsigned i = 0; i < FRetTypes.size(); i++) + TyList.push_back(FRetTypes[i]); + + NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true); + } + else { + NewReturnTy = getReturnTypeFromReturnInst(f); + assert(NewReturnTy->isStructTy() && "Expecting a struct type!"); + } + + FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg()); + + // Change the function type + Function* newF = cloneFunction(f, FTy, false); + DEBUG(errs() << *newF << "\n"); + + if (bind == mutateTypeCause::mtc_BIND) { + // This is certainly an internal node, and hence just one BB with one + // return terminator instruction. Change return statement + ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator()); + ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy)); + ReplaceInstWithInst(RI, newRI); + } + if (bind == mutateTypeCause::mtc_RETURN) { + // Nothing + } + replaceNodeFunctionInIR(*f->getParent(), f, newF); + DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n"); } + + } - return false; // TODO: What does returning "false" mean? + return false; //TODO: What does returning "false" mean? } // Generate Code for declaring a constant string [L x i8] and return a pointer // to the start of it. -Value *GenVISC::getStringPointer(const Twine &S, Instruction *IB, - const Twine &Name) { - Constant *SConstant = - ConstantDataArray::getString(M->getContext(), S.str(), true); - Value *SGlobal = - new GlobalVariable(*M, SConstant->getType(), true, - GlobalValue::InternalLinkage, SConstant, Name); - Value *Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); - Value *GEPArgs[] = {Zero, Zero}; - GetElementPtrInst *SPtr = GetElementPtrInst::Create( - nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB); +Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) { + Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true); + Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true, + GlobalValue::InternalLinkage, SConstant, Name); + Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0); + Value* GEPArgs[] = {Zero, Zero}; + GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal, + ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB); return SPtr; } -void GenVISC::initializeTimerSet(Instruction *InsertBefore) { - Value *TimerSetAddr; - StoreInst *SI; - TIMER(TimerSet = new GlobalVariable( - *M, Type::getInt8PtrTy(M->getContext()), false, - GlobalValue::CommonLinkage, - Constant::getNullValue(Type::getInt8PtrTy(M->getContext())), - "viscTimerSet_GenVISC")); - DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet - << "\n"); - // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << - // "\n"); - - TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, None, "", +void GenVISC::initializeTimerSet(Instruction* InsertBefore) { + Value* TimerSetAddr; + StoreInst* SI; + TIMER(TimerSet = new GlobalVariable(*M, + Type::getInt8PtrTy(M->getContext()), + false, + GlobalValue::CommonLinkage, + Constant::getNullValue(Type::getInt8PtrTy(M->getContext())), + "viscTimerSet_GenVISC")); + DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n"); + //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n"); + + TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, + None, + "", InsertBefore)); DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n"); TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore)); DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n"); } -void GenVISC::switchToTimer(enum visc_TimerID timer, - Instruction *InsertBefore) { - Value *switchArgs[] = {TimerSet, getTimerID(*M, timer)}; +void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) { + Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)}; TIMER(CallInst::Create(llvm_visc_switchToTimer, - ArrayRef<Value *>(switchArgs, 2), "", InsertBefore)); + ArrayRef<Value*>(switchArgs, 2), + "", + InsertBefore)); } -void GenVISC::printTimerSet(Instruction *InsertBefore) { - Value *TimerName; +void GenVISC::printTimerSet(Instruction* InsertBefore) { + Value* TimerName; TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore)); - Value *printArgs[] = {TimerSet, TimerName}; + Value* printArgs[] = {TimerSet, TimerName}; TIMER(CallInst::Create(llvm_visc_printTimerSet, - ArrayRef<Value *>(printArgs, 2), "", InsertBefore)); + ArrayRef<Value*>(printArgs, 2), + "", + InsertBefore)); } -static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) { +static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) { return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer); } -static Function *transformReturnTypeToStruct(Function *F) { +static Function* transformReturnTypeToStruct(Function* F) { // Currently only works for void return types - DEBUG(errs() << "Transforming return type of function to Struct: " - << F->getName() << "\n"); + DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n"); if (isa<StructType>(F->getReturnType())) { - DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " - << *F->getReturnType() << "\n"); + DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n"); return F; } - assert(F->getReturnType()->isVoidTy() && - "Unhandled case - Only void return type handled\n"); + assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n"); // Create the argument type list with added argument types - std::vector<Type *> ArgTypes; - for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); - ai != ae; ++ai) { + std::vector<Type*> ArgTypes; + for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai) { ArgTypes.push_back(ai->getType()); } - - StructType *RetTy = - StructType::create(F->getContext(), None, "emptyStruct", true); - FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); - - SmallVector<ReturnInst *, 8> Returns; - Function *newF = cloneFunction(F, FTy, false, &Returns); + + StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true); + FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg()); + + SmallVector<ReturnInst*, 8> Returns; + Function* newF = cloneFunction(F, FTy, false, &Returns); // Replace ret void instruction with ret %RetTy undef - for (auto &RI : Returns) { - DEBUG(errs() << "Found return inst: " << *RI << "\n"); - ReturnInst *newRI = - ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy)); + for(auto &RI: Returns) { + DEBUG(errs() << "Found return inst: "<< *RI << "\n"); + ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy)); ReplaceInstWithInst(RI, newRI); } @@ -893,20 +846,19 @@ static Function *transformReturnTypeToStruct(Function *F) { return newF; } -static Type *getReturnTypeFromReturnInst(Function *F) { - for (BasicBlock &BB : *F) { - if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) { - DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() - << "\n"); +static Type* getReturnTypeFromReturnInst(Function* F) { + for(BasicBlock &BB: *F) { + if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) { + DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n"); return RI->getReturnValue()->getType(); } } } + char genvisc::GenVISC::ID = 0; -static RegisterPass<genvisc::GenVISC> - X("genvisc", - "Pass to generate VISC IR from LLVM IR (with dummy function calls)", - false, false); +static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false); } // End of namespace genvisc + + -- GitLab