//===----------------------- DFG2LLVM_OpenCL.cpp ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass generates device code for HPVM kernels and host code for
// launching those kernels on the GPU target, using the HPVM dataflow graph.
// The kernels are emitted into a separate module, which the C-Backend then
// uses to generate OpenCL kernels.
//
//===----------------------------------------------------------------------===//
#define ENABLE_ASSERTS
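// Address space numbers below follow the NVPTX/OpenCL convention
// (0 = generic, 1 = global, 3 = shared/local, 4 = constant).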
#define TARGET_PTX 64
#define GENERIC_ADDRSPACE 0
#define GLOBAL_ADDRSPACE 1
#define CONSTANT_ADDRSPACE 4
#define SHARED_ADDRSPACE 3
#define DEBUG_TYPE "DFG2LLVM_OpenCL"
#include "SupportHPVM/DFG2LLVM.h"
#include "SupportHPVM/HPVMTimer.h"
#include "SupportHPVM/HPVMUtils.h"
#include "llvm-c/Core.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Pass.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/UseListOrder.h"
#include "llvm/Support/ToolOutputFile.h"
#include <sstream>
#ifndef LLVM_BUILD_DIR
#error LLVM_BUILD_DIR is not defined
#endif
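// Standard two-level stringification: STRINGIFY expands its argument first,
// then STR_VALUE quotes it. E.g., with -DLLVM_BUILD_DIR=/path/to/build,
// LLVM_BUILD_DIR_STR becomes the string literal "/path/to/build".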
#define STR_VALUE(X) #X
#define STRINGIFY(X) STR_VALUE(X)
#define LLVM_BUILD_DIR_STR STRINGIFY(LLVM_BUILD_DIR)
using namespace llvm;
using namespace builddfg;
using namespace dfg2llvm;
using namespace hpvmUtils;
// HPVM command-line option to enable or disable timers
static cl::opt<bool> HPVMTimer_OpenCL("hpvm-timers-ptx",
cl::desc("Enable hpvm timers"));
namespace {
// Helper class declarations
// Class to maintain the tuple of host pointer, device pointer and size
// in bytes. Would have preferred to use std::tuple, but support was not yet
// available
class OutputPtr {
public:
OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes)
: h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
Value *h_ptr;
Value *d_ptr;
Value *bytes;
};
// Class to maintain important kernel info required for generating runtime
// calls
class Kernel {
public:
Kernel(
Function *_KF, DFLeafNode *_KLeafNode,
std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(),
std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap =
std::map<unsigned, std::pair<Value *, unsigned>>(),
std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
unsigned _gridDim = 0,
std::vector<Value *> _globalWGSize = std::vector<Value *>(),
unsigned _blockDim = 0,
std::vector<Value *> _localWGSize = std::vector<Value *>())
: KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap),
gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
localWGSize(_localWGSize) {
assert(gridDim == globalWGSize.size() &&
"gridDim should be same as the size of vector globalWGSize");
assert(blockDim == localWGSize.size() &&
"blockDim should be same as the size of vector localWGSize");
}
Function *KernelFunction;
DFLeafNode *KernelLeafNode;
std::map<unsigned, unsigned> inArgMap;
// Map for shared memory arguments
std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap;
// Fields for (potential) allocation node
DFLeafNode *AllocationNode;
Function *AllocationFunction;
std::map<unsigned, unsigned> allocInArgMap;
std::vector<unsigned> outArgMap;
unsigned gridDim;
std::vector<Value *> globalWGSize;
unsigned blockDim;
std::vector<Value *> localWGSize;
std::vector<int> localDimMap;
std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; }
void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; }
std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() {
return sharedInArgMap;
}
void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) {
sharedInArgMap = map;
}
std::vector<unsigned> &getOutArgMap() { return outArgMap; }
void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; }
void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; }
bool hasLocalWG() const { return blockDim != 0; }
};
// Helper function declarations
static bool canBePromoted(Argument *arg, Function *F);
static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
Kernel *, ValueToValueMapTy &, Instruction *);
static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
ValueToValueMapTy &, Instruction *,
const Twine &WGName = "WGSize");
static std::string getPTXFilename(const Module &);
static void changeDataLayout(Module &);
static void changeTargetTriple(Module &);
static void findReturnInst(Function *, std::vector<ReturnInst *> &);
static void findIntrinsicInst(Function *, Intrinsic::ID,
std::vector<IntrinsicInst *> &);
// DFG2LLVM_OpenCL - The first implementation.
struct DFG2LLVM_OpenCL : public DFG2LLVM {
static char ID; // Pass identification, replacement for typeid
DFG2LLVM_OpenCL() : DFG2LLVM(ID) {}
private:
public:
bool runOnModule(Module &M);
};
// Visitor for Code generation traversal (tree traversal for now)
class CGT_OpenCL : public CodeGenTraversal {
private:
// Member variables
std::unique_ptr<Module> KernelM;
DFNode *KernelLaunchNode = NULL;
Kernel *kernel;
// HPVM Runtime API
FunctionCallee llvm_hpvm_ocl_launch;
FunctionCallee llvm_hpvm_ocl_wait;
FunctionCallee llvm_hpvm_ocl_initContext;
FunctionCallee llvm_hpvm_ocl_clearContext;
FunctionCallee llvm_hpvm_ocl_argument_shared;
FunctionCallee llvm_hpvm_ocl_argument_scalar;
FunctionCallee llvm_hpvm_ocl_argument_ptr;
FunctionCallee llvm_hpvm_ocl_output_ptr;
FunctionCallee llvm_hpvm_ocl_free;
FunctionCallee llvm_hpvm_ocl_getOutput;
FunctionCallee llvm_hpvm_ocl_executeNode;
// Functions
std::string getKernelsModuleName(Module &M);
void fixValueAddrspace(Value *V, unsigned addrspace);
std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *,
Function *);
Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags,
unsigned i);
void addCLMetadata(Function *F);
Function *transformFunctionToVoid(Function *F);
void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName);
// Virtual Functions
void init() {
HPVMTimer = HPVMTimer_OpenCL;
TargetName = "OpenCL";
}
void initRuntimeAPI();
void codeGen(DFInternalNode *N);
void codeGen(DFLeafNode *N);
public:
// Constructor
CGT_OpenCL(Module &_M, BuildDFG &_DFG)
: CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
init();
initRuntimeAPI();
DEBUG(errs() << "Old module pointer: " << &_M << "\n");
DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n");
// Copying instead of creating new, in order to preserve required info
// (metadata). Remove functions, global variables and aliases.
std::vector<GlobalVariable *> GVVect;
for (Module::global_iterator mi = KernelM->global_begin(),
me = KernelM->global_end();
(mi != me); ++mi) {
GlobalVariable *GV = &*mi;
GVVect.push_back(GV);
}
for (auto *GV : GVVect) {
GV->replaceAllUsesWith(UndefValue::get(GV->getType()));
GV->eraseFromParent();
}
std::vector<Function *> FuncVect;
for (Module::iterator mi = KernelM->begin(), me = KernelM->end();
(mi != me); ++mi) {
Function *F = &*mi;
FuncVect.push_back(F);
}
for (auto *F : FuncVect) {
F->replaceAllUsesWith(UndefValue::get(F->getType()));
F->eraseFromParent();
}
std::vector<GlobalAlias *> GAVect;
for (Module::alias_iterator mi = KernelM->alias_begin(),
me = KernelM->alias_end();
(mi != me); ++mi) {
GlobalAlias *GA = &*mi;
GAVect.push_back(GA);
}
for (auto *GA : GAVect) {
GA->replaceAllUsesWith(UndefValue::get(GA->getType()));
GA->eraseFromParent();
}
changeDataLayout(*KernelM);
changeTargetTriple(*KernelM);
DEBUG(errs() << *KernelM);
}
void writeKernelsModule();
};
// Initialize the HPVM runtime API. This makes it easier to insert these calls
void CGT_OpenCL::initRuntimeAPI() {
// Load Runtime API Module
SMDiagnostic Err;
std::string runtimeAPI = std::string(LLVM_BUILD_DIR_STR) +
"/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
runtimeModule = parseIRFile(runtimeAPI, Err, M.getContext());
if (runtimeModule == nullptr) {
DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
assert(false && "couldn't parse runtime");
} else
DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
// Get or insert the global declarations for launch/wait functions
DECLARE(llvm_hpvm_ocl_launch);
DECLARE(llvm_hpvm_ocl_wait);
DECLARE(llvm_hpvm_ocl_initContext);
DECLARE(llvm_hpvm_ocl_clearContext);
DECLARE(llvm_hpvm_ocl_argument_shared);
DECLARE(llvm_hpvm_ocl_argument_scalar);
DECLARE(llvm_hpvm_ocl_argument_ptr);
DECLARE(llvm_hpvm_ocl_output_ptr);
DECLARE(llvm_hpvm_ocl_free);
DECLARE(llvm_hpvm_ocl_getOutput);
DECLARE(llvm_hpvm_ocl_executeNode);
// Get or insert timerAPI functions as well if you plan to use timers
initTimerAPI();
// Insert init context in main
DEBUG(errs() << "Gen Code to initialize OpenCL Timer\n");
Function *VI = M.getFunction("llvm.hpvm.init");
assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
InitCall = cast<Instruction>(*VI->user_begin());
initializeTimerSet(InitCall);
switchToTimer(hpvm_TimerID_INIT_CTX, InitCall);
CallInst::Create(llvm_hpvm_ocl_initContext,
ArrayRef<Value *>(getTargetID(M, hpvm::GPU_TARGET)), "",
InitCall);
switchToTimer(hpvm_TimerID_NONE, InitCall);
// Insert print instruction at hpvm exit
DEBUG(errs() << "Gen Code to print OpenCL Timer\n");
Function *VC = M.getFunction("llvm.hpvm.cleanup");
DEBUG(errs() << *VC << "\n");
assert(VC->getNumUses() == 1 && "__hpvm__clear should only be used once");
CleanupCall = cast<Instruction>(*VC->user_begin());
printTimerSet(CleanupCall);
}
// Generate Code to call the kernel
// The plan is to replace the internal node with a leaf node. This method is
// used to generate a function to associate with this leaf node. The function
// is responsible for all the memory allocation/transfer and invoking the
// kernel call on the device
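// The generated host wrapper roughly follows this sequence (a sketch; names
// are illustrative, the actual calls are inserted below):
//   graph = llvm_hpvm_ocl_launch(<kernel file>, <kernel name>);
//   d_ptr = llvm_hpvm_ocl_argument_ptr(graph, h_ptr, idx, bytes, in, out);
//   ...     llvm_hpvm_ocl_argument_scalar / _shared for the remaining args
//   event = llvm_hpvm_ocl_executeNode(graph, workDim, localWG, globalWG);
//           llvm_hpvm_ocl_wait(graph);
//   h_out = llvm_hpvm_ocl_getOutput(graph, ...);  // per output element
//           llvm_hpvm_ocl_free(d_ptr);            // per device pointer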
void CGT_OpenCL::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
const Twine &FileName) {
// Check if clone already exists. If it does, it means we have visited this
// function before.
// assert(N->getGenFunc() == NULL && "Code already generated for this node");
assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) == NULL &&
"Code already generated for this node");
// Useful values
Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
// If kernel struct has not been initialized with kernel function, then fail
assert(K != NULL && "No kernel found!!");
DEBUG(errs() << "Generating kernel call code\n");
Function *F = N->getFuncPointer();
// Create a clone of F with no instructions. Only the type is the same as F,
// without the extra arguments.
Function *F_CPU;
// Clone the function, if we are seeing this function for the first time. We
// only need a clone in terms of type.
ValueToValueMapTy VMap;
// Create new function with the same type
F_CPU =
Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
// Loop over the arguments, copying the names of arguments over.
Function::arg_iterator dest_iterator = F_CPU->arg_begin();
for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
i != e; ++i) {
dest_iterator->setName(i->getName()); // Copy the name over...
// Increment dest iterator
++dest_iterator;
}
// Add a basic block to this empty function
BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_CPU);
ReturnInst *RI = ReturnInst::Create(
M.getContext(), UndefValue::get(F_CPU->getReturnType()), BB);
// FIXME: Adding Index and Dim arguments is probably not required except
// for consistency purposes (DFG2LLVM_CPU does assume that all leaf nodes
// have those arguments)
// Add Index and Dim arguments except for the root node
if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
F_CPU = addIdxDimArgs(F_CPU);
BB = &*F_CPU->begin();
RI = cast<ReturnInst>(BB->getTerminator());
// Add the generated function info to DFNode
// N->setGenFunc(F_CPU, hpvm::CPU_TARGET);
N->addGenFunc(F_CPU, hpvm::GPU_TARGET, true);
DEBUG(errs() << "Added GPUGenFunc: " << F_CPU->getName() << " for node "
<< N->getFuncPointer()->getName() << "\n");
// Loop over the arguments, to create the VMap
dest_iterator = F_CPU->arg_begin();
for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
i != e; ++i) {
// Add mapping to VMap and increment dest iterator
VMap[&*i] = &*dest_iterator;
++dest_iterator;
}
/* TODO: Use this code to verify whether this is a good pattern for PTX kernel
// Sort children in topological order before code generation for kernel call
N->getChildGraph()->sortChildren();
// The DFNode N has the property that it has only one child (leaving Entry
// and Exit dummy nodes). This child is the PTX kernel. This simplifies code
// generation for kernel calls significantly. All the inputs to this child
// node would either be constants or from the parent node N.
assert(N->getChildGraph()->size() == 3
&& "Node expected to have just one non-dummy node!");
DFNode* C;
for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
ce = N->getChildGraph()->end(); ci != ce; ++ci) {
C = *ci;
// Skip dummy node call
if (!C->isDummyNode())
break;
}
assert(C->isDummyNode() == false && "Internal Node only contains dummy
nodes!");
Function* CF = C->getFuncPointer();
*/
Function *KF = K->KernelLeafNode->getFuncPointer();
// Initialize context
// DEBUG(errs() << "Initializing context" << "\n");
// CallInst::Create(llvm_hpvm_ocl_initContext, None, "", RI);
DEBUG(errs() << "Initializing commandQ"
<< "\n");
// Initialize command queue
switchToTimer(hpvm_TimerID_SETUP, InitCall);
Value *fileStr = getStringPointer(FileName, InitCall, "Filename");
DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
DEBUG(errs() << "Generating code for kernel - "
<< K->KernelFunction->getName() << "\n");
Value *kernelStr =
getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName");
Value *LaunchInstArgs[] = {fileStr, kernelStr};
DEBUG(errs() << "Inserting launch call"
<< "\n");
CallInst *OpenCL_Ctx = CallInst::Create(llvm_hpvm_ocl_launch,
ArrayRef<Value *>(LaunchInstArgs, 2),
"graph" + KF->getName(), InitCall);
DEBUG(errs() << *OpenCL_Ctx << "\n");
GraphIDAddr = new GlobalVariable(
M, OpenCL_Ctx->getType(), false, GlobalValue::CommonLinkage,
Constant::getNullValue(OpenCL_Ctx->getType()),
"graph" + KF->getName() + ".addr");
DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
StoreInst *SI = new StoreInst(OpenCL_Ctx, GraphIDAddr, InitCall);
DEBUG(errs() << *SI << "\n");
switchToTimer(hpvm_TimerID_NONE, InitCall);
switchToTimer(hpvm_TimerID_SETUP, RI);
Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI);
// Iterate over the required input edges of the node and use the hpvm-rt API
// to set inputs
DEBUG(errs() << "Iterate over input edges of node and insert hpvm api\n");
std::vector<OutputPtr> OutputPointers;
// Vector to hold the device memory objects that need to be cleared before we
// release the context
std::vector<Value *> DevicePointers;
std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap();
/*
for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
// The kernel object gives us the mapping of arguments from kernel launch
// node function (F_CPU) to kernel (kernel->KF)
Value* inputVal = getArgumentAt(F_CPU, K->getInArgMap()[i]);
*/
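// For example (illustrative), a kernel parameter pair (float* A, size_t
// bytesA) at positions (i, i+1) produces a single llvm_hpvm_ocl_argument_ptr
// call for position i, using the value at position i+1 as its size; a plain
// scalar parameter is instead spilled to a stack slot and passed through
// llvm_hpvm_ocl_argument_scalar.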
for (auto &InArgMapPair : kernelInArgMap) {
unsigned i = InArgMapPair.first;
Value *inputVal = getArgumentAt(F_CPU, InArgMapPair.second);
DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n");
// input value has been obtained.
// Check if input is a scalar value or a pointer operand
// For scalar values such as int, float, etc. the size is simply the size of
// the type on the target machine, but for pointers, the size of the data is
// the next integer argument
if (inputVal->getType()->isPointerTy()) {
switchToTimer(hpvm_TimerID_COPY_PTR, RI);
// Pointer Input
// CheckAttribute
Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False;
Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) &&
!(hasAttribute(KF, i, Attribute::In)))
? False
: True;
Argument *A = getArgumentAt(KF, i);
if (isOutput == True) {
DEBUG(errs() << *A << " is an OUTPUT argument\n");
}
if (isInput == True) {
DEBUG(errs() << *A << " is an INPUT argument\n");
}
Value *inputValI8Ptr = CastInst::CreatePointerCast(
inputVal, Type::getInt8PtrTy(M.getContext()),
inputVal->getName() + ".i8ptr", RI);
// Assert that the pointer argument size (next argument) is in the map
assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end());
Value *inputSize = getArgumentAt(F_CPU, kernelInArgMap[i + 1]);
assert(
inputSize->getType() == Type::getInt64Ty(M.getContext()) &&
"Pointer type input must always be followed by size (integer type)");
Value *setInputArgs[] = {
GraphID,
inputValI8Ptr,
ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
inputSize,
isInput,
isOutput};
Value *d_ptr =
CallInst::Create(llvm_hpvm_ocl_argument_ptr,
ArrayRef<Value *>(setInputArgs, 6), "", RI);
DevicePointers.push_back(d_ptr);
// If this has out attribute, store the returned device pointer in
// memory to read device memory later
if (isOutput == True)
OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
} else {
switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
// Scalar Input
// Store the scalar value on stack and then pass the pointer to its
// location
AllocaInst *inputValPtr = new AllocaInst(
inputVal->getType(), 0, inputVal->getName() + ".ptr", RI);
new StoreInst(inputVal, inputValPtr, RI);
Value *inputValI8Ptr = CastInst::CreatePointerCast(
inputValPtr, Type::getInt8PtrTy(M.getContext()),
inputVal->getName() + ".i8ptr", RI);
Value *setInputArgs[] = {
GraphID, inputValI8Ptr,
ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
ConstantExpr::getSizeOf(inputVal->getType())};
CallInst::Create(llvm_hpvm_ocl_argument_scalar,
ArrayRef<Value *>(setInputArgs, 4), "", RI);
}
}
DEBUG(
errs() << "Setup shared memory arguments of node and insert hpvm api\n");
// Check to see if all the allocation sizes are constant (determined
// statically)
bool constSizes = true;
for (auto &e : K->getSharedInArgMap()) {
constSizes &= isa<Constant>(e.second.first);
}
// If the sizes are all constant
if (constSizes) {
for (auto &e : K->getSharedInArgMap()) {
unsigned argNum = e.first;
Value *allocSize = e.second.first;
DEBUG(errs() << "\tLocal Memory at " << argNum
<< ", size = " << *allocSize << "\n");
if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
// Shared memory ptr argument - scalar at size position
switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
assert(isa<Constant>(allocSize) &&
"Constant shared memory size is expected");
Value *setInputArgs[] = {
GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
allocSize};
CallInst::Create(llvm_hpvm_ocl_argument_shared,
ArrayRef<Value *>(setInputArgs, 3), "", RI);
} else {
// Shared memory size argument - scalar at address position
switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
// Store the scalar value on stack and then pass the pointer to its
// location
AllocaInst *allocSizePtr =
new AllocaInst(allocSize->getType(), 0,
allocSize->getName() + ".sharedMem.ptr", RI);
new StoreInst(allocSize, allocSizePtr, RI);
Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
allocSizePtr, Type::getInt8PtrTy(M.getContext()),
allocSize->getName() + ".sharedMem.i8ptr", RI);
Value *setInputArgs[] = {
GraphID, allocSizeI8Ptr,
ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
ConstantExpr::getSizeOf(allocSize->getType())};
CallInst::Create(llvm_hpvm_ocl_argument_scalar,
ArrayRef<Value *>(setInputArgs, 4), "", RI);
}
}
} else {
Function *F_alloc = K->AllocationFunction;
StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
assert(FAllocRetTy && "Allocation node with no struct return type");
std::vector<Value *> AllocInputArgs;
for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
AllocInputArgs.push_back(getArgumentAt(F_CPU, K->allocInArgMap.at(i)));
}
CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
std::vector<ExtractValueInst *> ExtractValueInstVec;
for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
ExtractValueInstVec.push_back(EI);
}
for (auto &e : K->getSharedInArgMap()) {
unsigned argNum = e.first;
Value *allocSize = ExtractValueInstVec[e.second.second / 2];
DEBUG(errs() << "\tLocal Memory at " << argNum
<< ", size = " << *allocSize << "\n");
if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
// Shared memory ptr argument - scalar at size position
switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
Value *setInputArgs[] = {
GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
allocSize};
CallInst::Create(llvm_hpvm_ocl_argument_shared,
ArrayRef<Value *>(setInputArgs, 3), "", RI);
} else {
// Shared memory size argument - scalar at address position
switchToTimer(hpvm_TimerID_COPY_SCALAR, RI);
// Store the scalar value on stack and then pass the pointer to its
// location
AllocaInst *allocSizePtr =
new AllocaInst(allocSize->getType(), 0,
allocSize->getName() + ".sharedMem.ptr", RI);
new StoreInst(allocSize, allocSizePtr, RI);
Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
allocSizePtr, Type::getInt8PtrTy(M.getContext()),
allocSize->getName() + ".sharedMem.i8ptr", RI);
Value *setInputArgs[] = {
GraphID, allocSizeI8Ptr,
ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
ConstantExpr::getSizeOf(allocSize->getType())};
CallInst::Create(llvm_hpvm_ocl_argument_scalar,
ArrayRef<Value *>(setInputArgs, 4), "", RI);
}
}
}
DEBUG(errs() << "Setup output edges of node and insert hpvm api\n");
// Set output if struct is not an empty struct
StructType *OutputTy = K->KernelLeafNode->getOutputType();
std::vector<Value *> d_Outputs;
if (!OutputTy->isEmptyTy()) {
switchToTimer(hpvm_TimerID_COPY_PTR, RI);
// Not an empty struct
// Iterate over all elements of the struct and put them in
for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
unsigned outputIndex = KF->getFunctionType()->getNumParams() + i;
Value *setOutputArgs[] = {
GraphID,
ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex),
ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
CallInst *d_Output = CallInst::Create(llvm_hpvm_ocl_output_ptr,
ArrayRef<Value *>(setOutputArgs, 3),
"d_output." + KF->getName(), RI);
d_Outputs.push_back(d_Output);
}
}
// Enqueue kernel
// Need work dim, localworksize, globalworksize
// Allocate size_t[numDims] space on stack. Store the work group sizes and
// pass it as an argument to ExecNode
switchToTimer(hpvm_TimerID_MISC, RI);
Value *workDim, *LocalWGPtr, *GlobalWGPtr;
getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
switchToTimer(hpvm_TimerID_KERNEL, RI);
Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr};
CallInst *Event = CallInst::Create(llvm_hpvm_ocl_executeNode,
ArrayRef<Value *>(ExecNodeArgs, 4),
"event." + KF->getName(), RI);
DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
// Wait for Kernel to Finish
CallInst::Create(llvm_hpvm_ocl_wait, ArrayRef<Value *>(GraphID), "", RI);
switchToTimer(hpvm_TimerID_READ_OUTPUT, RI);
// Read Output Struct if not empty
if (!OutputTy->isEmptyTy()) {
std::vector<Value *> h_Outputs;
Value *KernelOutput = UndefValue::get(OutputTy);
for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
Value *GetOutputArgs[] = {
GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
CallInst *h_Output = CallInst::Create(
llvm_hpvm_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4),
"h_output." + KF->getName() + ".addr", RI);
// Read each device pointer listed in output struct
// Load the output struct
CastInst *BI = BitCastInst::CreatePointerCast(
h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr",
RI);
Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI);
KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement,
ArrayRef<unsigned>(i),
KF->getName() + "output", RI);
}
OutputMap[K->KernelLeafNode] = KernelOutput;
}
// Read all the pointer arguments which had side effects, i.e., had the out
// attribute
DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n");
// FIXME: Not reading output pointers anymore as we read them when data is
// actually requested
/*for(auto output: OutputPointers) {
DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr,
output.bytes}; CallInst* CI = CallInst::Create(llvm_hpvm_ocl_getOutput,
ArrayRef<Value*>(GetOutputArgs, 4),
"", RI);
}*/
switchToTimer(hpvm_TimerID_MEM_FREE, RI);
// Clear Context and free device memory
DEBUG(errs() << "Clearing context"
<< "\n");
// Free Device Memory
for (auto d_ptr : DevicePointers) {
CallInst::Create(llvm_hpvm_ocl_free, ArrayRef<Value *>(d_ptr), "", RI);
}
switchToTimer(hpvm_TimerID_CLEAR_CTX, CleanupCall);
// Clear Context
LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall);
CallInst::Create(llvm_hpvm_ocl_clearContext, ArrayRef<Value *>(LI), "",
CleanupCall);
switchToTimer(hpvm_TimerID_NONE, CleanupCall);
switchToTimer(hpvm_TimerID_MISC, RI);
DEBUG(errs() << "*** Generating epilogue code for the function****\n");
// Generate code for output bindings
// Get Exit node
DFNode *C = N->getChildGraph()->getExit();
// Get OutputType of this node
StructType *OutTy = N->getOutputType();
Value *retVal = UndefValue::get(F_CPU->getReturnType());
// Find the kernel's output arg map, to use instead of the bindings
std::vector<unsigned> outArgMap = kernel->getOutArgMap();
// Find all the input edges to exit node
for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
DEBUG(errs() << "Output Edge " << i << "\n");
// Find the incoming edge at the requested input port
DFEdge *E = C->getInDFEdgeAt(i);
assert(E && "No Binding for output element!");
// Find the Source DFNode associated with the incoming edge
DFNode *SrcDF = E->getSourceDF();
DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
<< "\n");
// If Source DFNode is a dummyNode, edge is from parent. Get the
// argument from argument list of this internal node
Value *inputVal;
if (SrcDF->isEntryNode()) {
inputVal = getArgumentAt(F_CPU, i);
DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
} else {
// edge is from an internal node
// Check - code should already be generated for this source dfnode
// FIXME: Since the 2-level kernel code gen has a specific structure, we
// can assume the SrcDF is the same as the kernel leaf node.
// Use outArgMap to get correct mapping
SrcDF = K->KernelLeafNode;
assert(OutputMap.count(SrcDF) &&
"Source node call not found. Dependency violation!");
// Find Output Value associated with the Source DFNode using OutputMap
Value *CI = OutputMap[SrcDF];
// Extract element at source position from this call instruction
std::vector<unsigned> IndexList;
// i is the destination of DFEdge E
// Use the mapping instead of the bindings
// IndexList.push_back(E->getSourcePosition());
IndexList.push_back(outArgMap[i]);
DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
inputVal = EI;
}
std::vector<unsigned> IdxList;
IdxList.push_back(i);
retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
}
DEBUG(errs() << "Extracted all\n");
switchToTimer(hpvm_TimerID_NONE, RI);
retVal->setName("output");
ReturnInst *newRI = ReturnInst::Create(F_CPU->getContext(), retVal);
ReplaceInstWithInst(RI, newRI);
}
// Right now, only targeting the one-level case. In general, device functions
// can return values, so we don't need to change them
void CGT_OpenCL::codeGen(DFInternalNode *N) {
DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName()
<< "\n");
if (KernelLaunchNode == NULL)
DEBUG(errs() << "No kernel launch node\n");
else {
DEBUG(errs() << "KernelLaunchNode: "
<< KernelLaunchNode->getFuncPointer()->getName() << "\n");
}
if (!KernelLaunchNode) {
DEBUG(errs()
<< "No code generated (host code for kernel launch complete).\n");
return;
}
if (N == KernelLaunchNode) {
DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
// TODO
// Now the remaining nodes to be visited should be ignored
KernelLaunchNode = NULL;
DEBUG(errs() << "Insert Runtime calls\n");
insertRuntimeCalls(N, kernel, getPTXFilename(M));
} else {
DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
// Keep track of the arguments order.
std::map<unsigned, unsigned> inmap1 = N->getInArgMap();
std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap();
// TODO: Structure assumed: one thread node, one allocation node (at most),
// TB node
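// The composed map forwards each kernel argument directly to the parent's
// argument list. E.g. (illustrative), if kernel arg 0 reads internal-node
// arg 2 (inmap2[0] = 2) and internal-node arg 2 reads parent arg 5
// (inmap1[2] = 5), then inmapFinal[0] = 5.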
std::map<unsigned, unsigned> inmapFinal;
for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(),
ie = inmap2.end();
ib != ie; ++ib) {
inmapFinal[ib->first] = inmap1[ib->second];
}
kernel->setInArgMap(inmapFinal);
// Keep track of the output arguments order.
std::vector<unsigned> outmap1 = N->getOutArgMap();
std::vector<unsigned> outmap2 = kernel->getOutArgMap();
// TODO: Change when we have incoming edges to the dummy exit node from more
// than one nodes. In this case, the number of bindings is the same, but
// their destination position, thus the index in outmap1, is not
// 0 ... outmap2.size()-1
// The limit is the size of outmap2, because this is the number of kernel
// output arguments for which the mapping matters
// For now, it is reasonable to assume that all the kernel arguments are
// returned, possibly plus some others from other nodes; thus outmap2.size() <=
// outmap1.size()
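// E.g. (illustrative), if binding i of this node forwards leaf output
// outmap1[i] = j, and the kernel maps output j to position outmap2[j],
// then after this loop outmap1[i] = outmap2[j].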
for (unsigned i = 0; i < outmap2.size(); i++) {
outmap1[i] = outmap2[outmap1[i]];
}
kernel->setOutArgMap(outmap1);
// Track the source of local dimlimits for the kernel
// Dimension limit can either be a constant or an argument of parent
// function. Since Internal node would no longer exist, we need to insert
// the localWGSize with values from the parent of N.
std::vector<Value *> localWGSizeMapped;
for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
if (isa<Constant>(kernel->localWGSize[i])) {
// if constant, use as it is
localWGSizeMapped.push_back(kernel->localWGSize[i]);
} else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
// if argument, find the argument location in N. Use InArgMap of N to
// find the source location in Parent of N. Retrieve the argument from
// parent to insert in the vector.
unsigned argNum = Arg->getArgNo();
// This argument will be coming from the parent node, not the allocation
// Node
assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
unsigned parentArgNum = N->getInArgMap()[argNum];
Argument *A =
getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
localWGSizeMapped.push_back(A);
} else {
assert(
false &&
"LocalWGsize using value which is neither argument nor constant!");
}
}
// Update localWGSize vector of kernel
kernel->setLocalWGSize(localWGSizeMapped);
}
}
void CGT_OpenCL::codeGen(DFLeafNode *N) {
DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName()
<< "\n");
// Skip code generation if it is a dummy node
if (N->isDummyNode()) {
DEBUG(errs() << "Skipping dummy node\n");
return;
}
// Skip code generation if it is an allocation node
if (N->isAllocationNode()) {
DEBUG(errs() << "Skipping allocation node\n");
return;
}
// Generate code only if it has the right hint
// if(!checkPreferredTarget(N, hpvm::GPU_TARGET)) {
// errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
// return;
// }
if (!preferredTargetIncludes(N, hpvm::GPU_TARGET)) {
DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName()
<< "\n");
return;
}
// Checking which node is the kernel launch
DFNode *PNode = N->getParent();
int pLevel = PNode->getLevel();
int pReplFactor = PNode->getNumOfDim();
// Choose parent node as kernel launch if:
// (1) Parent is the top level node i.e., Root of DFG
// OR
// (2) Parent does not have multiple instances
DEBUG(errs() << "pLevel = " << pLevel << "\n");
DEBUG(errs() << "pReplFactor = " << pReplFactor << "\n");
assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node.");
// Only these options are supported
enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy;
if (pLevel == 1 || !pReplFactor) {
DEBUG(errs()
<< "*************** Kernel Gen: 1-Level Hierarchy **************\n");
SelectedHierarchy = ONE_LEVEL;
KernelLaunchNode = PNode;
kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(),
N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits());
} else {
// Converting a 2-level DFG to opencl kernel
DEBUG(errs()
<< "*************** Kernel Gen: 2-Level Hierarchy **************\n");
assert((pLevel >= 2) &&
"Selected node not nested deep enough to be Kernel Node.");
SelectedHierarchy = TWO_LEVEL;
KernelLaunchNode = PNode->getParent();
assert((PNode->getNumOfDim() == N->getNumOfDim()) &&
"Dimension number must match");
// Contains the instructions generating the kernel configuration parameters
kernel = new Kernel(NULL, // kernel function
N, // kernel leaf node
N->getInArgMap(), // kernel argument mapping
N->getSharedInArgMap(),
N->getOutArgMap(), // kernel output mapping from the
// leaf to the intermediate node
PNode->getNumOfDim(), // gridDim
PNode->getDimLimits(), // grid size
N->getNumOfDim(), // blockDim
N->getDimLimits()); // block size
}
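// In OpenCL terms, the selected hierarchy maps the leaf node's instances to
// work-items and, in the two-level case, the parent (thread block) node's
// instances to work-groups; the intrinsic translation below relies on this
// mapping.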
std::vector<Instruction *> IItoRemove;
BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
// Get the function associated with the dataflow node
Function *F = N->getFuncPointer();
// Look up if we have visited this function before. If we have, then just
// get the cloned function pointer from DFNode. Otherwise, create the cloned
// function and add it to the DFNode GenFunc.
// Function *F_opencl = N->getGenFunc();
Function *F_opencl = N->getGenFuncForTarget(hpvm::GPU_TARGET);
assert(F_opencl == NULL &&
"Error: Visiting a node for which code already generated");
// Clone the function
ValueToValueMapTy VMap;
// F_opencl->setName(FName+"_opencl");
Twine FName = F->getName();
StringRef fStr = FName.getSingleStringRef();
Twine newFName = Twine(fStr, "_opencl");
F_opencl = CloneFunction(F, VMap);
F_opencl->setName(newFName);
// errs() << "Old Function Name: " << F->getName() << "\n";
// errs() << "New Function Name: " << F_opencl->getName() << "\n";
F_opencl->removeFromParent();
// Insert the cloned function into the kernels module
KernelM->getFunctionList().push_back(F_opencl);
// TODO: Iterate over all the instructions of F_opencl and identify the
// callees and clone them into this module.
DEBUG(errs() << *F_opencl->getType());
DEBUG(errs() << *F_opencl);
// Transform the function to void and remove all target dependent attributes
// from the function
F_opencl = transformFunctionToVoid(F_opencl);
// Add generated function info to DFNode
// N->setGenFunc(F_opencl, hpvm::GPU_TARGET);
N->addGenFunc(F_opencl, hpvm::GPU_TARGET, false);
DEBUG(
errs()
<< "Removing all attributes from Kernel Function and adding nounwind\n");
F_opencl->removeAttributes(AttributeList::FunctionIndex,
F_opencl->getAttributes().getFnAttributes());
F_opencl->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
// FIXME: For now, assume only one allocation node
kernel->AllocationNode = NULL;
for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(),
iee = N->indfedge_end();
ieb != iee; ++ieb) {
DFNode *SrcDFNode = (*ieb)->getSourceDF();
DEBUG(errs() << "Found edge from node: "
<< " " << SrcDFNode->getFuncPointer()->getName() << "\n");
DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode()
<< "\n");
if (!SrcDFNode->isDummyNode()) {
assert(SrcDFNode->isAllocationNode());
kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
kernel->allocInArgMap = SrcDFNode->getInArgMap();
break;
}
}
// Vector for shared memory arguments
std::vector<unsigned> SharedMemArgs;
// If no allocation node was found, SharedMemArgs is empty
if (kernel->AllocationNode) {
ValueToValueMapTy VMap;
Function *F_alloc =
CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
// F_alloc->removeFromParent();
// Insert the cloned function into the kernels module
// M.getFunctionList().push_back(F_alloc);
std::vector<IntrinsicInst *> HPVMMallocInstVec;
findIntrinsicInst(F_alloc, Intrinsic::hpvm_malloc, HPVMMallocInstVec);
for (unsigned i = 0; i < HPVMMallocInstVec.size(); i++) {
IntrinsicInst *II = HPVMMallocInstVec[i];
assert(II->hasOneUse() && "hpvm_malloc result is used more than once");
II->replaceAllUsesWith(
ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
II->eraseFromParent();
}
kernel->AllocationFunction = F_alloc;
// This could be used to check that the allocation node has the appropriate
// number of fields in its return struct
/*
ReturnInst *RI = ReturnInstVec[0];
Value *RetVal = RI->getReturnValue();
Type *RetTy = RetVal->getType();
StructType *RetStructTy = dyn_cast<StructType>(RetTy);
assert(RetStructTy && "Allocation node does not return a struct type");
unsigned numFields = RetStructTy->getNumElements();
*/
std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap =
kernel->getSharedInArgMap();
AllocationNodeProperty *APN =
(AllocationNodeProperty *)kernel->AllocationNode->getProperty(
DFNode::Allocation);
for (auto &AllocPair : APN->getAllocationList()) {
unsigned destPos = AllocPair.first->getDestPosition();
unsigned srcPos = AllocPair.first->getSourcePosition();
SharedMemArgs.push_back(destPos);
sharedInMap[destPos] =
std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
sharedInMap[destPos + 1] =
std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
}
kernel->setSharedInArgMap(sharedInMap);
}
std::sort(SharedMemArgs.begin(), SharedMemArgs.end());
// All pointer args which are not shared memory pointers have to be moved to
// global address space
unsigned argIndex = 0;
std::vector<unsigned> GlobalMemArgs;
for (Function::arg_iterator ai = F_opencl->arg_begin(),
ae = F_opencl->arg_end();
ai != ae; ++ai) {
if (ai->getType()->isPointerTy()) {
// If the argument is already chosen for the shared memory argument list,
// skip it. Else put it in the global memory argument list
if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) ==
0) {
GlobalMemArgs.push_back(argIndex);
}
}
argIndex++;
}
std::sort(GlobalMemArgs.begin(), GlobalMemArgs.end());
/* At this point, we assume that checks ensuring that SharedMemArgs only
contains pointer arguments in GLOBAL_ADDRSPACE have been performed by the
analysis pass */
// Optimization: global memory arguments which are not modified and whose
// loads do not depend on the node id of the current node should be moved to
// constant memory, subject to size limits of course
std::vector<unsigned> ConstantMemArgs =
globalToConstantMemoryOpt(&GlobalMemArgs, F_opencl);
F_opencl = changeArgAddrspace(F_opencl, ConstantMemArgs, GLOBAL_ADDRSPACE);
F_opencl = changeArgAddrspace(F_opencl, SharedMemArgs, SHARED_ADDRSPACE);
F_opencl = changeArgAddrspace(F_opencl, GlobalMemArgs, GLOBAL_ADDRSPACE);
// Function to replace call instructions to functions in the kernel
std::map<Function *, Function *> OrgToClonedFuncMap;
std::vector<Function *> FuncToBeRemoved;
auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) {
Function *NewFunc;
// Check if the called function has already been cloned before.
auto It = OrgToClonedFuncMap.find(OrgFunc);
if (It == OrgToClonedFuncMap.end()) {
ValueToValueMapTy VMap;
NewFunc = CloneFunction(OrgFunc, VMap);
OrgToClonedFuncMap[OrgFunc] = NewFunc;
FuncToBeRemoved.push_back(NewFunc);
} else {
NewFunc = (*It).second;
}
// Replace the calls to this function
std::vector<Value *> args;
for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
args.push_back(CI->getArgOperand(i));
}
CallInst *Inst = CallInst::Create(
NewFunc, args,
OrgFunc->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
CI->replaceAllUsesWith(Inst);
IItoRemove.push_back(CI);
return NewFunc;
};
// Go through all the instructions
for (inst_iterator i = inst_begin(F_opencl), e = inst_end(F_opencl); i != e;
++i) {
Instruction *I = &(*i);
// Leaf nodes should not contain HPVM graph intrinsics or launch
assert(!BuildDFG::isHPVMLaunchIntrinsic(I) &&
"Launch intrinsic within a dataflow graph!");
assert(!BuildDFG::isHPVMGraphIntrinsic(I) &&
"HPVM graph intrinsic within a leaf dataflow node!");
if (BuildDFG::isHPVMIntrinsic(I)) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
IntrinsicInst *ArgII;
DFNode *ArgDFNode;
/************************ Handle HPVM Query intrinsics
* ************************/
switch (II->getIntrinsicID()) {
/**************************** llvm.hpvm.getNode()
* *****************************/
case Intrinsic::hpvm_getNode: {
DEBUG(errs() << F_opencl->getName() << "\t: Handling getNode\n");
// add mapping <intrinsic, this node> to the node-specific map
Leaf_HandleToDFNodeMap[II] = N;
IItoRemove.push_back(II);
} break;
/************************* llvm.hpvm.getParentNode()
* **************************/
case Intrinsic::hpvm_getParentNode: {
DEBUG(errs() << F_opencl->getName() << "\t: Handling getParentNode\n");
// get the parent node of the arg node
// get argument node
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
// get the parent node of the arg node
// Add mapping <intrinsic, parent node> to the node-specific map
// the argument node must have been added to the map, or else the
// code could not refer to it
Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
IItoRemove.push_back(II);
} break;
/*************************** llvm.hpvm.getNumDims()
* ***************************/
case Intrinsic::hpvm_getNumDims: {
DEBUG(errs() << F_opencl->getName() << "\t: Handling getNumDims\n");
// get node from map
// get the appropriate field
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
int numOfDim = ArgDFNode->getNumOfDim();
DEBUG(errs() << "\t Got node dimension : " << numOfDim << "\n");
IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext());
ConstantInt *numOfDimConstant =
ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
// Replace the result of the intrinsic with the computed value
II->replaceAllUsesWith(numOfDimConstant);
IItoRemove.push_back(II);
} break;
/*********************** llvm.hpvm.getNodeInstanceID()
* ************************/
case Intrinsic::hpvm_getNodeInstanceID_x:
case Intrinsic::hpvm_getNodeInstanceID_y:
case Intrinsic::hpvm_getNodeInstanceID_z: {
DEBUG(errs() << F_opencl->getName()
<< "\t: Handling getNodeInstanceID\n"
<< "\t: " << *II << "\n");
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
assert(ArgDFNode && "Arg node is NULL");
// A leaf node always has a parent
DFNode *ParentDFNode = ArgDFNode->getParent();
assert(ParentDFNode && "Parent node of a leaf is NULL");
// Get the number associated with the required dimension
// FIXME: The order is important!
// These three intrinsics need to be consecutive x,y,z
uint64_t dim =
II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x;
assert((dim < 3) && "Invalid dimension argument");
DEBUG(errs() << "\t dimension = " << dim << "\n");
// Argument of the function to be called
ConstantInt *DimConstant =
ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
// ArrayRef<Value *> Args(DimConstant);
// The following is to find which function to call
Function *OpenCLFunction;
FunctionType *FT =
FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
Type::getInt32Ty(KernelM->getContext()), false);
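// Translation sketch for getNodeInstanceID (dim is 0/1/2 for x/y/z):
//   leaf id, one-level hierarchy          -> get_global_id(dim)
//   leaf id w.r.t. its parent (two-level) -> get_local_id(dim)
//   parent (thread block) id              -> get_group_id(dim)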
if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) {
// We only have one level in the hierarchy or the parent node is not
// replicated. This indicates that the parent node is the kernel
// launch, so we need to specify a global id.
// We can translate this only if the argument is the current node
// itself
DEBUG(errs() << "Substitute with get_global_id()\n");
DEBUG(errs() << *II << "\n");
OpenCLFunction = cast<Function>(
(KernelM->getOrInsertFunction(StringRef("get_global_id"), FT))
.getCallee());
} else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
// DEBUG(errs() << "Here inside cond 2\n");
// We are asking for this node's id with respect to its parent
// this is a local id call
OpenCLFunction = cast<Function>(
(KernelM->getOrInsertFunction(StringRef("get_local_id"), FT))
.getCallee());
// DEBUG(errs() << "exiting condition 2\n");
} else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
// We are asking for this node's parent's id with respect to its
// parent: this is a group id call
OpenCLFunction = cast<Function>(
(KernelM->getOrInsertFunction(StringRef("get_group_id"), FT))
.getCallee());
} else {
DEBUG(errs() << N->getFuncPointer()->getName() << "\n");
DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n");
DEBUG(errs() << *II << "\n");
assert(false && "Unable to translate getNodeInstanceID intrinsic");
}
// DEBUG(errs() << "Create call instruction, insert it before the
// instrinsic\n"); DEBUG(errs() << "Function: " << *OpenCLFunction <<
// "\n"); DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
// DEBUG(errs() << "Argument: " << Args[0] << "\n");
// DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
// Create call instruction, insert it before the intrinsic and
// replace the uses of the previous instruction with the new one
CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
// DEBUG(errs() << "Replace uses\n");
II->replaceAllUsesWith(CI);
IItoRemove.push_back(II);
} break;
/********************** llvm.hpvm.getNumNodeInstances()
* ***********************/
case Intrinsic::hpvm_getNumNodeInstances_x:
case Intrinsic::hpvm_getNumNodeInstances_y:
case Intrinsic::hpvm_getNumNodeInstances_z: {
// TODO: think about whether this is the best way to go. There are HW-
// specific registers, so it is good to have the intrinsic; but then, why
// do we need to keep that info in the graph? (Only for the kernel
// configuration during the call.)
DEBUG(errs() << F_opencl->getName()
<< "\t: Handling getNumNodeInstances\n");
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
// A leaf node always has a parent
DFNode *ParentDFNode = ArgDFNode->getParent();
assert(ParentDFNode && "Parent node of a leaf is NULL");
// Get the number associated with the required dimension
// FIXME: The order is important!
// These three intrinsics need to be consecutive x,y,z
uint64_t dim =
II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x;
assert((dim < 3) && "Invalid dimension argument");
DEBUG(errs() << "\t dimension = " << dim << "\n");
// Argument of the function to be called
ConstantInt *DimConstant =
ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
// ArrayRef<Value *> Args(DimConstant);
// The following is to find which function to call
Function *OpenCLFunction;
FunctionType *FT =
FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
Type::getInt32Ty(KernelM->getContext()), false);
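// Translation sketch for getNumNodeInstances (dim is 0/1/2 for x/y/z):
//   leaf instances, one-level hierarchy -> get_global_size(dim)
//   leaf instances w.r.t. parent        -> get_local_size(dim)
//   parent (thread block) instances     -> get_num_groups(dim)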
if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) {
// We only have one level in the hierarchy or the parent node is not
// replicated. This indicates that the parent node is the kernel
// launch, so the instances are global_size (gridDim x blockDim)
OpenCLFunction = cast<Function>(
(KernelM->getOrInsertFunction(StringRef("get_global_size"), FT))
.getCallee());
} else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
// We are asking for this node's instances
// this is a local size (block dim) call
OpenCLFunction = cast<Function>(
(KernelM->getOrInsertFunction(StringRef("get_local_size"), FT))
.getCallee());
} else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
// We are asking for this node's parent's instances
// this is a (global_size/local_size) (grid dim) call
OpenCLFunction = cast<Function>(
(KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT))
.getCallee());
} else {
assert(false && "Unable to translate getNumNodeInstances intrinsic");
}
// Create call instruction, insert it before the intrinsic and
// replace the uses of the previous instruction with the new one
CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
II->replaceAllUsesWith(CI);
IItoRemove.push_back(II);
} break;
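/****************************** llvm.hpvm.barrier()
 * ****************************/
// Replaced with a call to the OpenCL barrier() built-in; the literal flag 1
// passed below is assumed to correspond to CLK_LOCAL_MEM_FENCE in typical
// OpenCL headers.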
case Intrinsic::hpvm_barrier: {
DEBUG(errs() << F_opencl->getName() << "\t: Handling barrier\n");
DEBUG(errs() << "Substitute with barrier()\n");
DEBUG(errs() << *II << "\n");
FunctionType *FT = FunctionType::get(
Type::getVoidTy(KernelM->getContext()),
std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())),
false);
Function *OpenCLFunction = cast<Function>(
(KernelM->getOrInsertFunction(StringRef("barrier"), FT))
.getCallee());
CallInst *CI =
CallInst::Create(OpenCLFunction,
ArrayRef<Value *>(ConstantInt::get(
Type::getInt32Ty(KernelM->getContext()), 1)),
"", II);
II->replaceAllUsesWith(CI);
IItoRemove.push_back(II);
} break;
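/***************************** llvm.hpvm.atomic_*()
 * ****************************/
// Each HPVM atomic maps to the corresponding OpenCL 1.x built-in on an i32
// pointer, e.g. (illustrative) __hpvm__atomic_add(p, v) becomes
// atomic_add(p, v).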
case Intrinsic::hpvm_atomic_add:
case Intrinsic::hpvm_atomic_sub:
case Intrinsic::hpvm_atomic_xchg:
case Intrinsic::hpvm_atomic_min:
case Intrinsic::hpvm_atomic_max:
case Intrinsic::hpvm_atomic_and:
case Intrinsic::hpvm_atomic_or:
case Intrinsic::hpvm_atomic_xor: {
DEBUG(errs() << *II << "\n");
// Only have support for i32 atomic intrinsics
assert(II->getType() == Type::getInt32Ty(II->getContext()) &&
"Only support i32 atomic intrinsics for now");
// Substitute with atomicrmw instruction
assert(II->getNumArgOperands() == 2 &&
"Expecting 2 operands for these atomics");
Value *Ptr = II->getArgOperand(0);
Value *Val = II->getArgOperand(1);
assert(
Ptr->getType()->isPointerTy() &&
"First argument of supported atomics is expected to be a pointer");
PointerType *PtrTy = cast<PointerType>(Ptr->getType());
PointerType *TargetTy =
Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
if (PtrTy != TargetTy) {
Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
PtrTy = TargetTy;
}
std::string name;
if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_add)
name = "atomic_add";
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_sub)
name = "atomic_sub";
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xchg)
name = "atomic_xchg";
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_min)
name = "atomic_min";
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_max)
name = "atomic_max";
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_and)
name = "atomic_and";
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_or)
name = "atomic_or";
else if (II->getIntrinsicID() == Intrinsic::hpvm_atomic_xor)
name = "atomic_xor";
Type *paramTypes[] = {PtrTy, Val->getType()};
FunctionType *AtomFuncT = FunctionType::get(
II->getType(), ArrayRef<Type *>(paramTypes, 2), false);
FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);
Value *Params[] = {Ptr, Val};
CallInst *AtomCI = CallInst::Create(
AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II);
DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
II->replaceAllUsesWith(AtomCI);
IItoRemove.push_back(II);
} break;
default:
llvm_unreachable("Unknown HPVM Intrinsic!");
break;
}
} else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
IRBuilder<> Builder(I);
Value *Source = MemCpyI->getSource();
Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
Value *Length = MemCpyI->getOperand(2);
DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
DEBUG(errs() << "Source: " << *Source << "\n");
DEBUG(errs() << "Destination: " << *Destination << "\n");
DEBUG(errs() << "Length: " << *Length << "\n");
size_t memcpy_length;
unsigned int memcpy_count;
if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) {
if (CI->getBitWidth() <= 64) {
memcpy_length = CI->getSExtValue();
DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
Type *Source_Type = Source->getType()->getPointerElementType();
DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
memcpy_count =
memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
if (GetElementPtrInst *sourceGEPI =
dyn_cast<GetElementPtrInst>(Source)) {
if (GetElementPtrInst *destGEPI =
dyn_cast<GetElementPtrInst>(Destination)) {
Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
Value *DestPtrOperand = destGEPI->getPointerOperand();
for (unsigned i = 0; i < memcpy_count; ++i) {
Constant *increment;
LoadInst *newLoadI;
StoreInst *newStoreI;
// First, we need to increment the correct index for both source
// and dest. This involves checking how many indices the GEP has.
// Assume for now that only 1 or 2 are the viable options.
std::vector<Value *> GEPlIndex;
if (sourceGEPI->getNumIndices() == 1) {
Value *Index = sourceGEPI->getOperand(1);
increment = ConstantInt::get(Index->getType(), i, false);
Value *incAdd = Builder.CreateAdd(Index, increment);
DEBUG(errs() << "Add: " << *incAdd << "\n");
GEPlIndex.push_back(incAdd);
Value *newGEPIl = Builder.CreateGEP(
SourcePtrOperand, ArrayRef<Value *>(GEPlIndex));
DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
newLoadI = Builder.CreateLoad(newGEPIl);
DEBUG(errs() << "Load: " << *newLoadI << "\n");
} else {
llvm_unreachable("Unhandled case where source GEPI has more "
"than 1 indices!\n");
}
std::vector<Value *> GEPsIndex;
if (destGEPI->getNumIndices() == 1) {
} else if (destGEPI->getNumIndices() == 2) {
Value *Index0 = destGEPI->getOperand(1);
GEPsIndex.push_back(Index0);
Value *Index1 = destGEPI->getOperand(2);
increment = ConstantInt::get(Index1->getType(), i, false);
Value *incAdd = Builder.CreateAdd(Index1, increment);
DEBUG(errs() << "Add: " << *incAdd << "\n");
GEPsIndex.push_back(incAdd);
Value *newGEPIs = Builder.CreateGEP(
DestPtrOperand, ArrayRef<Value *>(GEPsIndex));
DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
newStoreI = Builder.CreateStore(newLoadI, newGEPIs,
MemCpyI->isVolatile());
DEBUG(errs() << "Store: " << *newStoreI << "\n");
} else {
llvm_unreachable("Unhandled case where dest GEPI has more "
"than 2 indices!\n");
}
}
IItoRemove.push_back(sourceGEPI);
IItoRemove.push_back(destGEPI);
Instruction *destBitcastI =
dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
Instruction *sourceBitcastI =
dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
IItoRemove.push_back(destBitcastI);
IItoRemove.push_back(sourceBitcastI);
IItoRemove.push_back(MemCpyI);
}
}
}
} else {
llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
}
// llvm_unreachable("HERE!");
}
else if (CallInst *CI = dyn_cast<CallInst>(I)) {
DEBUG(errs() << "Found a call: " << *CI << "\n");
Function *calleeF =
cast<Function>(CI->getCalledValue()->stripPointerCasts());
if (calleeF->isDeclaration()) {
// Add the declaration to kernel module
if (calleeF->getName() == "sqrtf") {
calleeF->setName(Twine("sqrt"));
DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
DEBUG(errs() << "CI: " << *CI << "\n");
} else if (calleeF->getName() == "rsqrtf") {
calleeF->setName(Twine("rsqrt"));
DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
DEBUG(errs() << "CI: " << *CI << "\n");
}
DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF
<< "\n");
KernelM->getOrInsertFunction(calleeF->getName(),
calleeF->getFunctionType());
} else {
// Check if the called function has already been cloned before.
Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
// Iterate over the new function to see if it calls any other functions
// in the module.
for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc);
i != e; ++i) {
if (auto *Call = dyn_cast<CallInst>(&*i)) {
Function *CalledFunc =
cast<Function>(Call->getCalledValue()->stripPointerCasts());
CloneAndReplaceCall(Call, CalledFunc);
}
}
}
// TODO: how to handle address space qualifiers in load/store
}
}
// Search for the pattern where a float pointer is cast to int and
// loaded/stored, and change it.
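// The matched pattern looks roughly like (illustrative IR):
//   %g = getelementptr float, float addrspace(1)* %p, i64 %idx
//   %b = bitcast float addrspace(1)* %g to i32 addrspace(1)*
//   %v = load i32, i32 addrspace(1)* %b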
DEBUG(errs() << "finding pattern for replacement!\n");
for (inst_iterator i = inst_begin(F_opencl), e = inst_end(F_opencl); i != e;
++i) {
bool cont = false;
bool keepGEPI = false;
bool keepGEPI2 = false;
Instruction *I = &(*i);
GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
if (!GEPI) {
// did not find pattern start, continue
continue;
}
// may have found pattern, check
DEBUG(errs() << "GEPI " << *GEPI << "\n");
// print whatever we want for debug
Value *PtrOp = GEPI->getPointerOperand();
Type *SrcTy = GEPI->getSourceElementType();
unsigned GEPIaddrspace = GEPI->getAddressSpace();
if (SrcTy->isArrayTy())
DEBUG(errs() << *SrcTy << " is an array type! "
<< *(SrcTy->getArrayElementType()) << "\n");
else
DEBUG(errs() << *SrcTy << " is not an array type!\n");
// check that source element type is float
if (SrcTy->isArrayTy()) {
if (!(SrcTy->getArrayElementType()->isFloatTy())) {
DEBUG(errs() << "GEPI type is array but not float!\n");
continue;
}
} else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) {
DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
// does not fit this pattern - no float GEP instruction
continue;
}
// check that the address space is 1
// if (GEPIaddrspace != 1) {
//   // does not fit this pattern - addrspace of pointer argument is not
//   // global
//   continue;
// }
if (!(GEPI->hasOneUse())) {
// does not fit this pattern - more than one use
// continue;
// Keep GEPI around if it has other uses
keepGEPI = true;
}
DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
// See if one of the users of the 1st GEPI is a bitcast
BitCastInst *BitCastI = nullptr;
for (User *U : GEPI->users()) {
if (Instruction *ui = dyn_cast<Instruction>(U)) {
DEBUG(errs() << "--" << *ui << "\n");
if (isa<BitCastInst>(ui)) {
BitCastI = dyn_cast<BitCastInst>(ui);
DEBUG(errs() << "---Found bitcast user of GEP\n");
break;
}
}
}
if (!BitCastI) {
DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
cont = true;
}
if (cont /*!BitCastI*/) {
continue; // not in pattern
}
// DEBUG(errs() << *BitCastI << "\n");
// Otherwise, check that the bitcast's destination type is i32*. Its source
// operand is necessarily the GEP, since this is a use of the GEP.
Value *Op2 = BitCastI->getOperand(0);
DEBUG(errs() << "----" << *Op2 << "\n");
// assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
// Type *OpTy = cast<Type>(Op2);
Type *OpTy = BitCastI->getDestTy();
DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
// DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) <<
// "\n");
if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
// maybe right syntax is (Type::getInt32Ty)->getPointerTo()
continue; // not in pattern
}
DEBUG(errs() << "----Here!\n");
// We have matched GEP followed by bitcast. Next, find the load among the
// bitcast's users.
if (!(BitCastI->hasOneUse())) {
// does not fit this pattern - more than one use
continue;
}
DEBUG(errs() << "----Bitcast has one use!\n");
// it has one use
assert(BitCastI->hasOneUse() && "BitCastI has a single use");
LoadInst *LoadI = nullptr;
for (User *U : BitCastI->users()) {
if (Instruction *ui = dyn_cast<Instruction>(U)) {
DEBUG(errs() << "-----" << *ui << "\n");
if (isa<LoadInst>(ui)) {
LoadI = dyn_cast<LoadInst>(ui);
DEBUG(errs() << "-----Found load as only use of bitcast\n");
break;
}
}
}
if (!LoadI) {
DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
cont = true;
}
if (cont) {
continue; // not in pattern
}
// Check that we load from the pointer we got from the bitcast; the load's
// pointer operand must be the bitcast we reached it from
assert(LoadI->getPointerOperand() == BitCastI &&
"Unexpected Load Instruction Operand\n");
// Next, find the store among the load's users.
if (!(LoadI->hasOneUse())) {
// does not fit this pattern - more than one use
continue;
// TODO: generalize: one load can have more than one store user
}
// it has one use
assert(LoadI->hasOneUse() && "LoadI has a single use");
Value::user_iterator ui = LoadI->user_begin();
// no loop needed, because it has a single use
StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
if (!StoreI) {
continue; // not in pattern
}
// Also check that the store uses the loaded value as the value operand
if (StoreI->getValueOperand() != LoadI) {
continue;
}
DEBUG(errs() << "-------Found store instruction\n");
// Look for its bitcast, which is its pointer operand
Value *StPtrOp = StoreI->getPointerOperand();
DEBUG(errs() << "-------" << *StPtrOp << "\n");
BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
DEBUG(errs() << "-------" << *BitCastI2 << "\n");
if (!BitCastI2) {
continue; // not in pattern
}
DEBUG(errs() << "-------- Found Bit Cast of store!\n");
// found bitcast. Look for the second GEP, i.e. the bitcast's source operand.
Value *BCFromOp = BitCastI2->getOperand(0);
GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
DEBUG(errs() << "---------- " << *GEPI2 << "\n");
if (!GEPI2) {
continue; // not in pattern
}
if (!(GEPI2->hasOneUse())) {
// does not fit this pattern - more than one use
// continue;
// Keep GEPI2 around if it has other uses
keepGEPI2 = true;
}
DEBUG(errs() << "---------- Found GEPI of Bitcast!\n");
Value *PtrOp2 = GEPI2->getPointerOperand();
// Found GEPI2. TODO: it is unclear exactly which additional checks are
// needed here; the int-float type checks performed above for the load side
// should be mirrored for the store side.
// Assume we found pattern
if (!keepGEPI) {
IItoRemove.push_back(GEPI);
DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
} else {
DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
}
IItoRemove.push_back(BitCastI);
DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
IItoRemove.push_back(LoadI);
DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
if (!keepGEPI2) {
IItoRemove.push_back(GEPI2);
DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
} else {
DEBUG(errs() << "Keeping " << *GEPI2 << " since it has multiple uses!\n");
}
IItoRemove.push_back(BitCastI2);
DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
IItoRemove.push_back(StoreI);
DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
std::vector<Value *> GEPlIndex;
if (GEPI->hasIndices()) {
for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
Value *Index = dyn_cast<Value>(&*ii);
DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
GEPlIndex.push_back(Index);
}
}
// ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
std::vector<Value *> GEPsIndex;
if (GEPI2->hasIndices()) {
for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
Value *Index = dyn_cast<Value>(&*ii);
DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
GEPsIndex.push_back(Index);
}
}
// ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
// ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
GetElementPtrInst *newlGEP = GetElementPtrInst::Create(
GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()),
PtrOp, // operand from 1st GEP
ArrayRef<Value *>(GEPlIndex), Twine(), StoreI);
DEBUG(errs() << "Adding: " << *newlGEP << "\n");
// insert the new load before the old store
LoadInst *newLoadI =
new LoadInst(Type::getFloatTy(M.getContext()),
newlGEP, // new GEP
Twine(), LoadI->isVolatile(), LoadI->getAlignment(),
LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI);
DEBUG(errs() << "Adding: " << *newLoadI << "\n");
// same for GEP for store, for store operand
GetElementPtrInst *newsGEP = GetElementPtrInst::Create(
GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
PtrOp2, // operand from 2nd GEP
ArrayRef<Value *>(GEPsIndex), Twine(), StoreI);
DEBUG(errs() << "Adding: " << *newsGEP << "\n");
// insert the new store before the old store
StoreInst *newStoreI =
new StoreInst(newLoadI,
newsGEP, // new GEP
StoreI->isVolatile(), StoreI->getAlignment(),
StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI);
DEBUG(errs() << "Adding: " << *newStoreI << "\n");
}
// We need to do this explicitly: the DCE pass will not remove them because
// we have assumed the worst memory behaviour for these function calls
// Traverse the vector backwards, otherwise definitions are deleted while
// their subsequent uses are still around
for (auto *I : reverse(IItoRemove)) {
DEBUG(errs() << "Erasing: " << *I << "\n");
I->eraseFromParent();
}
// Move the cloned functions from the parent module into the kernel module
for (auto *F : FuncToBeRemoved) {
F->removeFromParent(); // TODO: MARIA check
KernelM->getFunctionList().push_back(F);
}
addCLMetadata(F_opencl);
kernel->KernelFunction = F_opencl;
DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName()
<< "\n");
DEBUG(errs() << *KernelM);
return;
}
bool DFG2LLVM_OpenCL::runOnModule(Module &M) {
DEBUG(errs() << "\nDFG2LLVM_OpenCL PASS\n");
// Get the BuildDFG Analysis Results:
// - Dataflow graph
// - Maps from i8* handles to DFNode and DFEdge
BuildDFG &DFG = getAnalysis<BuildDFG>();
// DFInternalNode *Root = DFG.getRoot();
std::vector<DFInternalNode *> Roots = DFG.getRoots();
// Visitor for Code Generation Graph Traversal
CGT_OpenCL *CGTVisitor = new CGT_OpenCL(M, DFG);
// Iterate over all the DFGs and produce code for each one of them
for (auto rootNode : Roots) {
// Initiate code generation for root DFNode
CGTVisitor->visit(rootNode);
}
CGTVisitor->writeKernelsModule();
// TODO: Edit module epilogue to remove the HPVM intrinsic declarations
delete CGTVisitor;
return true;
}
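// Returns the file name for the generated kernels module; e.g. a module
// identified as "foo.bc" yields "foo.bc.kernels.ll".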
std::string CGT_OpenCL::getKernelsModuleName(Module &M) {
std::string mid = M.getModuleIdentifier();
return mid.append(".kernels.ll");
}
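// Recursively mutate the pointer type of V to the given address space, and
// propagate the change to every user that itself produces a pointer in V's
// old address space (e.g. GEPs and bitcasts derived from an argument).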
void CGT_OpenCL::fixValueAddrspace(Value *V, unsigned addrspace) {
assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!");
PointerType *OldTy = cast<PointerType>(V->getType());
PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace);
V->mutateType(NewTy);
for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
ui++) {
// Change all uses producing a pointer type in the same address space to the
// new address space.
if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) {
if (PTy->getAddressSpace() == OldTy->getAddressSpace()) {
fixValueAddrspace(*ui, addrspace);
}
}
}
}
std::vector<unsigned>
CGT_OpenCL::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs,
Function *F) {
std::vector<unsigned> ConstantMemArgs;
for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
++ai) {
Argument *arg = &*ai;
std::vector<unsigned>::iterator pos = std::find(
GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo());
// It has to be a global memory argument to be promotable
if (pos == GlobalMemArgs->end())
continue;
// Check if it can/should be promoted
if (canBePromoted(arg, F)) {
DEBUG(errs() << "Promoting << " << arg->getName()
<< " to constant memory."
<< "\n");
ConstantMemArgs.push_back(arg->getArgNo());
GlobalMemArgs->erase(pos);
}
}
return ConstantMemArgs;
}
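// Clone F with the pointer arguments listed in Args mutated to the given
// address space; e.g. a global-memory "float*" argument becomes
// "float addrspace(1)*" when called with GLOBAL_ADDRSPACE.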
Function *CGT_OpenCL::changeArgAddrspace(Function *F,
std::vector<unsigned> &Args,
unsigned addrspace) {
unsigned idx = 0;
std::vector<Type *> ArgTypes;
for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
++ai) {
Argument *arg = &*ai;
DEBUG(errs() << *arg << "\n");
unsigned argno = arg->getArgNo();
if ((idx < Args.size()) && (argno == Args[idx])) {
fixValueAddrspace(arg, addrspace);
idx++;
}
ArgTypes.push_back(arg->getType());
}
FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
// F->mutateType(PTy);
Function *newF = cloneFunction(F, newFT, false);
replaceNodeFunctionInIR(*F->getParent(), F, newF);
DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n");
return newF;
}
/* Add metadata to module KernelM, for OpenCL kernels */
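// For a kernel @K, the emitted metadata looks roughly like (illustrative):
//   !opencl.kernels   = !{!0}
//   !nvvm.annotations = !{!1}
//   !0 = !{void (...)* @K}
//   !1 = !{void (...)* @K, !"kernel", i32 1}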
void CGT_OpenCL::addCLMetadata(Function *F) {
IRBuilder<> Builder(&*F->begin());
SmallVector<Metadata *, 8> KernelMD;
KernelMD.push_back(ValueAsMetadata::get(F));
// TODO: There is additional metadata used by kernel files, but we skip it as
// it is not mandatory. In the future it might be useful for enabling
// optimizations
MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
NamedMDNode *MDN_kernels =
KernelM->getOrInsertNamedMetadata("opencl.kernels");
MDN_kernels->addOperand(MDKernelNode);
KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
// TODO: Replace 1 with the number of the kernel.
// Add when support for multiple launches is added
KernelMD.push_back(ValueAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)));
MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
NamedMDNode *MDN_annotations =
KernelM->getOrInsertNamedMetadata("nvvm.annotations");
MDN_annotations->addOperand(MDNvvmAnnotationsNode);
}
void CGT_OpenCL::writeKernelsModule() {
// Run a print pass over the kernels module to write it out as LLVM
// assembly.
legacy::PassManager Passes;
DEBUG(errs() << "Writing to File --- ");
DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n");
std::error_code EC;
ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
if (EC) {
DEBUG(errs() << EC.message() << '\n');
}
Passes.add(createPrintModulePass(Out.os()));
Passes.run(*KernelM);
// Declare success.
Out.keep();
}
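// For example (types illustrative), a leaf node function
//   { float, i32 } @f(float* %in)
// is rewritten as
//   void @f(float* %in, float* %ret_arg, i32* %ret_arg1)
// where each element of the old return struct is extracted and stored
// through the corresponding out-pointer before returning void.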
Function *CGT_OpenCL::transformFunctionToVoid(Function *F) {
DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
// FIXME: Maybe do that using the Node?
StructType *FRetTy = dyn_cast<StructType>(F->getReturnType());
assert(FRetTy && "Return Type must always be a struct");
// Keeps return statements, because we will need to replace them
std::vector<ReturnInst *> RItoRemove;
findReturnInst(F, RItoRemove);
std::vector<Type *> RetArgTypes;
std::vector<Argument *> RetArgs;
std::vector<Argument *> Args;
// Check for { } return struct, which means that the function returns void
if (FRetTy->isEmptyTy()) {
DEBUG(errs() << "\tFunction output struct is void\n");
DEBUG(errs() << "\tNo parameters added\n");
// Replacing return statements with others returning void
for (auto *RI : RItoRemove) {
ReturnInst::Create((F->getContext()), 0, RI);
RI->eraseFromParent();
}
DEBUG(errs() << "\tChanged return statements to return void\n");
} else {
// The struct has return values, thus needs to be converted to parameter
// Iterate over all element types of return struct and add arguments to the
// function
for (unsigned i = 0; i < FRetTy->getNumElements(); i++) {
Argument *RetArg =
new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
RetArgs.push_back(RetArg);
RetArgTypes.push_back(RetArg->getType());
DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
}
DEBUG(errs() << "\tReplacing Return statements\n");
// Replace return statements with extractValue and store instructions
for (auto *RI : RItoRemove) {
Value *RetVal = RI->getReturnValue();
for (unsigned i = 0; i < RetArgs.size(); i++) {
ExtractValueInst *EI = ExtractValueInst::Create(
RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI);
new StoreInst(EI, RetArgs[i], RI);
}
// assert(RetVal && "Return value should not be null at this point");
// StructType* RetType = cast<StructType>(RetVal->getType());
// assert(RetType && "Return type is not a struct");
ReturnInst::Create((F->getContext()), 0, RI);
RI->eraseFromParent();
}
}
DEBUG(errs() << "\tReplaced return statements\n");
// Create the argument type list with the added argument's type
std::vector<Type *> ArgTypes;
for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
ai != ae; ++ai) {
ArgTypes.push_back(ai->getType());
}
for (auto *RATy : RetArgTypes) {
ArgTypes.push_back(RATy);
}
// Creating Args vector to use in cloning!
for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
++ai) {
Args.push_back(&*ai);
}
for (auto *ai : RetArgs) {
Args.push_back(ai);
}
// Adding new arguments to the function argument list does not change the
// function type, so we need to change the type of this function to reflect
// the added arguments
Type *VoidRetType = Type::getVoidTy(F->getContext());
FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
// Change the function type
// F->mutateType(PTy);
Function *newF = cloneFunction(F, newFT, false, NULL, &Args);
replaceNodeFunctionInIR(*F->getParent(), F, newF);
// F->eraseFromParent();
return newF;
}
/******************************************************************************
* Helper functions *
******************************************************************************/
// Check if argument arg can be promoted to constant memory in Function F
// Condition:
// 1. No stores
// 2. Loads not dependent on getNodeInstanceID intrinsic
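// For example, an argument that is only read through GEP+load chains with
// node-instance-invariant indices is promotable, while one that is stored
// to, used by an llvm.hpvm.atomic.* intrinsic, or indexed by a value
// derived from getNodeInstanceID is not.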
static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList,
std::vector<Value *> *VisitedList) {
if (std::find(VisitedList->begin(), VisitedList->end(), V) !=
VisitedList->end()) {
DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
return false;
}
VisitedList->push_back(V);
for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
++ui) {
Instruction *I = dyn_cast<Instruction>(*ui);
if (!I) {
// if use is not an instruction, then skip it
continue;
}
DEBUG(errs() << "\t" << *I << "\n");
if (isa<LoadInst>(I)) {
DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
UseList->push_back(V);
} else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
// found a store in use chain
DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
return true;
} else if (BuildDFG::isHPVMIntrinsic(I)) {
// If it is an atomic intrinsic, we found a store
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
assert(II &&
II->getCalledValue()->getName().startswith("llvm.hpvm.atomic") &&
"Only hpvm atomic intrinsics can have an argument as input");
return true;
} else {
DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
if (findLoadStoreUses(I, UseList, VisitedList))
return true;
}
}
return false;
}
static bool isDependentOnNodeInstanceID(Value *V,
std::vector<Value *> *DependenceList) {
if (std::find(DependenceList->begin(), DependenceList->end(), V) !=
DependenceList->end()) {
DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
return false;
}
DependenceList->push_back(V);
// If not an instruction, then not dependent on node instance id
if (!isa<Instruction>(V) || isa<Constant>(V)) {
DEBUG(errs() << "\tStop\n");
return false;
}
Instruction *I = cast<Instruction>(V);
for (unsigned i = 0; i < I->getNumOperands(); i++) {
Value *operand = I->getOperand(i);
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) {
if ((II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_x ||
II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_y ||
II->getIntrinsicID() == Intrinsic::hpvm_getNodeInstanceID_z)) {
Value *Node = II->getArgOperand(0);
IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node);
assert(
GN &&
"NodeInstanceID operande should be node/parent node intrinsic\n");
if (GN->getIntrinsicID() == Intrinsic::hpvm_getNode) {
DEBUG(errs() << "\tDependency found on Node instance ID: " << *II
<< "\n");
return true;
}
}
}
if (CmpInst *CI = dyn_cast<CmpInst>(operand)) {
DEBUG(errs() << "Found compare instruction: " << *CI
<< "\nNot following its dependency list\n");
continue;
}
DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n");
if (isDependentOnNodeInstanceID(operand, DependenceList)) {
return true;
}
}
return false;
}
// Function to check if argument arg can be changed to a constant memory pointer
static bool canBePromoted(Argument *arg, Function *F) {
DEBUG(errs() << "OPT: Check if Argument " << *arg
<< " can be changed to constant memory\n");
std::vector<Value *> UseList;
std::vector<Value *> VisitedList;
// Recursively traverse the use chain:
// if a store instruction is found, promotion fails;
// if a load instruction is found as a use, add the loaded-from pointer to
// the use list
bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
if (foundStore == true)
return false;
// See that the GEP instructions are not dependent on getNodeInstanceID
// intrinsic
DEBUG(errs() << "\tNo Store Instruction found. Check dependence on node "
"instance ID\n");
std::vector<Value *> DependenceList;
for (auto U : UseList) {
if (isDependentOnNodeInstanceID(U, &DependenceList))
return false;
}
DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
return true;
}
// Calculate execute node parameters, which include the number of dimensions
// for dynamic instances of the kernel and the local and global work group
// sizes.
static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr,
Value *&GlobalWGPtr, Kernel *kernel,
ValueToValueMapTy &VMap, Instruction *IB) {
// Assign the number of dimensions a constant value
workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
// If local work group size is null
if (!kernel->hasLocalWG()) {
LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
} else {
for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
if (isa<Argument>(kernel->localWGSize[i]))
kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
}
LocalWGPtr =
genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
}
for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
if (isa<Argument>(kernel->globalWGSize[i]))
kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
}
// For OpenCL, the global work group size is the total number of instances in
// each dimension. So, multiply the local and global dim limits.
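// e.g. for gridDim == 2 with global dim limits {GX, GY} and local work
// group sizes {LX, LY}, the OpenCL global work size becomes {GX*LX, GY*LY}.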
std::vector<Value *> globalWGSizeInsts;
if (kernel->hasLocalWG()) {
for (unsigned i = 0; i < kernel->gridDim; i++) {
BinaryOperator *MulInst =
BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i],
kernel->localWGSize[i], "", IB);
globalWGSizeInsts.push_back(MulInst);
}
} else {
globalWGSizeInsts = kernel->globalWGSize;
}
GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
}
// CodeGen for allocating space for Work Group on stack and returning a pointer
// to its address
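// For a 2D work group this emits roughly the following IR (illustrative):
//   %WGName = alloca [2 x i64]
//   %WGName.0 = bitcast [2 x i64]* %WGName to i64*
//   store i64 %size0, i64* %WGName.0
//   %WGName.1 = getelementptr i64, i64* %WGName.0, i64 1
//   store i64 %size1, i64* %WGName.1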
static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize,
ValueToValueMapTy &VMap, Instruction *IB,
const Twine &WGName) {
Value *WGPtr;
// Get the i64 type for ease of use
Type *Int64Ty = Type::getInt64Ty(M.getContext());
// Work Group type is [#dim x i64]
Type *WGTy = ArrayType::get(Int64Ty, WGSize.size());
// Allocate space for the work group data on the stack and get a pointer to
// its first element.
AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB);
WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(),
WG->getName() + ".0", IB);
Value *nextDim = WGPtr;
DEBUG(errs() << *WGPtr << "\n");
// Iterate over the number of dimensions and store the global work group
// size in that dimension
for (unsigned i = 0; i < WGSize.size(); i++) {
DEBUG(errs() << *WGSize[i] << "\n");
assert(WGSize[i]->getType()->isIntegerTy() &&
"Dimension not an integer type!");
if (WGSize[i]->getType() != Int64Ty) {
// If the size in this dimension is given in any other integer format,
// generate code to extend it to i64 (CreateIntegerCast with isSigned ==
// true emits a sign extension). We need to use the mapped value in the
// newly generated function, hence the use of VMap
// FIXME: Why are we changing the kernel WGSize vector here?
DEBUG(errs() << "Not i64. Integer cast required.\n");
DEBUG(errs() << *WGSize[i] << "\n");
CastInst *CI =
BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
DEBUG(errs() << "Integer cast done.\n");
StoreInst *SI = new StoreInst(CI, nextDim, IB);
DEBUG(errs() << "\tExtended work group size stored: " << *SI << "\n");
} else {
// Store the value representing work group size in ith dimension on
// stack
StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB);
DEBUG(errs() << "\t Work group size: " << *SI << "\n");
}
if (i + 1 < WGSize.size()) {
// Move to next dimension
GetElementPtrInst *GEP = GetElementPtrInst::Create(
nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)),
WG->getName() + "." + Twine(i + 1), IB);
DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
nextDim = GEP;
}
}
return WGPtr;
}
// Get the generated OpenCL kernels file name, e.g. "foo.bc" ->
// "foo.bc.kernels.cl"
static std::string getPTXFilename(const Module &M) {
std::string moduleID = M.getModuleIdentifier();
moduleID.append(".kernels.cl");
return moduleID;
}
// Changes the data layout of the Module to be compiled with OpenCL backend
// TODO: Figure out when to call it, probably after duplicating the modules
static void changeDataLayout(Module &M) {
std::string opencl32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
std::string opencl64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
if (TARGET_PTX == 32)
M.setDataLayout(StringRef(opencl32_layoutStr));
else if (TARGET_PTX == 64)
M.setDataLayout(StringRef(opencl64_layoutStr));
else
assert(false && "Invalid PTX target");
return;
}
static void changeTargetTriple(Module &M) {
std::string opencl32_TargetTriple = "opencl--nvidiacl";
std::string opencl64_TargetTriple = "opencl64--nvidiacl";
if (TARGET_PTX == 32)
M.setTargetTriple(StringRef(opencl32_TargetTriple));
else if (TARGET_PTX == 64)
M.setTargetTriple(StringRef(opencl64_TargetTriple));
else
assert(false && "Invalid PTX target");
return;
}
// Helper function, populate a vector with all return statements in a function
static void findReturnInst(Function *F,
std::vector<ReturnInst *> &ReturnInstVec) {
for (auto &BB : *F) {
if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
ReturnInstVec.push_back(RI);
}
}
// Helper function, populate a vector with all IntrinsicID intrinsics in a
// function
static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID,
std::vector<IntrinsicInst *> &IntrinsicInstVec) {
for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
Instruction *I = &(*i);
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (II && II->getIntrinsicID() == IntrinsicID) {
IntrinsicInstVec.push_back(II);
}
}
}
} // End of namespace
char DFG2LLVM_OpenCL::ID = 0;
static RegisterPass<DFG2LLVM_OpenCL> X("dfg2llvm-opencl",
"Dataflow Graph to LLVM for OpenCL Pass",
false /* does not modify the CFG */,
true /* transformation, *
* not just analysis */);