//===-------------------------- DFG2LLVM_CPU.cpp --------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass is responsible for generating host code and kernel code for the
// CPU target from the HPVM dataflow graph.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "DFG2LLVM_CPU"
#include "SupportHPVM/DFG2LLVM.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Pass.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#ifndef LLVM_BUILD_DIR
#error LLVM_BUILD_DIR is not defined
#endif
#define STR_VALUE(X) #X
#define STRINGIFY(X) STR_VALUE(X)
#define LLVM_BUILD_DIR_STR STRINGIFY(LLVM_BUILD_DIR)
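// Two-level expansion: LLVM_BUILD_DIR_STR first expands LLVM_BUILD_DIR to its
// value (e.g. a build path), and STR_VALUE then turns that value into the
// string literal used below to locate hpvm-rt.bc.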
using namespace llvm;
using namespace builddfg;
using namespace dfg2llvm;
// HPVM Command line option to use timer or not
static cl::opt<bool> HPVMTimer_CPU("hpvm-timers-cpu",
cl::desc("Enable hpvm timers"));
namespace {
// DFG2LLVM_CPU - The first implementation.
struct DFG2LLVM_CPU : public DFG2LLVM {
static char ID; // Pass identification, replacement for typeid
DFG2LLVM_CPU() : DFG2LLVM(ID) {}
private:
// Member variables
// Functions
public:
bool runOnModule(Module &M);
};
// Visitor for Code generation traversal (tree traversal for now)
class CGT_CPU : public CodeGenTraversal {
private:
// Member variables
FunctionCallee malloc;
// HPVM Runtime API
FunctionCallee llvm_hpvm_cpu_launch;
FunctionCallee llvm_hpvm_cpu_wait;
FunctionCallee llvm_hpvm_cpu_argument_ptr;
FunctionCallee llvm_hpvm_streamLaunch;
FunctionCallee llvm_hpvm_streamPush;
FunctionCallee llvm_hpvm_streamPop;
FunctionCallee llvm_hpvm_streamWait;
FunctionCallee llvm_hpvm_createBindInBuffer;
FunctionCallee llvm_hpvm_createBindOutBuffer;
FunctionCallee llvm_hpvm_createEdgeBuffer;
FunctionCallee llvm_hpvm_createLastInputBuffer;
FunctionCallee llvm_hpvm_createThread;
FunctionCallee llvm_hpvm_bufferPush;
FunctionCallee llvm_hpvm_bufferPop;
FunctionCallee llvm_hpvm_cpu_dstack_push;
FunctionCallee llvm_hpvm_cpu_dstack_pop;
FunctionCallee llvm_hpvm_cpu_getDimLimit;
FunctionCallee llvm_hpvm_cpu_getDimInstance;
// Functions
std::vector<IntrinsicInst *> *getUseList(Value *LI);
Value *addLoop(Instruction *I, Value *limit, const Twine &indexName = "");
void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *);
Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
Argument *getArgumentFromEnd(Function *F, unsigned offset);
Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_CPU,
Instruction *InsertBefore);
void invokeChild_CPU(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap,
Instruction *InsertBefore);
void invokeChild_PTX(DFNode *C, Function *F_CPU, ValueToValueMapTy &VMap,
Instruction *InsertBefore);
StructType *getArgumentListStructTy(DFNode *);
Function *createFunctionFilter(DFNode *C);
void startNodeThread(DFNode *, std::vector<Value *>,
DenseMap<DFEdge *, Value *>, Value *, Value *,
Instruction *);
Function *createLaunchFunction(DFInternalNode *);
// Virtual Functions
void init() {
HPVMTimer = HPVMTimer_CPU;
TargetName = "CPU";
}
void initRuntimeAPI();
void codeGen(DFInternalNode *N);
void codeGen(DFLeafNode *N);
Function *codeGenStreamPush(DFInternalNode *N);
Function *codeGenStreamPop(DFInternalNode *N);
public:
// Constructor
CGT_CPU(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
init();
initRuntimeAPI();
}
void codeGenLaunch(DFInternalNode *Root);
void codeGenLaunchStreaming(DFInternalNode *Root);
};
bool DFG2LLVM_CPU::runOnModule(Module &M) {
DEBUG(errs() << "\nDFG2LLVM_CPU PASS\n");
// Get the BuildDFG Analysis Results:
// - Dataflow graph
// - Maps from i8* handles to DFNode and DFEdge
BuildDFG &DFG = getAnalysis<BuildDFG>();
// DFInternalNode *Root = DFG.getRoot();
std::vector<DFInternalNode *> Roots = DFG.getRoots();
// BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
// BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
// Visitor for Code Generation Graph Traversal
CGT_CPU *CGTVisitor = new CGT_CPU(M, DFG);
// Iterate over all the DFGs and produce code for each one of them
for (auto &rootNode : Roots) {
// Initiate code generation for root DFNode
CGTVisitor->visit(rootNode);
// Go ahead and replace the launch intrinsic with a pthread call.
// TODO: Later on, we might like to do this in a separate pass, which would
// give us the flexibility to switch between complete static code
// generation for the DFG and a customized runtime+scheduler
// Do streaming code generation if the root node is streaming; regular code
// generation otherwise
if (rootNode->isChildGraphStreaming())
CGTVisitor->codeGenLaunchStreaming(rootNode);
else
CGTVisitor->codeGenLaunch(rootNode);
}
delete CGTVisitor;
return true;
}
// Initialize the HPVM runtime API. This makes it easier to insert these calls
void CGT_CPU::initRuntimeAPI() {
// Load Runtime API Module
SMDiagnostic Err;
std::string runtimeAPI = std::string(LLVM_BUILD_DIR_STR) +
"/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc";
runtimeModule = parseIRFile(runtimeAPI, Err, M.getContext());
if (runtimeModule == nullptr) {
DEBUG(errs() << Err.getMessage() << " " << runtimeAPI << "\n");
assert(false && "couldn't parse runtime");
} else
DEBUG(errs() << "Successfully loaded hpvm-rt API module\n");
// Get or insert the global declarations for launch/wait functions
DECLARE(llvm_hpvm_cpu_launch);
DECLARE(malloc);
DECLARE(llvm_hpvm_cpu_wait);
DECLARE(llvm_hpvm_cpu_argument_ptr);
DECLARE(llvm_hpvm_streamLaunch);
DECLARE(llvm_hpvm_streamPush);
DECLARE(llvm_hpvm_streamPop);
DECLARE(llvm_hpvm_streamWait);
DECLARE(llvm_hpvm_createBindInBuffer);
DECLARE(llvm_hpvm_createBindOutBuffer);
DECLARE(llvm_hpvm_createEdgeBuffer);
DECLARE(llvm_hpvm_createLastInputBuffer);
DECLARE(llvm_hpvm_createThread);
DECLARE(llvm_hpvm_bufferPush);
DECLARE(llvm_hpvm_bufferPop);
DECLARE(llvm_hpvm_cpu_dstack_push);
DECLARE(llvm_hpvm_cpu_dstack_pop);
DECLARE(llvm_hpvm_cpu_getDimLimit);
DECLARE(llvm_hpvm_cpu_getDimInstance);
// Get or insert timerAPI functions as well if you plan to use timers
initTimerAPI();
// Insert init context in main
Function *VI = M.getFunction("llvm.hpvm.init");
assert(VI->getNumUses() == 1 && "__hpvm__init should only be used once");
DEBUG(errs() << "Inserting cpu timer initialization\n");
Instruction *I = cast<Instruction>(*VI->user_begin());
initializeTimerSet(I);
switchToTimer(hpvm_TimerID_NONE, I);
// Insert timer print at hpvm exit, i.e. before the llvm.hpvm.cleanup call
Function *VC = M.getFunction("llvm.hpvm.cleanup");
assert(VC->getNumUses() == 1 && "__hpvm__cleanup should only be used once");
DEBUG(errs() << "Inserting cpu timer print\n");
Instruction *CleanupCall = cast<Instruction>(*VC->user_begin());
printTimerSet(CleanupCall);
}
/* Returns a vector of all intrinsic instructions (wait, push, pop) that use
 * the given graph ID
 */
std::vector<IntrinsicInst *> *CGT_CPU::getUseList(Value *GraphID) {
std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>();
// It must have been loaded from memory somewhere
for (Value::user_iterator ui = GraphID->user_begin(),
ue = GraphID->user_end();
ui != ue; ++ui) {
if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) {
UseList->push_back(waitI);
} else {
llvm_unreachable("Error: Operation on Graph ID not supported!\n");
}
}
return UseList;
}
/* Traverse the function argument list in reverse order to get the argument at
 * a distance 'offset' from the end of the argument list of function F
*/
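// For example, getArgumentFromEnd(F, 1) returns the last argument of F and
// getArgumentFromEnd(F, 2) returns the one before it.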
Argument *CGT_CPU::getArgumentFromEnd(Function *F, unsigned offset) {
assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) &&
"Invalid offset to access arguments!");
Function::arg_iterator e = F->arg_end();
// arg_end() points past the last argument; step back to the last one.
e--;
Argument *arg;
for (; offset != 0; e--) {
offset--;
arg = &*e;
}
return arg;
}
/* Add a while loop around the region between CondBlockStart and BodyEnd
 * Algorithm:
 * (1) Split the enclosing basic block into three parts: a condition block
 * (starting at CondBlockStart), a body block (starting at BodyStart) and an
 * exit block (starting at BodyEnd).
 * (2) Replace the terminator of the condition block with a conditional branch
 * that exits to while.end when TerminationCond holds and falls through to
 * while.body otherwise.
 * (3) Replace the terminator of while.body with an unconditional branch back
 * to the condition block.
*/
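// Resulting control flow (sketch):
//   entry --> condition
//   condition:  br TerminationCond, while.end, while.body
//   while.body --> condition
//   while.end:  (code starting at BodyEnd continues here)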
void CGT_CPU::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart,
Instruction *BodyEnd, Value *TerminationCond) {
BasicBlock *Entry = CondBlockStart->getParent();
BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
// Replace the terminator instruction of the condition block with a new
// conditional branch which exits to while.end if the termination condition
// holds and branches to while.body otherwise
BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
ReplaceInstWithInst(CondBlock->getTerminator(), BI);
// While Body should jump to condition block
BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock);
ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
}
Instruction *CGT_CPU::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
BasicBlock *Body) {
Module *M = Entry->getParent()->getParent();
Type *Int64Ty = Type::getInt64Ty(M->getContext());
// Insert a PHI instruction at the beginning of the condition block
Instruction *IB = Cond->getFirstNonPHI();
PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
ConstantInt *IConst =
ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
Instruction *CounterIncr =
BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
"cnt_incr", Body->getTerminator());
// Set incoming values for Phi node
IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
CounterPhi->addIncoming(IConst, Entry);
CounterPhi->addIncoming(CounterIncr, Body);
// Return the created PHI node, which serves as the loop counter
return CounterPhi;
}
/* Add a loop around the instruction I
 * Algorithm:
 * (1) Split the basic block of instruction I into three parts, where the
 * middle block/body contains instruction I.
 * (2) Add a phi node before instruction I. Add an incoming edge to the phi
 * node from the predecessor
 * (3) Add increment and compare instructions for the index variable
 * (4) Replace the terminator/branch instruction of the body with a conditional
 * branch which loops over the body if true and goes to the end if false
 * (5) Update the phi node of the body
*/
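// Generated structure (sketch), for a call to addLoop(I, limit, "x"):
//   for.body:
//     %index.x = phi i64 [ 0, <entry block> ], [ %index.x.inc, %for.body ]
//     ... I ...
//     %index.x.inc = add i64 %index.x, 1
//     %cond.x = icmp ult i64 %index.x.inc, %limit
//     br i1 %cond.x, label %for.body, label %for.end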
Value *CGT_CPU::addLoop(Instruction *I, Value *limit, const Twine &indexName) {
BasicBlock *Entry = I->getParent();
BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body");
BasicBlock::iterator i(I);
++i;
Instruction *NextI = &*i;
// Next Instruction should also belong to the same basic block as the basic
// block will have a terminator instruction
assert(NextI->getParent() == ForBody &&
"Next Instruction should also belong to the same basic block!");
BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
// Add Phi Node for index variable
PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2,
"index." + indexName, I);
// Add incoming edge to phi
IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
Entry);
// Increment index variable
BinaryOperator *IndexInc = BinaryOperator::Create(
Instruction::Add, IndexPhi,
ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
"index." + indexName + ".inc", ForBody->getTerminator());
// Compare index variable with limit
CmpInst *Cond =
CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit,
"cond." + indexName, ForBody->getTerminator());
// Replace the terminator instruction of for.body with new conditional
// branch which loops over body if true and branches to for.end otherwise
BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond);
ReplaceInstWithInst(ForBody->getTerminator(), BI);
// Add incoming edge to phi node in body
IndexPhi->addIncoming(IndexInc, ForBody);
return IndexPhi;
}
// Returns a packed struct type. The struct type is created by packing the
// input types, output types and the isLastInput buffer type. All the streaming
// inputs/outputs are converted to i8*, since this is the type of buffer
// handles.
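// For example, a child node with scalar inputs (int a, float b), one
// streaming input and one streaming output would roughly get:
//   struct.thread.<node> { i32, float, i8* /*stream in*/, i8* /*out*/,
//                          i8* /*isLastInput*/ }  (packed)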
StructType *CGT_CPU::getArgumentListStructTy(DFNode *C) {
std::vector<Type *> TyList;
// Input types
Function *CF = C->getFuncPointer();
for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
ai != ae; ++ai) {
if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
else
TyList.push_back(ai->getType());
}
// Output Types
StructType *OutStructTy = cast<StructType>(CF->getReturnType());
for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
// All outputs of a node are streaming edges
assert(C->getOutDFEdgeAt(i)->isStreamingEdge() &&
"All output edges of child node have to be streaming");
TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
}
// isLastInput buffer element
TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
StructType *STy =
StructType::create(CF->getContext(), TyList,
Twine("struct.thread." + CF->getName()).str(), true);
return STy;
}
void CGT_CPU::startNodeThread(DFNode *C, std::vector<Value *> Args,
DenseMap<DFEdge *, Value *> EdgeBufferMap,
Value *isLastInputBuffer, Value *graphID,
Instruction *IB) {
DEBUG(errs() << "Starting Pipeline for child node: "
<< C->getFuncPointer()->getName() << "\n");
// Create a filter/pipeline function for the child node
Function *C_Pipeline = createFunctionFilter(C);
Function *CF = C->getFuncPointer();
// Get module context and i32 0 constant, as they would be frequently used in
// this function.
LLVMContext &Ctx = IB->getParent()->getContext();
Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
// Marshall arguments
// Create a packed struct type with inputs of C followed by outputs and then
// another i8* to indicate isLastInput buffer. Streaming inputs are replaced
// by i8*
//
StructType *STy = getArgumentListStructTy(C);
// Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
CallInst *CI =
CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)),
C->getFuncPointer()->getName() + ".inputs", IB);
CastInst *Struct = BitCastInst::CreatePointerCast(
CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB);
// AllocaInst* AI = new AllocaInst(STy,
// C->getFuncPointer()->getName()+".inputs", IB);
// Insert elements in the struct
DEBUG(errs() << "Marshall inputs for child node: "
<< C->getFuncPointer()->getName() << "\n");
// Marshall Inputs
for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
// Create constant int (i)
Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
// Get Element pointer instruction
Value *GEPIndices[] = {IntZero, Int_i};
GetElementPtrInst *GEP = GetElementPtrInst::Create(
nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
Struct->getName() + ".arg_" + Twine(i), IB);
DFEdge *E = C->getInDFEdgeAt(i);
if (E->getSourceDF()->isEntryNode()) {
// This is a Bind Input Edge
if (E->isStreamingEdge()) {
// Streaming Bind Input edge. Get buffer corresponding to it
assert(EdgeBufferMap.count(E) &&
"No mapping buffer for a Streaming Bind DFEdge!");
new StoreInst(EdgeBufferMap[E], GEP, IB);
} else {
// Non-streaming Bind edge
new StoreInst(Args[i], GEP, IB);
}
} else {
// This is an edge between siblings.
// This must be a streaming edge, since we assume that all edges
// between two sibling nodes in a DFG are streaming.
assert(EdgeBufferMap.count(E) &&
"No mapping buffer for a Streaming DFEdge!");
new StoreInst(EdgeBufferMap[E], GEP, IB);
}
}
unsigned numInputs = CF->getFunctionType()->getNumParams();
unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
// Marshall Outputs
DEBUG(errs() << "Marshall outputs for child node: "
<< C->getFuncPointer()->getName() << "\n");
for (unsigned i = 0; i < numOutputs; i++) {
// Create constant int (i+numInputs)
Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs);
// Get Element pointer instruction
Value *GEPIndices[] = {IntZero, Int_i};
GetElementPtrInst *GEP = GetElementPtrInst::Create(
nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
Struct->getName() + ".out_" + Twine(i), IB);
DFEdge *E = C->getOutDFEdgeAt(i);
assert(E->isStreamingEdge() &&
"All output edges of a node must be streaming");
assert(EdgeBufferMap.count(E) &&
"No mapping buffer for a Out Streaming DFEdge!");
new StoreInst(EdgeBufferMap[E], GEP, IB);
}
// Marshall last argument. isLastInput buffer
DEBUG(errs() << "Marshall isLastInput for child node: "
<< C->getFuncPointer()->getName() << "\n");
// Create constant int (numInputs + numOutputs)
Constant *Int_index =
ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs);
// Get Element pointer instruction
Value *GEPIndices[] = {IntZero, Int_index};
GetElementPtrInst *GEP = GetElementPtrInst::Create(
nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
Struct->getName() + ".isLastInput", IB);
new StoreInst(isLastInputBuffer, GEP, IB);
// The malloc'ed struct now holds all the arguments packed together
// Call runtime to create the thread with these arguments
DEBUG(errs() << "Start Thread for child node: "
<< C->getFuncPointer()->getName() << "\n");
// DEBUG(errs() << *llvm_hpvm_createThread << "\n");
DEBUG(errs() << *graphID->getType() << "\n");
DEBUG(errs() << *C_Pipeline->getType() << "\n");
DEBUG(errs() << *Struct->getType() << "\n");
// Bitcast the struct pointer to i8*
CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx),
Struct->getName(), IB);
Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI};
CallInst::Create(llvm_hpvm_createThread,
ArrayRef<Value *>(CreateThreadArgs, 3), "", IB);
}
Function *CGT_CPU::createLaunchFunction(DFInternalNode *N) {
DEBUG(errs() << "Generating Streaming Launch Function\n");
// Get Function associated with Node N
Function *NF = N->getFuncPointer();
// Map from Streaming edge to buffer
DenseMap<DFEdge *, Value *> EdgeBufferMap;
/* Now we have all the global declarations necessary to generate the Launch
 * function, a pointer to which can be passed to the pthread utilities to
 * execute the DFG. The Launch function takes two inputs: i8* data.addr and
 * an i8* graph ID. data.addr is the address of all the input data that needs
 * to be passed to this function; in our case it contains the input arguments
 * of the Root function in the correct order.
* (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
* (2) Extract each of inputs from data.addr
* (3) create Buffers for all the streaming edges
* - Put buffers in the context
* (4) Go over each child node
* - marshall its arguments together (use buffers in place of streaming
* arguments)
* - Start the threads
* (5) The return value from Root is stored in memory, pointer to which is
* passed to pthread_exit call.
*/
// (1) Create Launch Function of type void (i8* args, i8* GraphID)
Type *i8Ty = Type::getInt8Ty(M.getContext());
Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
FunctionType *LaunchFuncTy = FunctionType::get(
Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false);
Function *LaunchFunc = Function::Create(
LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M);
DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
// Give a name to the argument which is used to pass data to this thread
Argument *data = &*LaunchFunc->arg_begin();
// NOTE-HS: Check correctness with Maria
Argument *graphID = &*(LaunchFunc->arg_begin() + 1);
data->setName("data.addr");
graphID->setName("graphID");
// Add a basic block to this empty function and a return statement to it
DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
BasicBlock *BB =
BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB);
DEBUG(errs() << "Created Empty Launch Function\n");
// (2) Extract each of inputs from data.addr
std::vector<Type *> TyList;
std::vector<std::string> names;
std::vector<Value *> Args;
for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
ai != ae; ++ai) {
if (N->getChildGraph()
->getEntry()
->getOutDFEdgeAt(ai->getArgNo())
->isStreamingEdge()) {
TyList.push_back(i8Ty->getPointerTo());
names.push_back(Twine(ai->getName() + "_buffer").str());
continue;
}
TyList.push_back(ai->getType());
names.push_back(ai->getName());
}
Args = extractElements(data, TyList, names, RI);
DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc
<< "\n");
// (3) Create buffers for all the streaming edges
for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
de = N->getChildGraph()->dfedge_end();
di != de; ++di) {
DFEdge *Edge = *di;
DEBUG(errs() << *Edge->getType() << "\n");
Value *size = ConstantExpr::getSizeOf(Edge->getType());
Value *CallArgs[] = {graphID, size};
if (Edge->isStreamingEdge()) {
CallInst *CI;
// Create a buffer call
if (Edge->getSourceDF()->isEntryNode()) {
// Bind Input Edge
Constant *Int_ArgNo = ConstantInt::get(
Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition());
Value *BindInCallArgs[] = {graphID, size, Int_ArgNo};
CI = CallInst::Create(
llvm_hpvm_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3),
"BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI);
} else if (Edge->getDestDF()->isExitNode()) {
// Bind Output Edge
CI = CallInst::Create(
llvm_hpvm_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2),
"BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI);
} else {
// Streaming Edge
CI = CallInst::Create(
llvm_hpvm_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2),
Edge->getSourceDF()->getFuncPointer()->getName() + "." +
Edge->getDestDF()->getFuncPointer()->getName(),
RI);
}
EdgeBufferMap[Edge] = CI;
}
}
// Create buffer for isLastInput for all the child nodes
DFGraph *G = N->getChildGraph();
DenseMap<DFNode *, Value *> NodeLastInputMap;
for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce;
++ci) {
DFNode *child = *ci;
if (child->isDummyNode())
continue;
Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
Value *CallArgs[] = {graphID, size};
CallInst *CI = CallInst::Create(
llvm_hpvm_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2),
"BindIn.isLastInput." + child->getFuncPointer()->getName(), RI);
NodeLastInputMap[child] = CI;
}
DEBUG(errs() << "Start Each child node filter\n");
// (4) Marshall arguments for each child node and start the thread with its
// pipeline function
for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
ce = N->getChildGraph()->end();
ci != ce; ++ci) {
DFNode *C = *ci;
// Skip dummy node call
if (C->isDummyNode())
continue;
// Marshall all the arguments for this node into an i8*
// Pass to the runtime to create the thread
// Start the thread for child node C
startNodeThread(C, Args, EdgeBufferMap, NodeLastInputMap[C], graphID, RI);
}
DEBUG(errs() << "Launch function:\n");
DEBUG(errs() << *LaunchFunc << "\n");
return LaunchFunc;
}
/* This function does the steps necessary to launch a streaming graph
 * Steps:
 * Create a Pipeline/Filter function for each node in the child graph of Root
 * Create Functions DFGLaunch, DFGPush, DFGPop, DFGWait
 * Modify each of the intrinsics in host code:
 * Launch, Push, Pop, Wait
*/
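// Host-side rewriting performed below (sketch):
//   llvm.hpvm.launch -> llvm_hpvm_streamLaunch(LaunchFunc, args)
//   llvm.hpvm.push   -> llvm_hpvm_streamPush(graph, input)
//   llvm.hpvm.pop    -> llvm_hpvm_streamPop(graph)
//   llvm.hpvm.wait   -> llvm_hpvm_streamWait(graph)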
void CGT_CPU::codeGenLaunchStreaming(DFInternalNode *Root) {
IntrinsicInst *LI = Root->getInstruction();
Function *RootLaunch = createLaunchFunction(Root);
// Substitute the launch intrinsic in host code
DEBUG(errs() << "Substitute launch intrinsic\n");
Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)};
CallInst *LaunchInst = CallInst::Create(
llvm_hpvm_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2),
"graph" + Root->getFuncPointer()->getName(), LI);
DEBUG(errs() << *LaunchInst << "\n");
// Replace all wait instructions with cpu specific wait instructions
DEBUG(errs() << "Substitute wait, push, pop intrinsics\n");
std::vector<IntrinsicInst *> *UseList = getUseList(LI);
for (unsigned i = 0; i < UseList->size(); ++i) {
IntrinsicInst *II = UseList->at(i);
CallInst *CI;
Value *PushArgs[] = {LaunchInst, II->getOperand(1)};
switch (II->getIntrinsicID()) {
case Intrinsic::hpvm_wait:
CI = CallInst::Create(llvm_hpvm_streamWait, ArrayRef<Value *>(LaunchInst),
"");
break;
case Intrinsic::hpvm_push:
CI = CallInst::Create(llvm_hpvm_streamPush,
ArrayRef<Value *>(PushArgs, 2), "");
break;
case Intrinsic::hpvm_pop:
CI = CallInst::Create(llvm_hpvm_streamPop, ArrayRef<Value *>(LaunchInst),
"");
break;
default:
llvm_unreachable(
"GraphID is used by an instruction other than wait, push, pop");
};
DEBUG(errs() << "Replace:\n\t" << *II << "\n");
ReplaceInstWithInst(II, CI);
DEBUG(errs() << "\twith " << *CI << "\n");
}
}
void CGT_CPU::codeGenLaunch(DFInternalNode *Root) {
// TODO: Place an assert to check that the constant passed by the launch
// intrinsic as the number of arguments to the DFG is the same as the number
// of arguments of the root of the DFG
DEBUG(errs() << "Generating Launch Function\n");
// Get Launch Instruction
IntrinsicInst *LI = Root->getInstruction();
switchToTimer(hpvm_TimerID_PTHREAD_CREATE, LI);
DEBUG(errs() << "Generating Launch Function\n");
/* Now we have all the global declarations necessary to generate the Launch
 * function, a pointer to which can be passed to the pthread utilities to
 * execute the DFG. The Launch function has just one input: i8* data.addr
 * This is the address of all the input data that needs to be passed to
 * this function. In our case it contains the input arguments of the Root
 * function in the correct order.
* (1) Create an empty Launch function of type i8*(i8*)
* (2) Extract each of inputs from data.addr and pass them as arguments to the
* call to Root function
* (3) The return value from Root is stored in memory, pointer to which is
* passed to pthread_exit call.
*/
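// The generated launch function has roughly this shape:
//   i8* LaunchDataflowGraph(i8* data.addr) {
//     <unpack Root's arguments from data.addr>
//     %out = call <Root_CPU>(<unpacked args>)
//     <store %out back through data.addr; if Root returns a non-empty
//      struct, into the last (result) field of the packed arg struct>
//     ret i8* null
//   }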
// Create Launch Function of type i8*(i8*) which calls the root function
Type *i8Ty = Type::getInt8Ty(M.getContext());
FunctionType *AppFuncTy = FunctionType::get(
i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
Function *AppFunc =
Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(),
"LaunchDataflowGraph", &M);
DEBUG(errs() << "Generating Launch Function\n");
// Give a name to the argument which is used to pass data to this thread
Value *data = &*AppFunc->arg_begin();
data->setName("data.addr");
// Add a basic block to this empty function and a return null statement to it
BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
ReturnInst *RI =
ReturnInst::Create(AppFunc->getContext(),
Constant::getNullValue(AppFunc->getReturnType()), BB);
switchToTimer(hpvm_TimerID_ARG_UNPACK, RI);
DEBUG(errs() << "Created Empty Launch Function\n");
// Find the CPU function generated for Root
// Function* RootF_CPU = Root->getGenFunc();
Function *RootF_CPU = Root->getGenFuncForTarget(hpvm::CPU_TARGET);
assert(RootF_CPU && "Error: No generated CPU function for Root node\n");
assert(Root->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) &&
"Error: Generated Function for Root node with no cpu wrapper\n");
// Generate a call to RootF_CPU with null parameters for now
std::vector<Value *> Args;
for (unsigned i = 0; i < RootF_CPU->getFunctionType()->getNumParams(); i++) {
Args.push_back(
Constant::getNullValue(RootF_CPU->getFunctionType()->getParamType(i)));
}
CallInst *CI =
CallInst::Create(RootF_CPU, Args, RootF_CPU->getName() + ".output", RI);
// Extract input data from i8* data.addr and patch it into the correct
// arguments of the call to RootF_CPU
std::vector<Type *> TyList;
std::vector<std::string> names;
for (Function::arg_iterator ai = RootF_CPU->arg_begin(),
ae = RootF_CPU->arg_end();
ai != ae; ++ai) {
TyList.push_back(ai->getType());
names.push_back(ai->getName());
}
std::vector<Value *> elements = extractElements(data, TyList, names, CI);
// Patch the elements to the call arguments
for (unsigned i = 0; i < CI->getNumArgOperands(); i++)
CI->setArgOperand(i, elements[i]);
// Add timers around Call to RootF_CPU function
switchToTimer(hpvm_TimerID_COMPUTATION, CI);
switchToTimer(hpvm_TimerID_OUTPUT_PACK, RI);
StructType *RootRetTy =
cast<StructType>(RootF_CPU->getFunctionType()->getReturnType());
// if Root has a non-empty return type
if (RootRetTy->getNumElements()) {
// We can't access the type of the arg struct - build it
std::vector<Type *> TyList;
for (Function::arg_iterator ai = RootF_CPU->arg_begin(),
ae = RootF_CPU->arg_end();
ai != ae; ++ai) {
TyList.push_back(ai->getType());
}
TyList.push_back(CI->getType());
StructType *ArgStructTy = StructType::create(
M.getContext(), ArrayRef<Type *>(TyList),
(RootF_CPU->getName() + ".arg.struct.ty").str(), true);
// Cast the data pointer to the type of the arg struct
CastInst *OutputAddrCast = CastInst::CreatePointerCast(
data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI);
// Result struct is the last element of the packed struct passed to launch
unsigned outStructIdx = ArgStructTy->getNumElements() - 1;
ConstantInt *IntZero =
ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
ConstantInt *IntIdx =
ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx);
Value *GEPIIdxList[] = {IntZero, IntIdx};
// Get data pointer to the last element of struct - result field
GetElementPtrInst *OutGEPI = GetElementPtrInst::Create(
ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2),
CI->getName() + ".addr", RI);
// Store result there
new StoreInst(CI, OutGEPI, RI);
} else {
// There is no return value - no need to actually generate code, but to keep
// the changes minimal we maintain what the code was already doing:
// we cast the data pointer to the result type of Root and store the
// result there. This works at the LLVM level, but not
// at the C level, thus the rewrite.
CastInst *OutputAddrCast = CastInst::CreatePointerCast(
data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI);
new StoreInst(CI, OutputAddrCast, RI);
}
switchToTimer(hpvm_TimerID_NONE, RI);
DEBUG(errs() << "Application specific function:\n");
DEBUG(errs() << *AppFunc << "\n");
// Substitute the launch intrinsic in host code
Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)};
CallInst *LaunchInst = CallInst::Create(
llvm_hpvm_cpu_launch, ArrayRef<Value *>(LaunchInstArgs, 2),
"graph" + Root->getFuncPointer()->getName(), LI);
// ReplaceInstWithInst(LI, LaunchInst);
DEBUG(errs() << *LaunchInst << "\n");
// Replace all wait instructions with cpu specific wait instructions
std::vector<IntrinsicInst *> *UseList = getUseList(LI);
for (unsigned i = 0; i < UseList->size(); ++i) {
IntrinsicInst *II = UseList->at(i);
CallInst *CI;
switch (II->getIntrinsicID()) {
case Intrinsic::hpvm_wait:
CI = CallInst::Create(llvm_hpvm_cpu_wait, ArrayRef<Value *>(LaunchInst),
"");
break;
case Intrinsic::hpvm_push:
CI = CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(LaunchInst),
"");
break;
case Intrinsic::hpvm_pop:
CI = CallInst::Create(llvm_hpvm_bufferPop, ArrayRef<Value *>(LaunchInst),
"");
break;
default:
llvm_unreachable(
"GraphID is used by an instruction other than wait, push, pop");
};
ReplaceInstWithInst(II, CI);
DEBUG(errs() << *CI << "\n");
}
}
Value *CGT_CPU::getInValueAt(DFNode *Child, unsigned i, Function *ParentF_CPU,
Instruction *InsertBefore) {
// TODO: Assumption is that each input port of a node has just one
// incoming edge. May change later on.
// Find the incoming edge at the requested input port
DFEdge *E = Child->getInDFEdgeAt(i);
assert(E && "No incoming edge or binding for input element!");
// Find the Source DFNode associated with the incoming edge
DFNode *SrcDF = E->getSourceDF();
// If Source DFNode is a dummyNode, edge is from parent. Get the
// argument from argument list of this internal node
Value *inputVal;
if (SrcDF->isEntryNode()) {
inputVal = getArgumentAt(ParentF_CPU, E->getSourcePosition());
DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
} else {
// edge is from a sibling
// Check - code should already be generated for this source dfnode
assert(OutputMap.count(SrcDF) &&
"Source node call not found. Dependency violation!");
// Find CallInst associated with the Source DFNode using OutputMap
Value *CI = OutputMap[SrcDF];
// Extract element at source position from this call instruction
std::vector<unsigned> IndexList;
IndexList.push_back(E->getSourcePosition());
DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
ExtractValueInst *EI =
ExtractValueInst::Create(CI, IndexList, "", InsertBefore);
inputVal = EI;
}
return inputVal;
}
void CGT_CPU::invokeChild_CPU(DFNode *C, Function *F_CPU,
ValueToValueMapTy &VMap, Instruction *IB) {
Function *CF = C->getFuncPointer();
// Function* CF_CPU = C->getGenFunc();
Function *CF_CPU = C->getGenFuncForTarget(hpvm::CPU_TARGET);
assert(CF_CPU != NULL &&
"Found leaf node for which code generation has not happened yet!\n");
assert(C->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) &&
"The generated function to be called from cpu backend is not an cpu "
"function\n");
DEBUG(errs() << "Invoking child node" << CF_CPU->getName() << "\n");
std::vector<Value *> Args;
// Create argument list to pass to call instruction
// First find the correct values using the edges
// The remaining six values are inserted as constants for now.
for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
Args.push_back(getInValueAt(C, i, F_CPU, IB));
}
Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_CPU->getContext()), 0);
for (unsigned j = 0; j < 6; j++)
Args.push_back(I64Zero);
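// The six extra operands are placeholders for the per-dimension index and
// limit values (idx_x, idx_y, idx_z, limit_x, limit_y, limit_z); they are
// patched below for each dimension the node is replicated in.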
DEBUG(errs() << "Gen Function type: " << *CF_CPU->getType() << "\n");
DEBUG(errs() << "Node Function type: " << *CF->getType() << "\n");
DEBUG(errs() << "Arguments: " << Args.size() << "\n");
// Call the F_CPU function associated with this node
CallInst *CI =
CallInst::Create(CF_CPU, Args, CF_CPU->getName() + "_output", IB);
DEBUG(errs() << *CI << "\n");
OutputMap[C] = CI;
// Find num of dimensions this node is replicated in.
// Based on number of dimensions, insert loop instructions
std::string varNames[3] = {"x", "y", "z"};
unsigned numArgs = CI->getNumArgOperands();
for (unsigned j = 0; j < C->getNumOfDim(); j++) {
Value *indexLimit = NULL;
// Limit can either be a constant or an argument of the internal node.
// In case of constant we can use that constant value directly in the
// new F_CPU function. In case of an argument, we need to get the mapped
// value using VMap
if (isa<Constant>(C->getDimLimits()[j])) {
indexLimit = C->getDimLimits()[j];
DEBUG(errs() << "In Constant case:\n"
<< " indexLimit type = " << *indexLimit->getType() << "\n");
} else {
indexLimit = VMap[C->getDimLimits()[j]];
DEBUG(errs() << "In VMap case:"
<< " indexLimit type = " << *indexLimit->getType() << "\n");
}
assert(indexLimit && "Invalid dimension limit!");
// Insert loop
Value *indexVar = addLoop(CI, indexLimit, varNames[j]);
DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
// Insert index variable and limit arguments
CI->setArgOperand(numArgs - 6 + j, indexVar);
CI->setArgOperand(numArgs - 3 + j, indexLimit);
}
// Insert call to runtime to push the dim limits and instanceID on the depth
// stack
Value *args[] = {
ConstantInt::get(Type::getInt32Ty(CI->getContext()),
C->getNumOfDim()), // numDim
CI->getArgOperand(numArgs - 3 + 0), // limitX
CI->getArgOperand(numArgs - 6 + 0), // iX
CI->getArgOperand(numArgs - 3 + 1), // limitY
CI->getArgOperand(numArgs - 6 + 1), // iY
CI->getArgOperand(numArgs - 3 + 2), // limitZ
CI->getArgOperand(numArgs - 6 + 2) // iZ
};
CallInst *Push = CallInst::Create(llvm_hpvm_cpu_dstack_push,
ArrayRef<Value *>(args, 7), "", CI);
DEBUG(errs() << "Push on stack: " << *Push << "\n");
// Insert call to runtime to pop the dim limits and instanceID from the depth
// stack
BasicBlock::iterator i(CI);
++i;
Instruction *NextI = &*i;
// Next Instruction should also belong to the same basic block as the basic
// block will have a terminator instruction
assert(NextI->getParent() == CI->getParent() &&
"Next Instruction should also belong to the same basic block!");
CallInst *Pop = CallInst::Create(llvm_hpvm_cpu_dstack_pop, None, "", NextI);
DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
DEBUG(errs() << *CI->getParent()->getParent());
}
/* This function takes a DFNode and creates a filter function for it. By filter
 * function we mean a function which keeps reading input from the input buffers,
 * applies the node function to the inputs, and then pushes data onto the output buffers
*/
// Create a function with void* (void*) type.
// Create a new basic block
// Add a return instruction to the basic block
// Extract arguments from the aggregate data input; in the type list,
// replace the streaming inputs with i8* types signifying handles to the
// corresponding buffers
// Add a boolean argument isLastInput
// Add runtime API calls to get input for each of the streaming inputs
// Add a call to the generated function of the child node
// Add runtime API calls to push output for each of the streaming outputs
// Add a loop around the basic block, which exits the loop when isLastInput is set
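// The generated filter has roughly this structure:
//   entry:      unpack buffer handles / scalar arguments from data.addr
//   condition:  isLastInput = pop(isLastInput_buffer);
//               br (isLastInput != 0), while.end, while.body
//   while.body: pop each streaming input, call <node>_CPU,
//               push each output; br condition
//   while.end:  ret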
Function *CGT_CPU::createFunctionFilter(DFNode *C) {
DEBUG(errs() << "*********Creating Function filter for "
<< C->getFuncPointer()->getName() << "*****\n");
/* Create a function with same argument list as child.*/
DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
// Get the generated function for child node
Function *CF = C->getFuncPointer();
// Create Filter Function of type i8*(i8*) which wraps the child node's generated function
Type *i8Ty = Type::getInt8Ty(M.getContext());
FunctionType *CF_PipelineTy = FunctionType::get(
i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(),
CF->getName() + "_Pipeline", &M);
DEBUG(errs() << "Generating Pipeline Function\n");
// Give a name to the argument which is used to pass data to this thread
Value *data = &*CF_Pipeline->arg_begin();
data->setName("data.addr");
// Create a new basic block
DEBUG(errs() << "\tCreate new BB and add a return function\n");
// Add a basic block to this empty function
BasicBlock *BB =
BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
// Add a return instruction to the basic block
ReturnInst *RI =
ReturnInst::Create(CF_Pipeline->getContext(),
UndefValue::get(CF_Pipeline->getReturnType()), BB);
/* Extract the elements from the aggregate argument to the function.
* Replace the streaming inputs with i8* types signifying handle to
* corresponding buffers
* Add outputs to the list as well
* Add isLastInput to the list
*/
DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
// These Args will be used when passing arguments to the generated function
// inside the loop, and when reading outputs as well.
std::vector<Value *> Args;
std::vector<Type *> TyList;
std::vector<std::string> names;
// Adding inputs
for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
++i) {
if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
TyList.push_back(i8Ty->getPointerTo());
names.push_back((Twine(i->getName()) + "_buffer").str());
} else {
TyList.push_back(i->getType());
names.push_back(i->getName());
}
}
// Adding outputs. FIXME: We assume all outputs are streaming edges,
// because that is how we get their buffer handles
StructType *RetTy = cast<StructType>(CF->getReturnType());
for (unsigned i = 0; i < RetTy->getNumElements(); i++) {
TyList.push_back(i8Ty->getPointerTo());
names.push_back("out");
}
/* Add a boolean argument isLastInput */
DEBUG(errs() << "\tAdd a boolean argument called isLastInput to function\n");
TyList.push_back(i8Ty->getPointerTo());
names.push_back("isLastInput_buffer");
// Extract the inputs, outputs
Args = extractElements(data, TyList, names, RI);
for (unsigned i = 0; i < Args.size(); i++) {
DEBUG(errs() << *Args[i] << "\n");
}
// Split the Args vector into input, output and isLastInput
unsigned numInputs = CF->getFunctionType()->getNumParams();
unsigned numOutputs = RetTy->getNumElements();
std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs);
std::vector<Value *> OutputArgs(Args.begin() + numInputs,
Args.begin() + numInputs + numOutputs);
Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]);
/* Add runtime API calls to get input for each of the streaming input edges */
DEBUG(errs() << "\tAdd runtime API calls to get input for each of the "
"streaming input edges\n");
// First read the termination condition variable isLastInput
CallInst *isLastInputPop = CallInst::Create(
llvm_hpvm_bufferPop, ArrayRef<Value *>(isLastInput), "", RI);
CastInst *BI = BitCastInst::CreateIntegerCast(
isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false,
"isLastInput", RI);
isLastInput = BI;
// Create a loop termination condition
CmpInst *Cond = CmpInst::Create(
Instruction::ICmp, CmpInst::ICMP_NE, isLastInput,
Constant::getNullValue(Type::getInt64Ty(CF->getContext())),
"isLastInputNotZero", RI);
// Get input from buffers of all the incoming streaming edges
for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
++i) {
if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
CallInst *bufferIn =
CallInst::Create(llvm_hpvm_bufferPop,
ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI);
CastInst *BI;
if (i->getType()->isPointerTy()) {
BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(),
i->getName() + ".addr", RI);
} else if (i->getType()->isFloatTy()) {
BI = CastInst::CreateFPCast(bufferIn, i->getType(),
i->getName() + ".addr", RI);
} else {
BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false,
i->getName() + ".addr", RI);
}
// Replace the argument in Args vector. We would be using the vector as
// parameters passed to the call
InputArgs[i->getArgNo()] = BI;
}
}
/* Add a call to the generated function of the child node */
DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
// DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
// CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
// C->getGenFunc()->getName()+".output", RI);
Function *CGenF = C->getGenFuncForTarget(hpvm::CPU_TARGET);
DEBUG(errs() << "Type: " << *CGenF->getType() << "\n");
CallInst *CI =
CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI);
/* Add runtime API calls to push output for each of the streaming outputs */
// FIXME: Assumption
// All edges between siblings are streaming edges
DEBUG(errs() << "\tAdd runtime API calls to push output for each of the "
"streaming outputs\n");
for (unsigned i = 0; i < numOutputs; i++) {
// Extract output
ExtractValueInst *EI =
ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI);
// Convert to i64
CastInst *BI;
if (EI->getType()->isPointerTy())
BI =
CastInst::Create(CastInst::PtrToInt, EI,
Type::getInt64Ty(CF_Pipeline->getContext()), "", RI);
else
BI = CastInst::CreateIntegerCast(
EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI);
// Push to Output buffer
Value *bufferOutArgs[] = {OutputArgs[i], BI};
CallInst::Create(llvm_hpvm_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2),
"", RI);
}
// Add a loop around the basic block, which exits when isLastInput is set.
// Pointers to keep the created loop structure
Instruction *CondStartI = cast<Instruction>(isLastInputPop);
Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
addWhileLoop(CondStartI, BodyStartI, RI, Cond);
// Return the Function pointer
DEBUG(errs() << "Pipeline Version of " << CF->getName() << ":\n");
DEBUG(errs() << *CF_Pipeline << "\n");
return CF_Pipeline;
}
void CGT_CPU::codeGen(DFInternalNode *N) {
// Check if N is root node and its graph is streaming. We do not do codeGen
// for Root in such a case
if (N->isRoot() && N->isChildGraphStreaming())
return;
// Check if clone already exists. If it does, it means we have visited this
// function before and nothing else needs to be done for this node.
// if(N->getGenFunc() != NULL)
// return;
if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
<< " : skipping it\n");
return;
}
assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
"Error: Visiting a node for which code already generated\n");
// Sort children in topological order before code generation
N->getChildGraph()->sortChildren();
// Only process if all children have a CPU function
// Otherwise skip to end
bool codeGen = true;
for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
ce = N->getChildGraph()->end();
ci != ce; ++ci) {
DFNode *C = *ci;
// Skip dummy node call
if (C->isDummyNode())
continue;
if (!(C->hasCPUGenFuncForTarget(hpvm::CPU_TARGET))) {
DEBUG(errs() << "No CPU cpu version for child node "
<< C->getFuncPointer()->getName()
<< "\n Skip code gen for parent node "
<< N->getFuncPointer()->getName() << "\n");
codeGen = false;
}
}
if (codeGen) {
Function *F = N->getFuncPointer();
// Create a clone of F with no instructions. Only the type is the same as F,
// without the extra arguments.
Function *F_CPU;
// Clone the function, if we are seeing this function for the first time. We
// only need a clone in terms of type.
ValueToValueMapTy VMap;
// Create new function with the same type
F_CPU = Function::Create(F->getFunctionType(), F->getLinkage(),
F->getName(), &M);
// Loop over the arguments, copying the names of arguments over.
Function::arg_iterator dest_iterator = F_CPU->arg_begin();
for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
i != e; ++i) {
dest_iterator->setName(i->getName()); // Copy the name over...
// Increment dest iterator
++dest_iterator;
}
// Add a basic block to this empty function
BasicBlock *BB = BasicBlock::Create(F_CPU->getContext(), "entry", F_CPU);
ReturnInst *RI = ReturnInst::Create(
F_CPU->getContext(), UndefValue::get(F_CPU->getReturnType()), BB);
// Add Index and Dim arguments, except for the root node or when the child
// graph of the parent node is streaming
if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
F_CPU = addIdxDimArgs(F_CPU);
BB = &*F_CPU->begin();
RI = cast<ReturnInst>(BB->getTerminator());
// Add generated function info to DFNode
// N->setGenFunc(F_CPU, hpvm::CPU_TARGET);
N->addGenFunc(F_CPU, hpvm::CPU_TARGET, true);
// Loop over the arguments, to create the VMap.
dest_iterator = F_CPU->arg_begin();
for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
i != e; ++i) {
// Add mapping and increment dest iterator
VMap[&*i] = &*dest_iterator;
++dest_iterator;
}
// Iterate over children in topological order
for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
ce = N->getChildGraph()->end();
ci != ce; ++ci) {
DFNode *C = *ci;
// Skip dummy node call
if (C->isDummyNode())
continue;
// Create calls to CPU function of child node
invokeChild_CPU(C, F_CPU, VMap, RI);
}
DEBUG(errs() << "*** Generating epilogue code for the function****\n");
// Generate code for output bindings
// Get Exit node
DFNode *C = N->getChildGraph()->getExit();
// Get OutputType of this node
StructType *OutTy = N->getOutputType();
Value *retVal = UndefValue::get(F_CPU->getReturnType());
// Find all the input edges to exit node
for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
DEBUG(errs() << "Output Edge " << i << "\n");
// Find the incoming edge at the requested input port
DFEdge *E = C->getInDFEdgeAt(i);
assert(E && "No Binding for output element!");
// Find the Source DFNode associated with the incoming edge
DFNode *SrcDF = E->getSourceDF();
DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
<< "\n");
// If Source DFNode is a dummyNode, edge is from parent. Get the
// argument from argument list of this internal node
Value *inputVal;
if (SrcDF->isEntryNode()) {
inputVal = getArgumentAt(F_CPU, i);
DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
} else {
// edge is from a sibling node
// Check - code should already be generated for this source dfnode
assert(OutputMap.count(SrcDF) &&
"Source node call not found. Dependency violation!");
// Find Output Value associated with the Source DFNode using OutputMap
Value *CI = OutputMap[SrcDF];
// Extract element at source position from this call instruction
std::vector<unsigned> IndexList;
IndexList.push_back(E->getSourcePosition());
DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI
<< "\n");
ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
inputVal = EI;
}
std::vector<unsigned> IdxList;
IdxList.push_back(i);
retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
}
DEBUG(errs() << "Extracted all\n");
retVal->setName("output");
ReturnInst *newRI = ReturnInst::Create(F_CPU->getContext(), retVal);
ReplaceInstWithInst(RI, newRI);
}
//-------------------------------------------------------------------------//
// Here, we need to check if this node (N) has more than one version.
// If so, we query the policy and emit a call to each version.
// If not, we see which version exists, check that it is in fact a cpu
// function and save it as the CPU_TARGET function
// TODO: hpvm_id per node, so we can use this as an id for policies
// For now, use the node function name and change it later
Function *CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
Function *GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
bool CFcpu = N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET);
bool GFcpu = N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET);
DEBUG(errs() << "Before editing\n");
DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
<< N->getTag() << "\n");
DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
DEBUG(errs() << "hascpuGenFuncForCPU : " << CFcpu << "\n");
DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
DEBUG(errs() << "hascpuGenFuncForGPU : " << GFcpu << "\n");
if (N->getTag() == hpvm::None) {
// No code is available for this node. This (usually) means that either
// - the accelerator backends mapped this node to an intermediate
// node, and thus did not produce a genFunc, or
// - a child node had no CPU hint, so no CPU code generation could
// take place
DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node "
<< N->getFuncPointer()->getName() << "\n");
} else if (hpvmUtils::isSingleTargetTag(N->getTag())) {
// There is a single version for this node according to code gen hints.
// Therefore, we do not need to check the policy, we simply use the
// available implementation, whichever target it is for.
// Sanity check - to be removed TODO
switch (N->getTag()) {
case hpvm::CPU_TARGET:
assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) && "");
assert(N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET) && "");
assert(!(N->getGenFuncForTarget(hpvm::GPU_TARGET)) && "");
assert(!(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET)) && "");
break;
case hpvm::GPU_TARGET:
assert(!(N->getGenFuncForTarget(hpvm::CPU_TARGET)) && "");
assert(!(N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET)) && "");
assert(N->getGenFuncForTarget(hpvm::GPU_TARGET) && "");
assert(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET) && "");
break;
default:
assert(false && "Unreachable: we checked that tag was single target!\n");
break;
}
N->addGenFunc(N->getGenFuncForTarget(N->getTag()), hpvm::CPU_TARGET, true);
N->removeGenFuncForTarget(hpvm::GPU_TARGET);
N->setTag(hpvm::CPU_TARGET);
// Sanity checks - to be removed TODO
CF = N->getGenFuncForTarget(hpvm::CPU_TARGET);
GF = N->getGenFuncForTarget(hpvm::GPU_TARGET);
CFcpu = N->hasCPUGenFuncForTarget(hpvm::CPU_TARGET);
GFcpu = N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET);
DEBUG(errs() << "After editing\n");
DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
<< N->getTag() << "\n");
DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
DEBUG(errs() << "hascpuGenFuncForCPU : " << CFcpu << "\n");
DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
DEBUG(errs() << "hascpuGenFuncForGPU : " << GFcpu << "\n");
} else {
assert(false && "Multiple tags unsupported!");
}
}
// Code generation for leaf nodes
void CGT_CPU::codeGen(DFLeafNode *N) {
// Skip code generation if it is a dummy node
if (N->isDummyNode()) {
DEBUG(errs() << "Skipping dummy node\n");
return;
}
// At this point, the CPU backend does not support code generation for
// the case where an allocation node is used, so we skip it. This means that a
// CPU version will not be created, and therefore code generation will
// only succeed if another backend (opencl or spir) has been invoked to
// generate a node function for the node including the allocation node.
if (N->isAllocationNode()) {
DEBUG(errs() << "Skipping allocation node\n");
return;
}
// Check if clone already exists. If it does, it means we have visited this
// function before and nothing else needs to be done for this leaf node.
// if(N->getGenFunc() != NULL)
// return;
if (!preferredTargetIncludes(N, hpvm::CPU_TARGET)) {
DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
<< " : skipping it\n");
switch (N->getTag()) {
case hpvm::GPU_TARGET:
{
// A leaf node should not have a cpu function for GPU
// by design of DFG2LLVM_OpenCL backend
assert(!(N->hasCPUGenFuncForTarget(hpvm::GPU_TARGET)) &&
"Leaf node not expected to have GPU GenFunc");
break;
}
case hpvm::CUDNN_TARGET:
{
errs() << "CUDNN hint found. Store CUDNN function as CPU funtion.\n";
// Make sure there is a generated CPU function for cudnn
assert(N->getGenFuncForTarget(hpvm::CUDNN_TARGET) && "");
assert(N->hasCPUGenFuncForTarget(hpvm::CUDNN_TARGET) && "");
// Store the CUDNN x86 function as the CPU generated function
Function *Ftmp = N->getGenFuncForTarget(N->getTag());
// after adding the required number of arguments
if (!N->getParent()->isChildGraphStreaming()) {
Ftmp = addIdxDimArgs(Ftmp);
}
N->removeGenFuncForTarget(hpvm::CUDNN_TARGET);
N->setTag(hpvm::None);
N->addGenFunc(Ftmp, hpvm::CPU_TARGET, true);
N->setTag(hpvm::CPU_TARGET);
break;
}
case hpvm::TENSOR_TARGET:
{
errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n";
// Make sure there is a generated x86 function for promise
assert(N->getGenFuncForTarget(hpvm::TENSOR_TARGET) && "");
assert(N->hasCPUGenFuncForTarget(hpvm::TENSOR_TARGET) && "");
// Store the PROMISE x86 function as the CPU generated function
Function *Ftmp = N->getGenFuncForTarget(N->getTag());
// after adding the required number of arguments
if (!N->getParent()->isChildGraphStreaming()) {
Ftmp = addIdxDimArgs(Ftmp);
}
N->setTag(hpvm::None);
N->removeGenFuncForTarget(hpvm::TENSOR_TARGET);
N->addGenFunc(Ftmp, hpvm::CPU_TARGET, true);
N->setTag(hpvm::CPU_TARGET);
break;
}
default:
{
break;
}
}
return;
}
assert(N->getGenFuncForTarget(hpvm::CPU_TARGET) == NULL &&
"Error: Visiting a node for which code already generated\n");
std::vector<IntrinsicInst *> IItoRemove;
std::vector<std::pair<IntrinsicInst *, Value *>> IItoReplace;
BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
// Get the function associated with the dataflow node
Function *F = N->getFuncPointer();
// Clone the function, if we are seeing this function for the first time.
Function *F_CPU;
ValueToValueMapTy VMap;
F_CPU = CloneFunction(F, VMap);
F_CPU->removeFromParent();
// Insert the cloned function into the module
M.getFunctionList().push_back(F_CPU);
// Add the new arguments to the argument list. Add arguments only if the child
// graph of the parent node is not streaming
if (!N->getParent()->isChildGraphStreaming())
F_CPU = addIdxDimArgs(F_CPU);
// Add generated function info to DFNode
// N->setGenFunc(F_CPU, hpvm::CPU_TARGET);
N->addGenFunc(F_CPU, hpvm::CPU_TARGET, true);
// Go through the arguments; any pointer argument with the 'in' attribute needs
// a cpu_argument_ptr call to get the cpu pointer of the argument
// Insert these calls in a new BB which would dominate all other BBs
// Create new BB
BasicBlock *EntryBB = &*F_CPU->begin();
BasicBlock *BB =
BasicBlock::Create(M.getContext(), "getHPVMPtrArgs", F_CPU, EntryBB);
BranchInst *Terminator = BranchInst::Create(EntryBB, BB);
// Insert calls
for (Function::arg_iterator ai = F_CPU->arg_begin(), ae = F_CPU->arg_end();
ai != ae; ++ai) {
if (F_CPU->getAttributes().hasAttribute(ai->getArgNo() + 1,
Attribute::In)) {
assert(ai->getType()->isPointerTy() &&
"Only pointer arguments can have hpvm in/out attributes ");
Function::arg_iterator aiNext = ai;
++aiNext;
Argument *size = &*aiNext;
assert(size->getType() == Type::getInt64Ty(M.getContext()) &&
"Next argument after a pointer should be an i64 type");
CastInst *BI = BitCastInst::CreatePointerCast(
&*ai, Type::getInt8PtrTy(M.getContext()), ai->getName() + ".i8ptr",
Terminator);
Value *ArgPtrCallArgs[] = {BI, size};
CallInst::Create(llvm_hpvm_cpu_argument_ptr,
ArrayRef<Value *>(ArgPtrCallArgs, 2), "", Terminator);
}
}
DEBUG(errs() << *BB << "\n");
// Go through all the instructions
for (inst_iterator i = inst_begin(F_CPU), e = inst_end(F_CPU); i != e; ++i) {
Instruction *I = &(*i);
DEBUG(errs() << *I << "\n");
// Leaf nodes should not contain HPVM graph intrinsics or launch
assert(!BuildDFG::isHPVMLaunchIntrinsic(I) &&
"Launch intrinsic within a dataflow graph!");
assert(!BuildDFG::isHPVMGraphIntrinsic(I) &&
"HPVM graph intrinsic within a leaf dataflow node!");
if (BuildDFG::isHPVMQueryIntrinsic(I)) {
IntrinsicInst *II = cast<IntrinsicInst>(I);
IntrinsicInst *ArgII;
DFNode *ArgDFNode;
/***********************************************************************
* Handle HPVM Query intrinsics *
***********************************************************************/
switch (II->getIntrinsicID()) {
/**************************** llvm.hpvm.getNode() *******************/
case Intrinsic::hpvm_getNode: {
// add mapping <intrinsic, this node> to the node-specific map
Leaf_HandleToDFNodeMap[II] = N;
IItoRemove.push_back(II);
break;
}
/************************* llvm.hpvm.getParentNode() ****************/
case Intrinsic::hpvm_getParentNode: {
// get the parent node of the arg node
// get argument node
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
// get the parent node of the arg node
ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
// Add mapping <intrinsic, parent node> to the node-specific map
// the argument node must have been added to the map, or else the
// code could not refer to it
Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
IItoRemove.push_back(II);
break;
}
/*************************** llvm.hpvm.getNumDims() *****************/
case Intrinsic::hpvm_getNumDims: {
// get node from map
// get the appropriate field
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
int numOfDim = Leaf_HandleToDFNodeMap[ArgII]->getNumOfDim();
IntegerType *IntTy = Type::getInt32Ty(M.getContext());
ConstantInt *numOfDimConstant =
ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
II->replaceAllUsesWith(numOfDimConstant);
IItoRemove.push_back(II);
break;
}
/*********************** llvm.hpvm.getNodeInstanceID() **************/
case Intrinsic::hpvm_getNodeInstanceID_x:
case Intrinsic::hpvm_getNodeInstanceID_y:
case Intrinsic::hpvm_getNodeInstanceID_z: {
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
// The dfnode argument should be an ancestor of this leaf node or
// the leaf node itself
int parentLevel = N->getAncestorHops(ArgDFNode);
assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
"Invalid DFNode argument to getNodeInstanceID_[xyz]!");
// Get specified dimension
// (dim = 0) => x
// (dim = 1) => y
// (dim = 2) => z
int dim =
(int)(II->getIntrinsicID() - Intrinsic::hpvm_getNodeInstanceID_x);
assert((dim >= 0) && (dim < 3) &&
"Invalid dimension for getNodeInstanceID_[xyz]. Check Intrinsic "
"ID!");
// For immediate ancestor, use the extra argument introduced in
// F_CPU
int numParamsF = F->getFunctionType()->getNumParams();
int numParamsF_CPU = F_CPU->getFunctionType()->getNumParams();
assert(
(numParamsF_CPU - numParamsF == 6) &&
"Difference of arguments between function and its clone is not 6!");
if (parentLevel == 0) {
// Case when the query is for this node itself
unsigned offset = 3 + (3 - dim);
// Traverse argument list of F_CPU in reverse order to find the
// correct index or dim argument.
Argument *indexVal = getArgumentFromEnd(F_CPU, offset);
assert(indexVal && "Index argument not found. Invalid offset!");
DEBUG(errs() << *II << " replaced with " << *indexVal << "\n");
II->replaceAllUsesWith(indexVal);
IItoRemove.push_back(II);
} else {
// Case when query is for an ancestor
Value *args[] = {
ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
CallInst *CI = CallInst::Create(llvm_hpvm_cpu_getDimInstance,
ArrayRef<Value *>(args, 2),
"nodeInstanceID", II);
DEBUG(errs() << *II << " replaced with " << *CI << "\n");
II->replaceAllUsesWith(CI);
IItoRemove.push_back(II);
}
break;
}
/********************** llvm.hpvm.getNumNodeInstances() *************/
case Intrinsic::hpvm_getNumNodeInstances_x:
case Intrinsic::hpvm_getNumNodeInstances_y:
case Intrinsic::hpvm_getNumNodeInstances_z: {
ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
// The dfnode argument should be an ancestor of this leaf node or
// the leaf node itself
int parentLevel = N->getAncestorHops(ArgDFNode);
assert((parentLevel >= 0 || ArgDFNode == (DFNode *)N) &&
"Invalid DFNode argument to getNodeInstanceID_[xyz]!");
// Get specified dimension
// (dim = 0) => x
// (dim = 1) => y
// (dim = 2) => z
int dim =
(int)(II->getIntrinsicID() - Intrinsic::hpvm_getNumNodeInstances_x);
assert((dim >= 0) && (dim < 3) &&
"Invalid dimension for getNumNodeInstances_[xyz]. Check "
"Intrinsic ID!");
// For immediate ancestor, use the extra argument introduced in
// F_CPU
int numParamsF = F->getFunctionType()->getNumParams();
int numParamsF_CPU = F_CPU->getFunctionType()->getNumParams();
assert(
(numParamsF_CPU - numParamsF == 6) &&
"Difference of arguments between function and its clone is not 6!");
if (parentLevel == 0) {
// Case when the query is for this node itself
unsigned offset = 3 - dim;
// Traverse argument list of F_CPU in reverse order to find the
// correct index or dim argument.
Argument *limitVal = getArgumentFromEnd(F_CPU, offset);
assert(limitVal && "Limit argument not found. Invalid offset!");
DEBUG(errs() << *II << " replaced with " << *limitVal << "\n");
II->replaceAllUsesWith(limitVal);
IItoRemove.push_back(II);
} else {
// Case when query is from the ancestor
Value *args[] = {
ConstantInt::get(Type::getInt32Ty(II->getContext()), parentLevel),
ConstantInt::get(Type::getInt32Ty(II->getContext()), dim)};
CallInst *CI = CallInst::Create(llvm_hpvm_cpu_getDimLimit,
ArrayRef<Value *>(args, 2),
"numNodeInstances", II);
DEBUG(errs() << *II << " replaced with " << *CI << "\n");
II->replaceAllUsesWith(CI);
IItoRemove.push_back(II);
}
break;
}
default:
DEBUG(errs() << "Found unknown intrinsic with ID = "
<< II->getIntrinsicID() << "\n");
assert(false && "Unknown HPVM Intrinsic!");
break;
}
} else {
}
}
// Remove the collected intrinsics, replacing their uses with undef first
for (std::vector<IntrinsicInst *>::iterator i = IItoRemove.begin();
i != IItoRemove.end(); ++i) {
(*i)->replaceAllUsesWith(UndefValue::get((*i)->getType()));
(*i)->eraseFromParent();
}
DEBUG(errs() << *F_CPU);
}
} // End of namespace
char DFG2LLVM_CPU::ID = 0;
static RegisterPass<DFG2LLVM_CPU>
X("dfg2llvm-cpu", "Dataflow Graph to LLVM for CPU backend",
false /* does not modify the CFG */,
true /* transformation, not just analysis */);