extern crate bitvec;
extern crate hercules_ir;
extern crate hercules_rt;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::iter::zip;
use std::fmt::Write;
use self::bitvec::prelude::*;
use self::hercules_ir::*;
use self::hercules_rt::manifest::*;
use crate::*;
/*
* When assembling LLVM basic blocks, we traverse the nodes in a partition in an
* ad-hoc order. Thus, we cannot assume block terminators will be visited after
* data nodes, for example. However, textual LLVM IR requires that the
* terminator instruction is last. So, we emit nodes into separate strings of
* LLVM IR that will get stitched together when the block is complete.
*/
#[derive(Debug)]
struct LLVMBlock {
header: String,
phis: String,
data: String,
terminator: String,
}
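// As an illustrative sketch (block and register numbers hypothetical), a
// completed block is printed as header ++ phis ++ data ++ terminator:
//
//   bb_4:
//   %virt.7 = phi i32 [ %virt.5, %bb_2 ], [ %virt.6, %bb_3 ]
//   %virt.9 = add i32 %virt.7, %virt.8
//   br label %bb_5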
impl<'a> FunctionContext<'a> {
/*
* Top level function to generate code for a partition, targeting the CPU.
*/
pub(crate) fn codegen_cpu_partition<W: Write>(
&self,
top_node: NodeID,
w: &mut W,
) -> Result<PartitionManifest, std::fmt::Error> {
// Step 1: do some analysis to get a bunch of per-partition information.
let partition_id = self.plan.partitions[top_node.idx()];
let partition_context = PartitionContext::new(self, partition_id, top_node);
// Step 2: emit the function signature. The partition function
// parameters are the function parameters, the partition data inputs,
// the array constant pointers, and the dynamic constants.
let mut partition_function_parameters = partition_context
// The data inputs to this partition. These are the data values
// calculated in a different partition in the same function.
.partition_input_types
.iter()
.enumerate()
.map(|(idx, ty_id)| {
(
self.llvm_types[ty_id.idx()].clone(),
format!("%part_arg.{}", idx),
)
})
// The input types of the overall function.
.chain(
self.function
.param_types
.iter()
.enumerate()
.map(|(idx, ty_id)| {
(
self.llvm_types[ty_id.idx()].clone(),
format!("%func_arg.{}", idx),
)
}),
)
// Array constants are passed in, pre-initialized.
.chain(
self.partition_array_constant_inputs()
.into_iter()
.map(|(id, ty_id)| {
(
self.llvm_types[ty_id.idx()].clone(),
format!("%cons.{}", id.idx()),
)
}),
)
// Dynamic constants are passed in, since they are only known right
// before runtime.
.chain(
self.partition_dynamic_constant_inputs()
.into_iter()
.map(|id| ("i64".to_string(), format!("%dyn_cons.{}", id.idx()))),
);
write!(
w,
"define {} @{}_part_{}(",
generate_type_string(&partition_context.return_type, &self.llvm_types),
self.function.name,
partition_id.idx(),
)?;
let (first_ty, first_param) = partition_function_parameters.next().unwrap();
write!(w, "{} {}", first_ty, first_param)?;
for (ty, param) in partition_function_parameters {
write!(w, ", {} {}", ty, param)?;
}
write!(w, ") {{\n")?;
// Step 3: set up basic blocks. A node represents a basic block if its
// entry in the basic blocks vector points to itself.
let mut llvm_bbs = HashMap::new();
for id in &self.partitions_inverted_map[partition_id.idx()] {
if self.bbs[id.idx()] == *id {
llvm_bbs.insert(
id,
LLVMBlock {
header: format!("bb_{}:\n", id.idx()),
phis: "".to_string(),
data: "".to_string(),
terminator: "".to_string(),
},
);
}
}
// Step 4: emit nodes. Nodes are emitted into basic blocks separately,
// since nodes are not necessarily emitted in order. Assemble a worklist
// of nodes, seeded with the reverse post-order of the nodes. For non-phi
// and non-reduce nodes, only emit once all data uses have been emitted.
// In addition, consider additional anti-dependence edges from read to
// write nodes.
let mut visited = bitvec![u8, Lsb0; 0; self.function.nodes.len()];
let mut worklist = VecDeque::from(partition_context.reverse_postorder.clone());
while let Some(id) = worklist.pop_front() {
if !(self.function.nodes[id.idx()].is_phi()
|| self.function.nodes[id.idx()].is_reduce())
&& !get_uses(&self.function.nodes[id.idx()])
.as_ref()
.into_iter()
// If this node isn't a phi or reduce, we need to check that
// all uses, as well as all reads we anti-depend with, have
// been emitted.
.chain(self.antideps.iter().filter_map(|(read, write)| {
if id == *write {
Some(read)
} else {
None
}
}))
// Only data dependencies inside this partition need to have
// already been visited.
.all(|id| {
self.plan.partitions[id.idx()] != partition_id
|| self.function.nodes[id.idx()].is_control()
|| visited[id.idx()]
})
{
// Skip emitting this node for now if it isn't a phi or reduce
// node and not all of its data uses have been emitted yet.
worklist.push_back(id);
} else {
// Once all of the data dependencies for this node are emitted,
// this node can be emitted. For reduce nodes specifically, we
// want to emit the phi in the fork's basic block, not the
// join's, so we handle that ugly case here. This is because
// there is a fundamental mismatch between Hercules' notion of
// reductions and LLVM's phi nodes. This is ok, since we can
// translate between the two. It's just a pain.
let bb = if let Node::Reduce {
control,
init: _,
reduct: _,
} = self.function.nodes[id.idx()]
{
// Figure out the fork corresponding to the associated join.
let fork_id = if let Node::Join { control } = self.function.nodes[control.idx()]
{
if let Type::Control(factors) =
&self.types[self.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!("PANIC: Type of join node associated with reduce node is not a control type.")
}
} else {
panic!("PANIC: Node associated with reduce node isn't a join node.")
};
// Emit in the basic block of the fork.
llvm_bbs.get_mut(&self.bbs[fork_id.idx()]).unwrap()
} else {
// In the normal case, emit in the basic block the node has
// been actually assigned to.
llvm_bbs.get_mut(&self.bbs[id.idx()]).unwrap()
};
partition_context.codegen_cpu_node(id, bb)?;
visited.set(id.idx(), true);
}
}
// Step 5: emit the now completed basic blocks, in order. Emit a dummy
// header block to unconditionally jump to the "top" basic block.
write!(w, "bb_header:\n br label %bb_{}\n", top_node.idx())?;
for id in partition_context.reverse_postorder {
if self.bbs[id.idx()] == id {
write!(
w,
"{}{}{}{}",
llvm_bbs[&id].header,
llvm_bbs[&id].phis,
llvm_bbs[&id].data,
llvm_bbs[&id].terminator
)?;
}
}
// Step 6: close the partition function - we're done. The partition
// manifest is created by the partition context.
write!(w, "}}\n\n")?;
Ok(partition_context.manifest)
}
}
impl<'a> PartitionContext<'a> {
/*
* Emit LLVM IR implementing a single node.
*/
fn codegen_cpu_node(&self, id: NodeID, bb: &mut LLVMBlock) -> std::fmt::Result {
// Helper to emit code to index a collection. All collections are
// pointers to some memory at the LLVM IR level. This memory is passed
// in as a parameter for anything involving arrays, and is alloca-ed for
// product and summation types.
let mut generate_index_code = |collect: NodeID, indices: &[Index]| -> std::fmt::Result {
// Step 1: calculate the list of collection types corresponding to
// each index.
let mut collection_ty_ids = vec![];
let mut curr_ty_id = self.function.typing[collect.idx()];
for index in indices {
match (index, &self.function.types[curr_ty_id.idx()]) {
(Index::Field(idx), Type::Product(ty_ids))
| (Index::Variant(idx), Type::Summation(ty_ids)) => {
collection_ty_ids.push(curr_ty_id);
curr_ty_id = ty_ids[*idx];
}
(Index::Position(_), Type::Array(elem_ty_id, _)) => {
collection_ty_ids.push(curr_ty_id);
curr_ty_id = *elem_ty_id;
}
_ => {
panic!("PANIC: Found unsupported combination of index and collection type.")
}
}
}
assert!(
self.function.types[curr_ty_id.idx()].is_primitive(),
"PANIC: Cannot generate partial indexing code."
);
// Step 2: calculate, as LLVM IR values, the stride and offset
// needed at each level of the collection. For products, the stride
// is calculated using a getelementptr hack (and is the size of the
// struct), and the offset corresponds to the field index (which is
// translated to an offset using another getelementptr hack). For
// arrays, the stride is the dynamic constant extent multiplied by
// the stride of the element type, and the offset is the position
// index multiplied by the stride of the element type. Additionally,
// emit code to add up all of the offsets to get a total offset into
// the collection. TODO: to support summations, and arrays in
// arbitrary places, we need to not use the hacky getelementptr
// technique, since LLVM IR can't represent arrays (in the Hercules
// sense) or summations as primitive types. Instead, we need to do
// collection memory layout entirely ourselves.
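// As a sketch of what this emits (type and register names hypothetical),
// the size-of hack for a product type { i64, float } expands to:
//
//   %index3.1.stride.ptrhack = getelementptr { i64, float }, ptr null, i64 1
//   %index3.1.stride = ptrtoint ptr %index3.1.stride.ptrhack to i64
//
// and the offset of field 1 uses the same trick with a field index:
//
//   %index3.1.offset.ptrhack = getelementptr { i64, float }, ptr null, i64 0, i64 1
//   %index3.1.offset = ptrtoint ptr %index3.1.offset.ptrhack to i64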
let elem_llvm_ty = &self.function.llvm_types[curr_ty_id.idx()];
write!(bb.data, " %index{}.{}.total_offset = add i64 0, 0\n %index{}.{}.stride.ptrhack = getelementptr {}, ptr null, i64 1\n %index{}.{}.stride = ptrtoint ptr %index{}.{}.stride.ptrhack to i64\n",
id.idx(), indices.len(), id.idx(), indices.len(), elem_llvm_ty, id.idx(), indices.len(), id.idx(), indices.len()
)?;
for (idx, index) in indices.into_iter().enumerate().rev() {
match index {
Index::Field(field) => {
let product_llvm_ty =
&self.function.llvm_types[collection_ty_ids[idx].idx()];
write!(
bb.data,
" %index{}.{}.stride.ptrhack = getelementptr {}, ptr null, i64 1\n %index{}.{}.stride = ptrtoint ptr %index{}.{}.stride.ptrhack to i64\n %index{}.{}.offset.ptrhack = getelementptr {}, ptr null, i64 0, i64 {}\n %index{}.{}.offset = ptrtoint ptr %index{}.{}.offset.ptrhack to i64\n",
id.idx(), idx,
product_llvm_ty,
id.idx(), idx,
id.idx(), idx,
id.idx(), idx,
product_llvm_ty,
field,
id.idx(), idx,
id.idx(), idx,
)?;
}
Index::Variant(_) => todo!(),
Index::Position(position) => {
let array_extents = self.function.types[collection_ty_ids[idx].idx()]
.try_extents()
.unwrap();
// TODO: calculate stride for arrays, needed for arrays
// nested in other collections.
write!(bb.data, " %index{}.{}.offset.add.0 = add ", id.idx(), idx)?;
self.cpu_emit_use_of_node(position[0], Some(id), true, &mut bb.data)?;
write!(bb.data, ", {}\n", 0)?;
for (dim_idx, (extent_dc_id, position_id)) in
zip(array_extents, position.into_iter()).enumerate().skip(1)
{
write!(
bb.data,
" %index{}.{}.offset.mul.{} = mul i64 {}, %index{}.{}.offset.add.{}\n",
id.idx(), idx,
dim_idx,
self.function.llvm_dynamic_constants[extent_dc_id.idx()],
id.idx(), idx,
dim_idx - 1
)?;
write!(
bb.data,
" %index{}.{}.offset.add.{} = add ",
id.idx(),
idx,
dim_idx
)?;
self.cpu_emit_use_of_node(*position_id, Some(id), true, &mut bb.data)?;
write!(
bb.data,
", %index{}.{}.offset.mul.{}\n",
id.idx(),
idx,
dim_idx
)?;
}
write!(bb.data, " %index{}.{}.offset = mul i64 %index{}.{}.stride, %index{}.{}.offset.add.{}\n", id.idx(), idx, id.idx(), idx + 1, id.idx(), idx, position.len() - 1)?;
}
Index::Control(_) => panic!(
"PANIC: Found control index when generating collection indexing code."
),
}
write!(
bb.data,
" %index{}.{}.total_offset = add i64 %index{}.{}.total_offset, %index{}.{}.offset\n",
id.idx(), idx,
id.idx(), idx + 1,
id.idx(), idx
)?;
}
// Step 3: emit the getelementptr using the total collection offset.
write!(bb.data, " %index{} = getelementptr i8, ", id.idx(),)?;
self.cpu_emit_use_of_node(collect, Some(id), true, &mut bb.data)?;
write!(bb.data, ", i64 %index{}.0.total_offset\n", id.idx())?;
Ok(())
};
// Helper to find the basic block corresponding to a particular control
// predecessor, for phi nodes. This is needed for when a predecessor
// basic block is in a different partition. In this case, the phi's
// control predecessor is set to the top block of the partition.
let get_phi_predecessor = |pred_id: NodeID| {
if self.function.plan.partitions[pred_id.idx()] == self.partition_id {
format!("{}", self.function.bbs[pred_id.idx()].idx())
} else {
format!("header")
}
};
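// For example (numbers hypothetical): a predecessor inside this partition
// yields "7", so the phi edge below reads "%bb_7"; a predecessor in a
// different partition yields "header", so the edge reads "%bb_header",
// the dummy entry block emitted by codegen_cpu_partition.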
// Emit the primary IR for each node.
match self.function.function.nodes[id.idx()] {
Node::Start | Node::Region { preds: _ } => {
// Basic blocks containing a start or region node branch
// unconditionally to their single successor.
let successor = self
.function
.def_use
.get_users(id)
.iter()
.filter(|id| self.function.function.nodes[id.idx()].is_strictly_control())
.next()
.unwrap();
bb.terminator = format!(" br label %bb_{}\n", successor.idx());
}
Node::If { control: _, cond } => {
let successors = self.function.def_use.get_users(id);
// Determine the order of the successors (true/false or false/
// true) in the successors slice.
let rev = if let Node::Read {
collect: _,
indices,
} = &self.function.function.nodes[successors[0].idx()]
{
indices[0] != Index::Control(0)
} else {
panic!("PANIC: Successor of if node isn't a read node.")
};
bb.terminator = " br ".to_string();
self.cpu_emit_use_of_node(cond, Some(id), true, &mut bb.terminator)?;
write!(
bb.terminator,
", label %bb_{}, label %bb_{}\n",
successors[(!rev) as usize].idx(),
successors[rev as usize].idx()
)?;
}
Node::Fork { control, factor: _ } => {
// Calculate the join and successor.
let join = self.function.fork_join_map[&id];
let successor = self
.function
.def_use
.get_users(id)
.iter()
.filter(|id| self.function.function.nodes[id.idx()].is_strictly_control())
.next()
.unwrap();
// Create the phi node for the loop index. This is used directly
// by any thread ID user nodes. The control predecessor basic
// blocks are the control node preceding the fork and the
// corresponding join.
write!(bb.phis, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.phis)?;
write!(
bb.phis,
" = phi i64 [ 0, %bb_{} ], [ %fork_inc{}, %bb_{} ]\n",
get_phi_predecessor(self.function.bbs[control.idx()]),
id.idx(),
get_phi_predecessor(self.function.bbs[join.idx()]),
)?;
// Increment the loop index by one each iteration.
write!(bb.data, " %fork_inc{} = add i64 1, ", id.idx())?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, "\n")?;
// Branch to the successor basic block.
write!(
bb.terminator,
" br label %bb_{}\n",
self.function.bbs[successor.idx()].idx()
)?;
}
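// As an illustrative sketch (node and block numbers hypothetical), a fork
// with ID 10, predecessor block bb_2, join block bb_15, and successor
// block bb_11 lowers to:
//
//   %virt.10 = phi i64 [ 0, %bb_2 ], [ %fork_inc10, %bb_15 ]
//   %fork_inc10 = add i64 1, %virt.10
//   br label %bb_11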
Node::Join { control } => {
// Get the fork, its factor, and the successor to this join.
let fork_id = if let Type::Control(factors) =
&self.function.types[self.function.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!("PANIC: The type of a join node is incorrect.")
};
let factor = if let Node::Fork { control: _, factor } =
&self.function.function.nodes[fork_id.idx()]
{
*factor
} else {
panic!("PANIC: The node referenced by the control type of a join node is not a fork.")
};
let successor = self
.function
.def_use
.get_users(id)
.iter()
.filter(|id| self.function.function.nodes[id.idx()].is_strictly_control())
.next()
.unwrap();
// Form the bottom of the loop. Check if the loop is finished,
// and branch between the successor and the fork. The structure
// of this loop implies that fork-joins have to iterate at least
// once. Change the loop termination branch target if this is a
// control return (see comment below for more details).
let is_control_return = self.control_returns.contains(&id);
write!(
bb.terminator,
" %join_cond{} = icmp ult i64 %fork_inc{}, {}\n",
id.idx(),
fork_id.idx(),
self.function.llvm_dynamic_constants[factor.idx()]
)?;
write!(
bb.terminator,
" br i1 %join_cond{}, label %bb_{}, label %bb_{}\n",
id.idx(),
self.function.bbs[fork_id.idx()].idx(),
if is_control_return {
format!("{}_join_cr", id.idx())
} else {
format!("{}", self.function.bbs[successor.idx()].idx())
}
)?;
// Join nodes are the only node that can be a control return
// from a partition and generate a conditional branch. This
// means we have to do this really ugly hack where we insert
// another basic block to be the control return that we
// conditionally branch to. Other control nodes that may be
// control returns don't have this problem, because they always
// unconditionally branch to their destination. We add this LLVM
// IR text of a new basic block in the terminator of the current
// basic block, since we don't have mutable access here to the
// set of all LLVM basic blocks.
if is_control_return {
write!(bb.terminator, "bb_{}_join_cr:\n", id.idx())?;
}
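// Illustrative emitted text for a control-return join (IDs and trip
// count hypothetical):
//
//   %join_cond12 = icmp ult i64 %fork_inc10, 8
//   br i1 %join_cond12, label %bb_10, label %bb_12_join_cr
//   bb_12_join_cr:
//
// The return sequence appended at the bottom of codegen_cpu_node then
// acts as the terminator of this extra block.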
}
Node::Phi {
control: _,
ref data,
} => {
// For each predecessor of the associated region, we determine
// if that predecessor is in this partition or not. If so, then
// the predecessor control is just the basic block of the
// predecessor control node. If not, the predecessor control is
// the first basic block of the partition. The corresponding
// datum also needs to be provided by argument to the partition,
// and this is handled by cpu_emit_use_of_node.
let pred_ids =
get_uses(&self.function.function.nodes[self.function.bbs[id.idx()].idx()]);
let mut control_datum_pairs = zip(data.into_iter(), pred_ids.as_ref().iter())
.map(|(datum, pred_id)| (*datum, get_phi_predecessor(*pred_id)));
// TODO: this code burns my eyes to look at, it might be worth
// making this not carcinogenic.
write!(bb.phis, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.phis)?;
write!(
bb.phis,
" = phi {} [ ",
self.function.llvm_types[self.function.typing[id.idx()].idx()]
)?;
let (first_data, first_control) = control_datum_pairs.next().unwrap();
self.cpu_emit_use_of_node(first_data, Some(id), false, &mut bb.phis)?;
write!(bb.phis, ", %bb_{} ]", first_control)?;
for (data, control) in control_datum_pairs {
write!(bb.phis, ", [ ")?;
self.cpu_emit_use_of_node(data, Some(id), false, &mut bb.phis)?;
write!(bb.phis, ", %bb_{} ]", control)?;
}
write!(bb.phis, "\n")?;
}
Node::ThreadID { control } => {
// Just bitcast the loop index from the fork. The bitcast is a
// no-op, but we add it to copy the value from the virtual
// register the fork generates to the virtual register
// corresponding to this thread ID node.
assert!(self.function.function.nodes[control.idx()].is_fork());
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, " = bitcast i64 ",)?;
self.cpu_emit_use_of_node(control, Some(id), false, &mut bb.data)?;
write!(bb.data, " to i64\n",)?;
}
Node::Reduce {
control,
init,
reduct,
} => {
// Figure out the fork corresponding to the associated join.
let fork_id = if let Node::Join { control } =
self.function.function.nodes[control.idx()]
{
if let Type::Control(factors) =
&self.function.types[self.function.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!("PANIC: Type of join node associated with reduce node is not a control type.")
}
} else {
panic!("PANIC: Node associated with reduce node isn't a join node.")
};
// Figure out the fork's predecessor.
let pred = if let Node::Fork { control, factor: _ } =
self.function.function.nodes[fork_id.idx()]
{
control
} else {
panic!("PANIC: Node referenced in type of join node associated with a reduce node is not a fork node.")
};
// Reduce nodes just lower to phi nodes. We already did the ugly
// hack so that "bb" refers to the basic block of the fork,
// rather than the join. So, now we just need to emit the phi.
write!(bb.phis, " ")?;
self.cpu_emit_use_of_node(id, Some(id), false, &mut bb.phis)?;
write!(
bb.phis,
" = phi {} [ ",
self.function.llvm_types[self.function.typing[id.idx()].idx()]
)?;
self.cpu_emit_use_of_node(init, Some(id), false, &mut bb.phis)?;
write!(
bb.phis,
", %bb_{} ], [ ",
get_phi_predecessor(self.function.bbs[pred.idx()])
)?;
self.cpu_emit_use_of_node(reduct, Some(id), false, &mut bb.phis)?;
write!(
bb.phis,
", %bb_{} ]\n",
get_phi_predecessor(self.function.bbs[control.idx()])
)?;
}
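// Sketch of the emitted phi (register and block numbers hypothetical),
// placed in the fork's block rather than the join's:
//
//   %virt.21 = phi i64 [ %virt.20, %bb_3 ], [ %virt.24, %bb_18 ]
//
// where %virt.20 is the init value (incoming from the fork's predecessor)
// and %virt.24 is the reduct value (incoming from the join's block).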
// These nodes are handled by other mechanisms in the code lowering
// process.
Node::Return {
control: _,
data: _,
}
| Node::Parameter { index: _ }
| Node::Constant { id: _ }
| Node::DynamicConstant { id: _ } => {}
Node::Binary { left, right, op } => {
let op = match op {
BinaryOperator::Add => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fadd"
} else {
"add"
}
}
BinaryOperator::Sub => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fsub"
} else {
"sub"
}
}
BinaryOperator::Mul => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fmul"
} else {
"mul"
}
}
BinaryOperator::Div => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fdiv"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"udiv"
} else {
"sdiv"
}
}
BinaryOperator::Rem => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"frem"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"urem"
} else {
"srem"
}
}
BinaryOperator::LT => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp olt"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp ult"
} else {
"icmp slt"
}
}
BinaryOperator::LTE => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp ole"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp ule"
} else {
"icmp sle"
}
}
BinaryOperator::GT => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp ogt"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp ugt"
} else {
"icmp sgt"
}
}
BinaryOperator::GTE => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp oge"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp uge"
} else {
"icmp sge"
}
}
BinaryOperator::EQ => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp oeq"
} else {
"icmp eq"
}
}
BinaryOperator::NE => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp one"
} else {
"icmp ne"
}
}
BinaryOperator::Or => "or",
BinaryOperator::And => "and",
BinaryOperator::Xor => "xor",
BinaryOperator::LSh => "shl",
BinaryOperator::RSh => {
if self.function.types[self.function.typing[left.idx()].idx()].is_unsigned()
{
"lshr"
} else {
"ashr"
}
}
};
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, " = {} ", op)?;
self.cpu_emit_use_of_node(left, Some(id), true, &mut bb.data)?;
write!(bb.data, ", ")?;
self.cpu_emit_use_of_node(right, Some(id), false, &mut bb.data)?;
write!(bb.data, "\n")?;
}
Node::Read {
collect,
ref indices,
} => {
if self.function.function.nodes[collect.idx()].is_strictly_control() {
// Read nodes may be projection successors of if or match
// nodes.
let successor = self.function.def_use.get_users(id)[0];
write!(
bb.terminator,
" br label %bb_{}\n",
self.function.bbs[successor.idx()].idx()
)?;
} else {
generate_index_code(collect, indices)?;
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, Some(id), false, &mut bb.data)?;
write!(
bb.data,
" = load {}, ptr %index{}\n",
self.function.llvm_types[self.function.typing[id.idx()].idx()],
id.idx(),
)?;
}
}
Node::Write {
collect,
data,
ref indices,
} => {
generate_index_code(collect, indices)?;
write!(
bb.data,
" store {} ",
self.function.llvm_types[self.function.typing[data.idx()].idx()]
)?;
self.cpu_emit_use_of_node(data, Some(id), false, &mut bb.data)?;
write!(bb.data, ", ptr %index{}\n", id.idx())?;
// We can't just "copy" in LLVM IR, but we want to forward the
// pointer, unchanged, as the "output" of this write node. The
// easiest way to do this is to insert a useless bitcast.
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, " = bitcast ptr ")?;
self.cpu_emit_use_of_node(collect, Some(id), false, &mut bb.data)?;
write!(bb.data, " to ptr\n")?;
}
_ => {
eprintln!("TO LOWER: {:?}", self.function.function.nodes[id.idx()]);
}
}
// If this node is a control return, we emit a return from this
// partition function.
if self.control_returns.contains(&id) {
// Get rid of the old terminator and replace it with a return.
// Don't do this if this node is a join node, since in that case
// we generate special control return logic. See the join node
// codegen above for more details.
if !self.function.function.nodes[id.idx()].is_join() {
bb.terminator.clear();
}
// Making structs from the aggregated values in LLVM IR is a pain.
// We need to, one-by-one, insertvalue each element into the struct.
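// Sketch of the chain for two data outputs (IDs and types hypothetical):
//
//   %ret_agg14.0 = insertvalue { i32, i64 } undef, i32 %virt.9, 0
//   %ret_agg14.1 = insertvalue { i32, i64 } %ret_agg14.0, i64 %virt.12, 1
//   ret { i32, i64 } %ret_agg14.1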
let ret_ty_str = generate_type_string(&self.return_type, &self.function.llvm_types);
for (idx, data_output_id) in self.data_outputs.iter().enumerate() {
write!(
bb.terminator,
" %ret_agg{}.{} = insertvalue {} {}, ",
id.idx(),
idx,
ret_ty_str,
if idx == 0 {
"undef".to_string()
} else {
format!("%ret_agg{}.{}", id.idx(), idx - 1)
}
)?;
let mut data_output_id = *data_output_id;
// Handle reduce specially here. Technically, the "user" here is
// the join node, so cpu_emit_use_of_node would normally emit
// the reduce node's virtual register directly. However, if a
// data output is the result of a reduce node, that use is
// definitely outside of the corresponding fork-join. Thus, we
// actually need to use the reduction use of the reduce node.
// This all only applies if the reduce node is in the current
// partition. If not, then use the reduce node as the argument
// to cpu_emit_use_of_node as normal, so that the partition
// function argument is properly used.
while let Node::Reduce {
control: _,
init: _,
reduct,
} = self.function.function.nodes[data_output_id.idx()]
&& self.partition_id == self.function.plan.partitions[data_output_id.idx()]
{
data_output_id = reduct;
}
self.cpu_emit_use_of_node(data_output_id, None, true, &mut bb.terminator)?;
write!(bb.terminator, ", {}\n", idx)?;
}
// Now, we can return the aggregate value we calculated.
if self.data_outputs.is_empty() && self.control_returns.len() == 1 {
// If there are no data outputs, just return the empty struct.
write!(bb.terminator, " ret {} zeroinitializer\n", ret_ty_str)?;
} else if self.data_outputs.is_empty() {
// If there are multiple control returns, we need to return the
// node ID of the control return, so that the runtime can do
// control flow between partitions. In this case, there aren't
// any data outputs that also need to be returned.
write!(bb.terminator, " %ret_agg{}.ctrl_pos = insertvalue {} undef, i64 {}, 0\n ret {} %ret_agg{}.ctrl_pos\n",
id.idx(),
ret_ty_str,
id.idx(),
ret_ty_str,
id.idx()
)?;
} else if self.control_returns.len() == 1 {
// In the normal case, we return the struct containing just the
// data outputs.
write!(
bb.terminator,
" ret {} %ret_agg{}.{}\n",
ret_ty_str,
id.idx(),
self.data_outputs.len() - 1,
)?;
} else {
// If there are multiple control returns from this partition and
// there are data outputs, we add the control return node ID to
// the return aggregate.
write!(
bb.terminator,
" %ret_agg{}.ctrl_pos = insertvalue {} %ret_agg{}.{}, i64 {}, {}\n ret {} %ret_agg{}.ctrl_pos\n",
id.idx(),
ret_ty_str,
id.idx(),
self.data_outputs.len() - 1,
id.idx(),
self.data_outputs.len(),
ret_ty_str,
id.idx(),
)?;
}
}
Ok(())
}
/*
* Emit the LLVM value corresponding to a node. Optionally prefix with the
* LLVM type, which is required by textual LLVM IR in a few places.
* Optionally provide the node that will be using this emission. The user
* argument is ignored for all emitted node values except reduce nodes,
* which require it. We chose this interface because at the callsite of
* cpu_emit_use_of_node, it is always known whether the thing being emitted
* could (or should) possibly be a reduce node. If not, then passing None
* gives a nice early panic when it does turn out to be a reduce node,
* either because the developer misjudged or because there is a bug.
*/
fn cpu_emit_use_of_node<W: Write>(
&self,
id: NodeID,
user: Option<NodeID>,
emit_type: bool,
w: &mut W,
) -> std::fmt::Result {
// First, emit the type before the value (if applicable).
if emit_type {
write!(
w,
"{} ",
self.function.llvm_types[self.function.typing[id.idx()].idx()]
)?;
}
// Emitting the value can be surprisingly complicated, depending on what
// the node is. For example, partition arguments are emitted specially.
if let Some(input_idx) = self.data_inputs.iter().position(|inp_id| *inp_id == id) {
// If a use is in another partition, it needs to get passed to this
// partition's function as a parameter.
write!(w, "%part_arg.{}", input_idx)?;
} else {
match self.function.function.nodes[id.idx()] {
// Parameter nodes in this partition also represent parameters
// to this partition function.
Node::Parameter { index } => write!(w, "%func_arg.{}", index)?,
// Constants are pre-defined.
Node::Constant { id } => write!(w, "{}", self.function.llvm_constants[id.idx()])?,
Node::DynamicConstant { id } => {
write!(w, "{}", self.function.llvm_dynamic_constants[id.idx()])?
}
// Reduce nodes, as usual, are not nice to handle. We need to
// emit different LLVM depending on whether the user is inside
// or outside the reduce's corresponding fork-join nest. Inside,
// we emit as usual, since the user needs to use the phi node
// inside the reduction loop. Outside, we need to use the reduct
// use of the reduce node, so that we don't grab the reduction
// variable one loop iteration too early.
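// Sketch (register numbers hypothetical): given a reduce lowered to
// "%virt.21 = phi ...", a user inside the fork-join nest sees "%virt.21",
// while a user outside the nest sees the reduct input instead, e.g.
// "%virt.24", which holds the value produced by the final iteration.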
Node::Reduce {
control,
init: _,
reduct,
} => {
// Figure out the fork corresponding to the associated join.
let fork_id = if let Node::Join { control } =
self.function.function.nodes[control.idx()]
{
if let Type::Control(factors) =
&self.function.types[self.function.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!()
}
} else {
panic!()
};
// Check if the basic block containing the user node is in
// the fork-join nest for this reduce node. We make the user
// node an optional argument as a debugging tool - if we
// exercise this code branch when generating the code for a
// node that absolutely should not be using the result of a
// reduce node, we would like to know!
if self.function.fork_join_nest[&self.function.bbs[user.expect("PANIC: cpu_emit_use_of_node was called on a reduce node, but no user node ID was given.").idx()]]
.contains(&fork_id)
{
// If the user is inside the fork-join nest, then emit
// the reduce node directly.
assert_eq!(self.partition_id, self.function.plan.partitions[id.idx()]);
write!(w, "%virt.{}", id.idx())?;
} else {
// If the user is outside the fork-join nest, then
// recursively emit on the reduction input to the reduce
// node. This is needed when there is a reduce chain.
assert_eq!(
self.partition_id,
self.function.plan.partitions[reduct.idx()]
);
self.cpu_emit_use_of_node(reduct, user, emit_type, w)?;
}
}
// Uses that are in this partition are just virtual registers.
// Clang is really annoying about numbering virtual registers,
// so to avoid that silliness we prepend all our virtual
// registers with a prefix indicating what kind of thing it is.
// For normal values, we use "virt" for virtual register.
_ => {
assert_eq!(self.partition_id, self.function.plan.partitions[id.idx()]);
write!(w, "%virt.{}", id.idx())?;
}
}
}
Ok(())
}
}