
Fixes to backend to support matmul

Merged: rarbore2 requested to merge working_matmul into main
14 files changed: +521 −101
File diff shown below: +119 −72
@@ -2,6 +2,7 @@ extern crate bitvec;
use std::collections::{HashMap, VecDeque};
use std::fmt::{Error, Write};
+use std::iter::once;
use self::bitvec::prelude::*;
@@ -30,37 +31,22 @@ use crate::*;
* be identical, even though for both, one of v1 or v2 doesn't dominate the
* PartitionExit. What *should* happen here is that each PartitionExit gets
* lowered to an LLVM `ret`, where the non-dominating output is set to
-* `undef` This works since in the original, un-partitioned, Hercules IR,
+* `undef`. This works since in the original, un-partitioned, Hercules IR,
* defs must dominate uses, so we won't run into a situation where a returned
* `undef` value is actually read. What happens currently is that the
* generated LLVM will `ret` `%v1` and `%v2`, which LLVM won't compile (since
* the code wouldn't be in SSA form). This should get fixed when we start
* compiling more complicated codes.
*
-* 2. We're a bit, "loose", with how certain basic blocks involving fork-joins
-* are handled. In particular, the following things need to be handled
-* properly, but aren't yet:
-* - Phis in a block with a reduce block predecessor need to generate LLVM
-* phis that actually depend on the top parallel block of that fork-join,
-* not the reduce block. This is because we hijack the top block to be the
-* loop header, rather than use the reduce block to be the loop header.
-* - The above also applies in the case of phis generated from thread IDs and
-* reduction variables inside the top parallel block of a fork-join. This
-* case occurs when there is a parallel-reduce section inside another
-* parallel-reduce section.
-* - The above also applies in the case of a parallel-reduce section
-* immediately following another parallel-reduce section (a reduce block
-* jumps to the top parallel block of another parallel-reduce section).
-*
-* 3. Handle >= 3D fork-joins and array accesses. This isn't conceptually
+* 2. Handle >= 3D fork-joins and array accesses. This isn't conceptually
* difficult, but generating the LLVM code to implement these is annoying.
*
-* 4. Handle ABI properly when taking in / returning structs taking moret han 16
+* 3. Handle ABI properly when taking in / returning structs taking more than 16
* bytes. When a passed / returned struct takes more than 16 bytes, it needs
* to be passed around via pointers. This is one of many platform specific C
* ABI rules we need to handle to be properly called from Rust (that 16 byte
* rule is actually x86-64 specific). I'm honestly not sure how to handle
-* this well. We avoid running into the manifestation of this problem by for
+* this well. We avoid running into the manifestation of this problem for
* some samples by removing unneeded parameters / return values from
* partitions at the schedule IR level, which we should do anyway, but this
* isn't a complete solution.
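
To make the 16-byte rule in item 3 concrete, here is a minimal standalone sketch (type and function names invented, not from this MR) of the kind of aggregate that triggers it under the x86-64 SysV C ABI:

// Hypothetical 24-byte aggregate; anything over 16 bytes is classified
// MEMORY by the x86-64 SysV ABI and is returned via a hidden pointer.
#[repr(C)]
struct Big {
    a: u64,
    b: u64,
    c: u64,
}

// The source-level signature returns by value, but at the ABI level this
// lowers to roughly `void make_big(Big *out)` (LLVM's `sret` parameter).
extern "C" fn make_big() -> Big {
    Big { a: 1, b: 2, c: 3 }
}

fn main() {
    assert_eq!(std::mem::size_of::<Big>(), 24);
    let big = make_big();
    assert_eq!(big.a + big.b + big.c, 6);
}

A backend that returns such a struct in registers, as this one currently does, produces code that a conforming C or Rust caller cannot link against correctly, which is why the workaround of pruning unneeded parameters and return values only sidesteps the issue.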
@@ -81,6 +67,61 @@ pub fn cpu_compile<W: Write>(
let svalue_types = sched_svalue_types(function);
let parallel_reduce_infos = sched_parallel_reduce_sections(function);
+    // Calculate the names of each block. For blocks that are the top or bottom
+    // blocks of sequential fork-joins, references outside the fork-join
+    // actually need to refer to the header block. This is a bit complicated to
+    // handle, and we use these names in several places, so pre-calculate the
+    // block names. Intuitively, if we are "inside" a sequential fork-join,
+    // references to the top or bottom blocks actually refer to those blocks,
+    // while if we are "outside" the sequential fork-join, references to both
+    // the top or bottom blocks actually refer to the loop header block.
+    let mut block_names = HashMap::new();
+    for (block_idx, block) in function.blocks.iter().enumerate() {
+        for fork_join_id in parallel_reduce_infos
+            .keys()
+            .map(|id| Some(*id))
+            .chain(once(None))
+        {
+            let block_id = BlockID::new(block_idx);
+            let possible_parent = block.kind.try_fork_join_id();
+            let mut walk = fork_join_id;
+            // Check if the location in the key is "inside" the location of the
+            // block.
+            let is_inside = if let Some(parent) = possible_parent {
+                loop {
+                    if let Some(step) = walk {
+                        if step == parent {
+                            // If we see the block's location, then the key
+                            // location is "inside".
+                            break true;
+                        } else {
+                            // Walk the parent until we find something
+                            // interesting.
+                            walk = parallel_reduce_infos[&step].parent_fork_join_id;
+                        }
+                    } else {
+                        // If we don't find the block, then the key location is
+                        // "outside" the block's parallel-reduce.
+                        break false;
+                    }
+                }
+            } else {
+                // Every location is "inside" the top level sequential section.
+                true
+            };
+            if is_inside {
+                block_names.insert((block_id, fork_join_id), format!("bb_{}", block_idx));
+            } else {
+                block_names.insert(
+                    (block_id, fork_join_id),
+                    format!("fork_join_seq_header_{}", possible_parent.unwrap().idx()),
+                );
+            }
+        }
+    }
// Generate a dummy uninitialized global - this is needed so that there'll
// be a non-empty .bss section in the ELF object file.
write!(
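
As a standalone illustration of the naming scheme computed above (block and fork-join IDs are hypothetical, and the ID types below are stand-ins for the real Hercules types): the same block gets one entry per viewpoint, and only the "outside" viewpoint is redirected to the loop header.

use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct BlockID(usize);
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct ForkJoinID(usize);

fn main() {
    let mut block_names: HashMap<(BlockID, Option<ForkJoinID>), String> = HashMap::new();
    // Suppose block 4 is the top parallel block of sequential fork-join 0.
    // From inside fork-join 0, references to block 4 mean the block itself.
    block_names.insert((BlockID(4), Some(ForkJoinID(0))), format!("bb_{}", 4));
    // From outside (the top-level sequential section, keyed by None),
    // references to block 4 must instead target the emitted loop header.
    block_names.insert((BlockID(4), None), format!("fork_join_seq_header_{}", 0));
    assert_eq!(block_names[&(BlockID(4), Some(ForkJoinID(0)))], "bb_4");
    assert_eq!(block_names[&(BlockID(4), None)], "fork_join_seq_header_0");
}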
@@ -115,11 +156,8 @@ pub fn cpu_compile<W: Write>(
// Emit the function body. Emit each block, one at a time.
for (block_idx, block) in function.blocks.iter().enumerate() {
-// Emit the header for the block.
-write!(w, "bb_{}:\n", block_idx)?;
-// For "tops" of sequential fork-joins, we hijack to the top block to be
-// the loop header for the fork-join loop.
+// For "tops" of sequential fork-joins, we emit a special top block to
+// be the loop header for the fork-join loop.
if let Some(fork_join_id) = block.kind.try_parallel()
&& parallel_reduce_infos[&fork_join_id]
.top_parallel_block
@@ -127,13 +165,22 @@ pub fn cpu_compile<W: Write>(
== block_idx
{
emit_fork_join_seq_header(
fork_join_id,
&parallel_reduce_infos[&fork_join_id],
&svalue_types,
+&block_names,
block_idx,
w,
)?;
}
+// Emit the header for the block.
+write!(
+w,
+"{}:\n",
+&block_names[&(BlockID::new(block_idx), block.kind.try_fork_join_id())]
+)?;
// For each basic block, emit instructions in that block. Emit using a
// worklist over the dependency graph.
let mut emitted = bitvec![u8, Lsb0; 0; block.insts.len()];
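
The worklist discipline mentioned in the comment above, reduced to a self-contained sketch (invented types; the real code tracks emission with a bitvec and calls emit_inst): an instruction is deferred until every in-block instruction it depends on has been emitted, which terminates assuming the intra-block dependence graph is acyclic.

use std::collections::VecDeque;

// deps[i] lists the in-block instructions that instruction i reads.
fn emission_order(deps: &[Vec<usize>]) -> Vec<usize> {
    let mut emitted = vec![false; deps.len()];
    let mut order = Vec::new();
    let mut worklist: VecDeque<usize> = (0..deps.len()).collect();
    while let Some(inst) = worklist.pop_front() {
        if deps[inst].iter().all(|&d| emitted[d]) {
            emitted[inst] = true;
            order.push(inst); // Stands in for emit_inst(...).
        } else {
            worklist.push_back(inst); // Dependencies not ready; retry later.
        }
    }
    order
}

fn main() {
    // Instruction 0 reads 1, and 1 reads 2, so emission order is reversed.
    assert_eq!(emission_order(&[vec![1], vec![2], vec![]]), vec![2, 1, 0]);
}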
@@ -152,10 +199,12 @@ pub fn cpu_compile<W: Write>(
emit_inst(
block.virt_regs[inst_id.idx_1()].0,
&block.insts[inst_idx],
+block.kind.try_fork_join_id(),
block
.kind
.try_fork_join_id()
.map(|fork_join_id| &parallel_reduce_infos[&fork_join_id]),
+&block_names,
&svalue_types,
w,
)?;
@@ -271,7 +320,9 @@ fn emit_svalue<W: Write>(
fn emit_inst<W: Write>(
virt_reg: usize,
inst: &SInst,
+location: Option<ForkJoinID>,
parallel_reduce_info: Option<&ParallelReduceInfo>,
+block_names: &HashMap<(BlockID, Option<ForkJoinID>), String>,
types: &HashMap<SValue, SType>,
w: &mut W,
) -> Result<(), Error> {
@@ -295,17 +346,19 @@ fn emit_inst<W: Write>(
Some((pred_block_id, svalue)) => {
write!(w, "[ ")?;
emit_svalue(svalue, false, types, w)?;
write!(w, ", %bb_{} ]", pred_block_id.idx())?;
write!(w, ", %{} ]", &block_names[&(*pred_block_id, location)])?;
Ok(())
}
None => write!(w, ", "),
})
.collect::<Result<(), Error>>()?;
}
-SInst::ThreadID { dimension } => {
-let block = parallel_reduce_info.unwrap().top_parallel_block;
+SInst::ThreadID {
+dimension,
+fork_join,
+} => {
emit_assign(w)?;
write!(w, "add i64 0, %thread_id_{}_{}", block.idx(), dimension)?;
write!(w, "add i64 0, %thread_id_{}_{}", fork_join.idx(), dimension)?;
}
SInst::ReductionVariable { number } => {
write!(w, "; Already emitted reduction variable #{number}.")?;
@@ -318,11 +371,11 @@ fn emit_inst<W: Write>(
if reduce_exit.is_some() {
write!(
w,
"br label %bb_{}",
parallel_reduce_info.unwrap().top_parallel_block.idx()
"br label %fork_join_seq_header_{}",
location.unwrap().idx(),
)?;
} else {
write!(w, "br label %bb_{}", target.idx())?;
write!(w, "br label %{}", &block_names[&(*target, location)])?;
}
}
SInst::Branch {
@@ -334,9 +387,9 @@ fn emit_inst<W: Write>(
emit_svalue(cond, true, types, w)?;
write!(
w,
", label %bb_{}, label %bb_{}",
true_target.idx(),
false_target.idx()
", label %{}, label %{}",
&block_names[&(*true_target, location)],
&block_names[&(*false_target, location)],
)?;
}
SInst::PartitionExit { data_outputs } => {
@@ -444,7 +497,7 @@ fn emit_inst<W: Write>(
} => {
emit_linear_index_calc(virt_reg, position, bounds, types, w)?;
write!(w, "%store_ptr_{} = getelementptr ", virt_reg)?;
-emit_type(&types[&self_svalue], w)?;
+emit_type(&types[value], w)?;
write!(w, ", ")?;
emit_svalue(array, true, types, w)?;
write!(w, ", i64 %calc_linear_idx_{}\n ", virt_reg)?;
@@ -463,39 +516,33 @@ fn emit_inst<W: Write>(
* Emit the loop header implementing a sequential fork-join.
*/
fn emit_fork_join_seq_header<W: Write>(
fork_join_id: ForkJoinID,
info: &ParallelReduceInfo,
types: &HashMap<SValue, SType>,
+block_names: &HashMap<(BlockID, Option<ForkJoinID>), String>,
block_idx: usize,
w: &mut W,
) -> Result<(), Error> {
-// Start the header of the loop.
-write!(w, "fork_join_seq_header_{}:\n", fork_join_id.idx())?;
// Emit the phis for the linear loop index variable and the reduction
// variables.
-// TODO: handle these cases:
-// 1. A parallel-reduce section is nested inside another parallel-reduce
-// section. If the predecessor block is itself a top parallel block, the
-// predecessor needs to be the _body block, not the original block.
-// 2. A parallel-reduce section is immediately followed by another parallel-
-// reduce section. If the predecessor block is itself a reduce block, the
-// predecessor needs to be the top parallel block of the previous
-// parallel-reduce section, not its reduce block.
-let entry_pred = info.predecessor;
-let loop_pred = info.reduce_block;
+let entry_name = &block_names[&(info.predecessor, Some(fork_join_id))];
+let loop_name = &block_names[&(info.reduce_block, Some(fork_join_id))];
write!(
w,
" %linear_{} = phi i64 [ 0, %bb_{} ], [ %linear_{}_inc, %bb_{} ]\n",
block_idx,
entry_pred.idx(),
block_idx,
loop_pred.idx(),
" %linear_{} = phi i64 [ 0, %{} ], [ %linear_{}_inc, %{} ]\n",
block_idx, entry_name, block_idx, loop_name,
)?;
for (var_num, virt_reg) in info.reduction_variables.iter() {
write!(w, " %v{} = phi ", virt_reg)?;
emit_type(&types[&SValue::VirtualRegister(*virt_reg)], w)?;
write!(w, " [ ")?;
emit_svalue(&info.reduce_inits[*var_num], false, types, w)?;
write!(w, ", %bb_{} ], [ ", entry_pred.idx())?;
write!(w, ", %{} ], [ ", entry_name)?;
emit_svalue(&info.reduce_reducts[*var_num], false, types, w)?;
write!(w, ", %bb_{} ]\n", loop_pred.idx())?;
write!(w, ", %{} ]\n", loop_name)?;
}
// Calculate the loop bounds.
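
For reference, the header this function now produces for a 1D sequential fork-join with a single i64 reduction has roughly the following shape. All block names, registers, and the zero init below are invented for illustration; the real values come from block_names, the reduction's virtual registers, and reduce_inits/reduce_reducts.

use std::fmt::{Error, Write};

fn sketch_seq_header<W: Write>(w: &mut W) -> Result<(), Error> {
    // The label itself is emitted by the caller via block_names.
    write!(w, "fork_join_seq_header_0:\n")?;
    // Linear induction variable: 0 from the entry predecessor, the
    // incremented copy from the reduce block on the back edge.
    write!(w, "  %linear_3 = phi i64 [ 0, %bb_2 ], [ %linear_3_inc, %bb_5 ]\n")?;
    // One phi per reduction variable: init on entry, reduct on the back edge.
    write!(w, "  %v9 = phi i64 [ 0, %bb_2 ], [ %v12, %bb_5 ]\n")?;
    Ok(())
}

fn main() {
    let mut s = String::new();
    sketch_seq_header(&mut s).unwrap();
    print!("{}", s);
}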
@@ -513,42 +560,28 @@ fn emit_fork_join_seq_header<W: Write>(
todo!("TODO: Handle the 3 or more dimensional fork-join case.")
}
-// Emit the branch.
-write!(
-w,
-" %cond_{} = icmp ult i64 %linear_{}, %bound_{}\n",
-block_idx, block_idx, block_idx
-)?;
-write!(
-w,
-" br i1 %cond_{}, label %bb_{}_body, label %bb_{}\n",
-block_idx,
-block_idx,
-info.successor.idx()
-)?;
-// Start the body of the loop.
-write!(w, "bb_{}_body:\n", block_idx)?;
// Calculate the multi-dimensional thread indices.
if info.thread_counts.len() == 1 {
write!(
w,
" %thread_id_{}_0 = add i64 0, %linear_{}\n",
-block_idx, block_idx
+fork_join_id.idx(),
+block_idx
)?;
} else if info.thread_counts.len() == 2 {
write!(
w,
" %thread_id_{}_0 = udiv i64 %linear_{}, ",
-block_idx, block_idx
+fork_join_id.idx(),
+block_idx
)?;
emit_svalue(&info.thread_counts[1], false, types, w)?;
write!(w, "\n")?;
write!(
w,
" %thread_id_{}_1 = urem i64 %linear_{}, ",
-block_idx, block_idx
+fork_join_id.idx(),
+block_idx
)?;
emit_svalue(&info.thread_counts[1], false, types, w)?;
write!(w, "\n")?;
@@ -563,6 +596,20 @@ fn emit_fork_join_seq_header<W: Write>(
block_idx, block_idx
)?;
+// Emit the branch.
+write!(
+w,
+" %cond_{} = icmp ult i64 %linear_{}, %bound_{}\n",
+block_idx, block_idx, block_idx
+)?;
+let top_name = &block_names[&(BlockID::new(block_idx), Some(fork_join_id))];
+let succ_name = &block_names[&(info.successor, Some(fork_join_id))];
+write!(
+w,
+" br i1 %cond_{}, label %{}, label %{}\n",
+block_idx, top_name, succ_name
+)?;
Ok(())
}