Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • llvm/hercules
1 result
Show changes
Commits on Source (6)
Showing
with 484 additions and 223 deletions
/target
**/target
*.dot
!paper_resources/*.dot
*.bc
......@@ -14,3 +14,4 @@
.vscode
*_env
*.txt
*ncu-rep
\ No newline at end of file
......@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
// because Fork basic block's init section already does gating
write!(
w,
"{}{} = (threadIdx.x % {}) / {};\n",
"{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
tabs,
define_variable,
use_thread_quota.unwrap(),
use_thread_quota.unwrap() / parallel_factor.unwrap()
use_thread_quota.unwrap() / parallel_factor.unwrap(),
divide,
modulo,
)?;
}
}
......
......@@ -716,8 +716,10 @@ fn typeflow(
// Check number of run-time arguments.
if inputs.len() - 1 != callee.param_types.len() {
return Error(format!(
"Call node has {} inputs, but calls a function with {} parameters.",
"Call node in {} has {} inputs, but calls a function ({}) with {} parameters.",
function.name,
inputs.len() - 1,
callee.name,
callee.param_types.len(),
));
}
......@@ -725,8 +727,10 @@ fn typeflow(
// Check number of dynamic constant arguments.
if dc_args.len() != callee.num_dynamic_constants as usize {
return Error(format!(
"Call node references {} dynamic constants, but calls a function expecting {} dynamic constants.",
"Call node in {} references {} dynamic constants, but calls a function ({}) expecting {} dynamic constants.",
function.name,
dc_args.len(),
callee.name,
callee.num_dynamic_constants
));
}
......
......@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Rem,
});
edit.sub_edit(tid, rem);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, rem)?;
} else if tid_dim == inner_idx {
let outer_tid = Node::ThreadID {
......@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Div,
});
edit.sub_edit(tid, div);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, div)?;
}
}
......@@ -1479,7 +1481,12 @@ fn fork_fusion(
}
// Perform the fusion.
let bottom_tids: Vec<_> = editor
.get_users(bottom_fork)
.filter(|id| nodes[id.idx()].is_thread_id())
.collect();
editor.edit(|mut edit| {
edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
if bottom_join_pred != bottom_fork {
// If there is control flow in the bottom fork-join, stitch it into
// the top fork-join.
......
use std::collections::HashMap;
use std::iter::zip;
use hercules_ir::ir::*;
use crate::*;
......@@ -39,15 +36,30 @@ pub fn interprocedural_sroa(
}
let editor: &mut FunctionEditor = &mut editors[func_id.idx()];
let param_types = &editor.func().param_types.to_vec();
let return_types = &editor.func().return_types.to_vec();
// We determine the new return types of the function and track a map
// that tells us how the old return values are constructed from the
// new ones
// We determine the new param/return types of the function and track a
// map that tells us how the old param/return values are constructed
// from the new ones.
let mut new_param_types = vec![];
let mut old_param_type_map = vec![];
let mut new_return_types = vec![];
let mut old_return_type_map = vec![];
let mut changed = false;
for par_typ in param_types.iter() {
if !can_sroa_type(editor, *par_typ) {
old_param_type_map.push(IndexTree::Leaf(new_param_types.len()));
new_param_types.push(*par_typ);
} else {
let (types, index) = sroa_type(editor, *par_typ, new_param_types.len());
old_param_type_map.push(index);
new_param_types.extend(types);
changed = true;
}
}
for ret_typ in return_types.iter() {
if !can_sroa_type(editor, *ret_typ) {
old_return_type_map.push(IndexTree::Leaf(new_return_types.len()));
......@@ -60,25 +72,54 @@ pub fn interprocedural_sroa(
}
}
// If the return type is not changed by IP SROA, skip to the next function
// If the param/return types aren't changed by IP SROA, skip to the next
// function.
if !changed {
continue;
}
// Now, modify each return in the current function and the return type
let return_nodes = editor
.func()
.nodes
.iter()
.enumerate()
.filter_map(|(idx, node)| {
if node.try_return().is_some() {
Some(NodeID::new(idx))
// Modify each parameter in the current function and the param types.
let mut param_nodes: Vec<_> = vec![vec![]; param_types.len()];
for id in editor.node_ids() {
if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() {
param_nodes[idx].push(id);
}
}
let success = editor.edit(|mut edit| {
for (idx, ids) in param_nodes.into_iter().enumerate() {
let new_indices = &old_param_type_map[idx];
let built = if let IndexTree::Leaf(new_idx) = new_indices {
edit.add_node(Node::Parameter { index: *new_idx })
} else {
None
let prod_ty = param_types[idx];
let cons = edit.add_zero_constant(prod_ty);
let mut cons = edit.add_node(Node::Constant { id: cons });
new_indices.for_each(|idx: &Vec<Index>, param_idx: &usize| {
let param = edit.add_node(Node::Parameter { index: *param_idx });
cons = edit.add_node(Node::Write {
collect: cons,
data: param,
indices: idx.clone().into_boxed_slice(),
});
});
cons
};
for id in ids {
edit = edit.replace_all_uses(id, built)?;
edit = edit.delete_node(id)?;
}
})
.collect::<Vec<_>>();
}
edit.set_param_types(new_param_types);
Ok(edit)
});
assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");
// Modify each return in the current function and the return types.
let return_nodes: Vec<_> = editor
.node_ids()
.filter(|id| editor.func().nodes[id.idx()].is_return())
.collect();
let success = editor.edit(|mut edit| {
for node in return_nodes {
let Node::Return { control, data } = edit.get_node(node) else {
......@@ -114,17 +155,15 @@ pub fn interprocedural_sroa(
}
edit.set_return_types(new_return_types);
Ok(edit)
});
assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");
// Finally, update calls of this function
// In particular, we actually don't have to update the call node at all but have to update
// its DataProjection users
// Finally, update calls of this function.
for (caller, callsite) in callsites {
let editor = &mut editors[caller.idx()];
assert!(editor.func_id() == caller);
let projs = editor.get_users(callsite).collect::<Vec<_>>();
for proj_id in projs {
let Node::DataProjection { data: _, selection } = editor.node(proj_id) else {
......@@ -134,6 +173,40 @@ pub fn interprocedural_sroa(
let typ = types[caller.idx()][proj_id.idx()];
replace_returned_value(editor, proj_id, typ, new_return_info, callsite);
}
let (control, callee, dc_args, args) =
editor.func().nodes[callsite.idx()].try_call().unwrap();
let dc_args = dc_args.clone();
let args = args.clone();
let success = editor.edit(|mut edit| {
let mut new_args = vec![];
for (idx, (data_id, update_info)) in
args.iter().zip(old_param_type_map.iter()).enumerate()
{
if let IndexTree::Leaf(new_idx) = update_info {
// Unchanged parameter value
assert!(new_args.len() == *new_idx);
new_args.push(*data_id);
} else {
// SROA'd parameter value
let reads = generate_reads_edit(&mut edit, param_types[idx], *data_id);
reads.zip(update_info).for_each(|_, (read_id, ret_idx)| {
assert!(new_args.len() == **ret_idx);
new_args.push(*read_id);
});
}
}
let new_call = edit.add_node(Node::Call {
control,
function: callee,
dynamic_constants: dc_args,
args: new_args.into_boxed_slice(),
});
edit = edit.replace_all_uses(callsite, new_call)?;
edit = edit.delete_node(callsite)?;
Ok(edit)
});
assert!(success);
}
}
}
......
......@@ -447,7 +447,7 @@ pub fn sroa(
field_map.insert(node, generate_reads(editor, types[&node], node));
}
Node::Constant { id } => {
field_map.insert(node, generate_constant_fields(editor, id));
field_map.insert(node, generate_constant_fields(editor, id, node));
to_delete.push(node);
}
_ => {
......@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID
// Given a constant cnst adds node to the function which are the constant values of each field and
// returns a list of pairs of indices and the node that holds that index
fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> {
fn generate_constant_fields(
editor: &mut FunctionEditor,
cnst: ConstantID,
old_node: NodeID,
) -> IndexTree<NodeID> {
let cs: Option<Vec<ConstantID>> =
if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
Some(cs.into())
......@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
if let Some(cs) = cs {
let mut fields = vec![];
for c in cs {
fields.push(generate_constant_fields(editor, c));
fields.push(generate_constant_fields(editor, c, old_node));
}
IndexTree::Node(fields)
} else {
let mut node = None;
editor.edit(|mut edit| {
node = Some(edit.add_node(Node::Constant { id: cnst }));
edit.sub_edit(old_node, node.unwrap());
Ok(edit)
});
IndexTree::Leaf(node.expect("Add node cannot fail"))
......
......@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@image_loop for r = 0 to row {
for c = 0 to col {
@l2 let l2_dist : f32[num_ctrl_pts];
for cp = 0 to num_ctrl_pts {
@cp_loop for cp = 0 to num_ctrl_pts {
let v1 = input[0, r, c] - ctrl_pts[cp, 0];
let v2 = input[1, r, c] - ctrl_pts[cp, 1];
let v3 = input[2, r, c] - ctrl_pts[cp, 2];
......@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@channel_loop for chan = 0 to CHAN {
let chan_val : f32 = 0.0;
for cp = 0 to num_ctrl_pts {
@cp_loop for cp = 0 to num_ctrl_pts {
chan_val += l2_dist[cp] * weights[cp, chan];
}
......
......@@ -115,7 +115,7 @@ array-slf(fuse4);
simpl!(fuse4);
let par = fuse4@image_loop \ fuse4@channel_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let fuse4_body = outline(split.cava_3.fj2);
......
......@@ -117,9 +117,9 @@ fixpoint {
simpl!(fuse4);
array-slf(fuse4);
simpl!(fuse4);
//fork-tile[2, 0, false, true](fuse4@channel_loop);
//fork-split(fuse4@channel_loop);
//clean-monoid-reduces(fuse4);
fork-tile[2, 0, false, true](fuse4@channel_loop);
let out = fork-split(fuse4@channel_loop);
fork-unroll(out.cava_3.fj1);
unforkify(fuse4@channel_loop);
no-memset(fuse5@res1);
......@@ -133,6 +133,13 @@ simpl!(fuse5);
array-slf(fuse5);
simpl!(fuse5);
fork-tile[4, 1, false, true](fuse4);
fork-tile[8, 0, false, true](fuse4);
fork-interchange[1, 2](fuse4);
let split = fork-split(fuse4);
fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
fork-coalesce(split.cava_3.fj2);
delete-uncalled(*);
simpl!(*);
......
......@@ -124,9 +124,9 @@ pub struct CavaInputs {
#[clap(long = "output-verify", value_name = "PATH")]
pub output_verify: Option<String>,
pub cam_model: String,
#[clap(short, long)]
#[clap(long)]
pub crop_rows: Option<usize>,
#[clap(short, long)]
#[clap(long)]
pub crop_cols: Option<usize>,
}
......
......@@ -26,14 +26,14 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing);
let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
simpl!(gaussian_smoothing, gaussian_smoothing_body);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
no-memset(laplacian_estimate@res);
fixpoint {
forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate);
......@@ -42,15 +42,15 @@ fixpoint {
simpl!(laplacian_estimate);
let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
let body = split._1_laplacian_estimate.fj2;
let laplacian_estimate_body = outline(body);
fork-coalesce(laplacian_estimate, laplacian_estimate_body);
simpl!(laplacian_estimate, laplacian_estimate_body);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
no-memset(zero_crossings@res);
fixpoint {
forkify(zero_crossings);
fork-guard-elim(zero_crossings);
......@@ -59,10 +59,10 @@ fixpoint {
simpl!(zero_crossings);
let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par);
fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par);
let split = fork-split(par);
let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
let body = split._2_zero_crossings.fj2;
let zero_crossings_body = outline(body);
fork-coalesce(zero_crossings, zero_crossings_body);
simpl!(zero_crossings, zero_crossings_body);
......@@ -86,7 +86,7 @@ fixpoint {
simpl!(max_gradient);
fork-dim-merge(max_gradient);
simpl!(max_gradient);
fork-tile[16, 0, false, false](max_gradient);
fork-tile[32, 0, false, false](max_gradient);
let split = fork-split(max_gradient);
clean-monoid-reduces(max_gradient);
let out = outline(split._4_max_gradient.fj1);
......@@ -105,7 +105,7 @@ fixpoint {
predication(reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-tile[4, 1, false, false](reject_zero_crossings);
fork-tile[4, 0, false, false](reject_zero_crossings);
fork-tile[8, 0, false, false](reject_zero_crossings);
fork-interchange[1, 2](reject_zero_crossings);
let split = fork-split(reject_zero_crossings);
let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
......
......@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(
@image_loop for row = 0 to n {
for col = 0 to m {
// Copy data for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else input[row + i - r, col + j - r];
}
}
// Compute pixel of dilated image
let dilated = MIN_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
}
}
// Data copy for erotion filter
@shr2 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else input[row + i - r, col + j - r];
dilated = max!(dilated, filter * structure[i, j]);
}
}
......@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
let eroded = MAX_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
eroded = min!(eroded, filter * structure[i, j]);
}
}
......@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(
@image_loop for row = 0 to n {
for col = 0 to m {
// Data copy for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
}
}
// Compute the pixel of dilated image
let dilated = MIN_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
}
}
// Data copy for erotion filter
@shr2 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
dilated = max!(dilated, filter * structure[i, j]);
}
}
......@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
let eroded = MAX_BR;
@filter_loop for i = 0 to sz {
for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
eroded = min!(eroded, filter * structure[i, j]);
}
}
......@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>(
let gx = 0;
let gy = 0;
for i = 0 to sb {
@filter_loop for i = 0 to sb {
for j = 0 to sb {
let val = input[if row + i < sbr then 0
else if row + i - sbr > n - 1 then n - 1
......
......@@ -26,22 +26,49 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing);
predication(gaussian_smoothing);
simpl!(gaussian_smoothing);
fork-dim-merge(gaussian_smoothing@filter_loop);
unforkify(gaussian_smoothing@filter_loop);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
fork-dim-merge(gaussian_smoothing);
fork-tile[32, 0, false, true](gaussian_smoothing);
simpl!(gaussian_smoothing);
fork-split(gaussian_smoothing);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res);
fixpoint {
forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate);
fork-coalesce(laplacian_estimate);
}
simpl!(laplacian_estimate);
fork-dim-merge(laplacian_estimate@filter_loop);
unforkify(laplacian_estimate@filter_loop);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
fork-dim-merge(laplacian_estimate);
fork-tile[32, 0, false, true](laplacian_estimate);
simpl!(laplacian_estimate);
fork-split(laplacian_estimate);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res);
fixpoint {
forkify(zero_crossings);
fork-guard-elim(zero_crossings);
fork-coalesce(zero_crossings);
}
simpl!(zero_crossings);
fork-dim-merge(zero_crossings@filter_loop);
unforkify(zero_crossings@filter_loop);
simpl!(zero_crossings);
fork-dim-merge(zero_crossings);
fork-tile[32, 0, false, true](zero_crossings);
simpl!(zero_crossings);
fork-split(zero_crossings);
simpl!(zero_crossings);
no-memset(gradient@res);
fixpoint {
......@@ -53,6 +80,15 @@ predication(gradient);
simpl!(gradient);
predication(gradient);
simpl!(gradient);
fork-dim-merge(gradient@filter_loop);
unforkify(gradient@filter_loop);
simpl!(gradient);
fork-dim-merge(gradient);
fork-tile[32, 0, false, true](gradient);
simpl!(gradient);
fork-split(gradient);
simpl!(gradient);
fixpoint {
forkify(max_gradient);
......@@ -88,6 +124,12 @@ fixpoint {
predication(reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-dim-merge(reject_zero_crossings);
fork-tile[32, 0, false, true](reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-split(reject_zero_crossings);
simpl!(reject_zero_crossings);
async-call(edge_detection@le, edge_detection@zc);
simpl!(*);
......
use juno_build::JunoCompiler;
fn main() {
#[cfg(not(feature = "cuda"))]
{
JunoCompiler::new()
.file_in_src("matmul.jn")
.unwrap()
.schedule_in_src("cpu.sch")
.unwrap()
.build()
.unwrap();
}
#[cfg(feature = "cuda")]
{
JunoCompiler::new()
.file_in_src("matmul.jn")
.unwrap()
.schedule_in_src("gpu.sch")
.unwrap()
.build()
.unwrap();
}
JunoCompiler::new()
.file_in_src("matmul.jn")
.unwrap()
.schedule_in_src("matmul.sch")
.unwrap()
.build()
.unwrap();
}
phi-elim(*);
forkify(*);
fork-guard-elim(*);
dce(*);
fixpoint {
reduce-slf(*);
slf(*);
infer-schedules(*);
}
fork-coalesce(*);
infer-schedules(*);
dce(*);
rewrite(*);
fixpoint {
simplify-cfg(*);
dce(*);
}
ip-sroa(*);
sroa(*);
dce(*);
float-collections(*);
gcm(*);
......@@ -41,21 +41,41 @@ macro unforkify!(X) {
optimize!(*);
forkify!(*);
associative(matmul@outer);
// Parallelize by computing output array as 16 chunks
let par = matmul@outer \ matmul@inner;
fork-tile![4](par);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
parallelize!(outer \ inner);
if feature("cuda") {
fixpoint {
reduce-slf(*);
slf(*);
infer-schedules(*);
}
fork-coalesce(*);
infer-schedules(*);
dce(*);
rewrite(*);
fixpoint {
simplify-cfg(*);
dce(*);
}
let body = outline(inner);
cpu(body);
optimize!(*);
codegen-prep!(*);
} else {
associative(matmul@outer);
// Tile for cache, assuming 64B cache lines
fork-tile![16](body);
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
// Parallelize by computing output array as 16 chunks
let par = matmul@outer \ matmul@inner;
fork-tile![4](par);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
parallelize!(outer \ inner);
reduce-slf(inner);
unforkify!(body);
codegen-prep!(*);
let body = outline(inner);
cpu(body);
// Tile for cache, assuming 64B cache lines
fork-tile![16](body);
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
reduce-slf(inner);
unforkify!(body);
codegen-prep!(*);
}
......@@ -28,40 +28,55 @@ fn cfd_bench(c: &mut Criterion) {
elements_surrounding_elements,
normals,
} = read_domain_geometry(data_file, block_size);
let mut variables = initialize_variables(nelr, ff_variable.as_slice());
let mut variables = HerculesMutBox::from(variables.as_mut_slice());
let mut variables = initialize_variables(nelr, &ff_variable);
let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
let areas = HerculesImmBox::from(areas.as_slice());
let elements_surrounding_elements =
HerculesImmBox::from(elements_surrounding_elements.as_slice());
let normals = HerculesImmBox::from(normals.as_slice());
let ff_variable = HerculesImmBox::from(ff_variable.as_slice());
let ff_fc_density_energy = vec![
ff_fc_density_energy.x,
ff_fc_density_energy.y,
ff_fc_density_energy.z,
];
let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
let normals_x = HerculesImmBox::from(normals.x.as_slice());
let normals_y = HerculesImmBox::from(normals.y.as_slice());
let normals_z = HerculesImmBox::from(normals.z.as_slice());
group.bench_function("cfd bench euler", |b| {
b.iter(|| {
async_std::task::block_on(async {
r.run(
nelr as u64,
iterations as u64,
variables.to(),
v_density.to(),
v_momentum_x.to(),
v_momentum_y.to(),
v_momentum_z.to(),
v_energy.to(),
areas.to(),
elements_surrounding_elements.to(),
normals.to(),
ff_variable.to(),
ff_fc_density_energy.to(),
ff_fc_momentum_x.to(),
ff_fc_momentum_y.to(),
ff_fc_momentum_z.to(),
normals_x.to(),
normals_y.to(),
normals_z.to(),
ff_variable.density,
ff_variable.momentum.x,
ff_variable.momentum.y,
ff_variable.momentum.z,
ff_variable.energy,
ff_fc_density_energy.x,
ff_fc_density_energy.y,
ff_fc_density_energy.z,
ff_fc_momentum_x.x,
ff_fc_momentum_x.y,
ff_fc_momentum_x.z,
ff_fc_momentum_y.x,
ff_fc_momentum_y.y,
ff_fc_momentum_y.z,
ff_fc_momentum_z.x,
ff_fc_momentum_z.y,
ff_fc_momentum_z.z,
)
.await
});
......@@ -85,40 +100,55 @@ fn cfd_bench(c: &mut Criterion) {
elements_surrounding_elements,
normals,
} = read_domain_geometry(data_file, block_size);
let mut variables = initialize_variables(nelr, ff_variable.as_slice());
let mut variables = HerculesMutBox::from(variables.as_mut_slice());
let mut variables = initialize_variables(nelr, &ff_variable);
let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
let areas = HerculesImmBox::from(areas.as_slice());
let elements_surrounding_elements =
HerculesImmBox::from(elements_surrounding_elements.as_slice());
let normals = HerculesImmBox::from(normals.as_slice());
let ff_variable = HerculesImmBox::from(ff_variable.as_slice());
let ff_fc_density_energy = vec![
ff_fc_density_energy.x,
ff_fc_density_energy.y,
ff_fc_density_energy.z,
];
let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
let normals_x = HerculesImmBox::from(normals.x.as_slice());
let normals_y = HerculesImmBox::from(normals.y.as_slice());
let normals_z = HerculesImmBox::from(normals.z.as_slice());
group.bench_function("cfd bench pre-euler", |b| {
b.iter(|| {
async_std::task::block_on(async {
r.run(
nelr as u64,
iterations as u64,
variables.to(),
v_density.to(),
v_momentum_x.to(),
v_momentum_y.to(),
v_momentum_z.to(),
v_energy.to(),
areas.to(),
elements_surrounding_elements.to(),
normals.to(),
ff_variable.to(),
ff_fc_density_energy.to(),
ff_fc_momentum_x.to(),
ff_fc_momentum_y.to(),
ff_fc_momentum_z.to(),
normals_x.to(),
normals_y.to(),
normals_z.to(),
ff_variable.density,
ff_variable.momentum.x,
ff_variable.momentum.y,
ff_variable.momentum.z,
ff_variable.energy,
ff_fc_density_energy.x,
ff_fc_density_energy.y,
ff_fc_density_energy.z,
ff_fc_momentum_x.x,
ff_fc_momentum_x.y,
ff_fc_momentum_x.z,
ff_fc_momentum_y.x,
ff_fc_momentum_y.y,
ff_fc_momentum_y.z,
ff_fc_momentum_z.x,
ff_fc_momentum_z.y,
ff_fc_momentum_z.z,
)
.await
});
......
......@@ -12,10 +12,11 @@ macro simpl!(X) {
simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
delete-uncalled(*);
simpl!(*);
ip-sroa[false](*);
sroa[false](*);
ip-sroa[true](*);
sroa[true](*);
predication(*);
const-inline(*);
simpl!(*);
......@@ -24,8 +25,31 @@ fixpoint {
fork-guard-elim(*);
}
simpl!(*);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
unforkify(compute_flux@inner_loop);
fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._6_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._7_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._8_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(*);
unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*);
......@@ -14,8 +14,8 @@ simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
delete-uncalled(*);
simpl!(*);
ip-sroa[false](*);
sroa[false](*);
ip-sroa[true](*);
sroa[true](*);
predication(*);
const-inline(*);
simpl!(*);
......@@ -24,7 +24,38 @@ fixpoint {
fork-guard-elim(*);
}
simpl!(*);
no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
unforkify(compute_flux@inner_loop);
unforkify(*);
fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux_contributions);
let split = fork-split(compute_flux_contributions);
let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
simpl!(compute_flux_contributions, compute_flux_contributions_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._7_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._8_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._9_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*);
gvn(*);
dce(*);
phi-elim(*);
dce(*);
crc(*);
dce(*);
slf(*);
dce(*);
macro simpl!(X) {
ccp(X);
simplify-cfg(X);
lift-dc-math(X);
gvn(X);
phi-elim(X);
crc(X);
slf(X);
dce(X);
infer-schedules(X);
}
let auto = auto-outline(euler);
gpu(auto.euler);
inline(auto.euler);
inline(auto.euler);
simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
delete-uncalled(*);
gpu(copy_vars, compute_step_factor, compute_flux, time_step);
sroa[false](auto.euler);
dce(*);
float-collections(*);
dce(*);
simpl!(*);
ip-sroa[true](*);
sroa[true](*);
predication(*);
const-inline(*);
simpl!(*);
fixpoint {
forkify(*);
fork-guard-elim(*);
}
simpl!(*);
unforkify(compute_flux@inner_loop);
gcm(*);
fork-tile[32, 0, false, true](compute_step_factor);
fork-split(compute_step_factor);
fork-tile[32, 0, false, true](compute_flux);
fork-split(compute_flux);
fork-tile[32, 0, false, true](time_step);
fork-split(time_step);
fork-tile[32, 0, false, true](copy_vars);
fork-split(copy_vars);
gcm(*);