Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • llvm/hercules
1 result
Show changes
Commits on Source (6)
Showing
with 484 additions and 223 deletions
/target **/target
*.dot *.dot
!paper_resources/*.dot !paper_resources/*.dot
*.bc *.bc
...@@ -14,3 +14,4 @@ ...@@ -14,3 +14,4 @@
.vscode .vscode
*_env *_env
*.txt *.txt
*ncu-rep
\ No newline at end of file
...@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups; ...@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
// because Fork basic block's init section already does gating // because Fork basic block's init section already does gating
write!( write!(
w, w,
"{}{} = (threadIdx.x % {}) / {};\n", "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
tabs, tabs,
define_variable, define_variable,
use_thread_quota.unwrap(), use_thread_quota.unwrap(),
use_thread_quota.unwrap() / parallel_factor.unwrap() use_thread_quota.unwrap() / parallel_factor.unwrap(),
divide,
modulo,
)?; )?;
} }
} }
......
...@@ -716,8 +716,10 @@ fn typeflow( ...@@ -716,8 +716,10 @@ fn typeflow(
// Check number of run-time arguments. // Check number of run-time arguments.
if inputs.len() - 1 != callee.param_types.len() { if inputs.len() - 1 != callee.param_types.len() {
return Error(format!( return Error(format!(
"Call node has {} inputs, but calls a function with {} parameters.", "Call node in {} has {} inputs, but calls a function ({}) with {} parameters.",
function.name,
inputs.len() - 1, inputs.len() - 1,
callee.name,
callee.param_types.len(), callee.param_types.len(),
)); ));
} }
...@@ -725,8 +727,10 @@ fn typeflow( ...@@ -725,8 +727,10 @@ fn typeflow(
// Check number of dynamic constant arguments. // Check number of dynamic constant arguments.
if dc_args.len() != callee.num_dynamic_constants as usize { if dc_args.len() != callee.num_dynamic_constants as usize {
return Error(format!( return Error(format!(
"Call node references {} dynamic constants, but calls a function expecting {} dynamic constants.", "Call node in {} references {} dynamic constants, but calls a function ({}) expecting {} dynamic constants.",
function.name,
dc_args.len(), dc_args.len(),
callee.name,
callee.num_dynamic_constants callee.num_dynamic_constants
)); ));
} }
......
...@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge( ...@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Rem, op: BinaryOperator::Rem,
}); });
edit.sub_edit(tid, rem); edit.sub_edit(tid, rem);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, rem)?; edit = edit.replace_all_uses(tid, rem)?;
} else if tid_dim == inner_idx { } else if tid_dim == inner_idx {
let outer_tid = Node::ThreadID { let outer_tid = Node::ThreadID {
...@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge( ...@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Div, op: BinaryOperator::Div,
}); });
edit.sub_edit(tid, div); edit.sub_edit(tid, div);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, div)?; edit = edit.replace_all_uses(tid, div)?;
} }
} }
...@@ -1479,7 +1481,12 @@ fn fork_fusion( ...@@ -1479,7 +1481,12 @@ fn fork_fusion(
} }
// Perform the fusion. // Perform the fusion.
let bottom_tids: Vec<_> = editor
.get_users(bottom_fork)
.filter(|id| nodes[id.idx()].is_thread_id())
.collect();
editor.edit(|mut edit| { editor.edit(|mut edit| {
edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
if bottom_join_pred != bottom_fork { if bottom_join_pred != bottom_fork {
// If there is control flow in the bottom fork-join, stitch it into // If there is control flow in the bottom fork-join, stitch it into
// the top fork-join. // the top fork-join.
......
use std::collections::HashMap;
use std::iter::zip;
use hercules_ir::ir::*; use hercules_ir::ir::*;
use crate::*; use crate::*;
...@@ -39,15 +36,30 @@ pub fn interprocedural_sroa( ...@@ -39,15 +36,30 @@ pub fn interprocedural_sroa(
} }
let editor: &mut FunctionEditor = &mut editors[func_id.idx()]; let editor: &mut FunctionEditor = &mut editors[func_id.idx()];
let param_types = &editor.func().param_types.to_vec();
let return_types = &editor.func().return_types.to_vec(); let return_types = &editor.func().return_types.to_vec();
// We determine the new return types of the function and track a map // We determine the new param/return types of the function and track a
// that tells us how the old return values are constructed from the // map that tells us how the old param/return values are constructed
// new ones // from the new ones.
let mut new_param_types = vec![];
let mut old_param_type_map = vec![];
let mut new_return_types = vec![]; let mut new_return_types = vec![];
let mut old_return_type_map = vec![]; let mut old_return_type_map = vec![];
let mut changed = false; let mut changed = false;
for par_typ in param_types.iter() {
if !can_sroa_type(editor, *par_typ) {
old_param_type_map.push(IndexTree::Leaf(new_param_types.len()));
new_param_types.push(*par_typ);
} else {
let (types, index) = sroa_type(editor, *par_typ, new_param_types.len());
old_param_type_map.push(index);
new_param_types.extend(types);
changed = true;
}
}
for ret_typ in return_types.iter() { for ret_typ in return_types.iter() {
if !can_sroa_type(editor, *ret_typ) { if !can_sroa_type(editor, *ret_typ) {
old_return_type_map.push(IndexTree::Leaf(new_return_types.len())); old_return_type_map.push(IndexTree::Leaf(new_return_types.len()));
...@@ -60,25 +72,54 @@ pub fn interprocedural_sroa( ...@@ -60,25 +72,54 @@ pub fn interprocedural_sroa(
} }
} }
// If the return type is not changed by IP SROA, skip to the next function // If the param/return types aren't changed by IP SROA, skip to the next
// function.
if !changed { if !changed {
continue; continue;
} }
// Now, modify each return in the current function and the return type // Modify each parameter in the current function and the param types.
let return_nodes = editor let mut param_nodes: Vec<_> = vec![vec![]; param_types.len()];
.func() for id in editor.node_ids() {
.nodes if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() {
.iter() param_nodes[idx].push(id);
.enumerate() }
.filter_map(|(idx, node)| { }
if node.try_return().is_some() { let success = editor.edit(|mut edit| {
Some(NodeID::new(idx)) for (idx, ids) in param_nodes.into_iter().enumerate() {
let new_indices = &old_param_type_map[idx];
let built = if let IndexTree::Leaf(new_idx) = new_indices {
edit.add_node(Node::Parameter { index: *new_idx })
} else { } else {
None let prod_ty = param_types[idx];
let cons = edit.add_zero_constant(prod_ty);
let mut cons = edit.add_node(Node::Constant { id: cons });
new_indices.for_each(|idx: &Vec<Index>, param_idx: &usize| {
let param = edit.add_node(Node::Parameter { index: *param_idx });
cons = edit.add_node(Node::Write {
collect: cons,
data: param,
indices: idx.clone().into_boxed_slice(),
});
});
cons
};
for id in ids {
edit = edit.replace_all_uses(id, built)?;
edit = edit.delete_node(id)?;
} }
}) }
.collect::<Vec<_>>();
edit.set_param_types(new_param_types);
Ok(edit)
});
assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");
// Modify each return in the current function and the return types.
let return_nodes: Vec<_> = editor
.node_ids()
.filter(|id| editor.func().nodes[id.idx()].is_return())
.collect();
let success = editor.edit(|mut edit| { let success = editor.edit(|mut edit| {
for node in return_nodes { for node in return_nodes {
let Node::Return { control, data } = edit.get_node(node) else { let Node::Return { control, data } = edit.get_node(node) else {
...@@ -114,17 +155,15 @@ pub fn interprocedural_sroa( ...@@ -114,17 +155,15 @@ pub fn interprocedural_sroa(
} }
edit.set_return_types(new_return_types); edit.set_return_types(new_return_types);
Ok(edit) Ok(edit)
}); });
assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument"); assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");
// Finally, update calls of this function // Finally, update calls of this function.
// In particular, we actually don't have to update the call node at all but have to update
// its DataProjection users
for (caller, callsite) in callsites { for (caller, callsite) in callsites {
let editor = &mut editors[caller.idx()]; let editor = &mut editors[caller.idx()];
assert!(editor.func_id() == caller); assert!(editor.func_id() == caller);
let projs = editor.get_users(callsite).collect::<Vec<_>>(); let projs = editor.get_users(callsite).collect::<Vec<_>>();
for proj_id in projs { for proj_id in projs {
let Node::DataProjection { data: _, selection } = editor.node(proj_id) else { let Node::DataProjection { data: _, selection } = editor.node(proj_id) else {
...@@ -134,6 +173,40 @@ pub fn interprocedural_sroa( ...@@ -134,6 +173,40 @@ pub fn interprocedural_sroa(
let typ = types[caller.idx()][proj_id.idx()]; let typ = types[caller.idx()][proj_id.idx()];
replace_returned_value(editor, proj_id, typ, new_return_info, callsite); replace_returned_value(editor, proj_id, typ, new_return_info, callsite);
} }
let (control, callee, dc_args, args) =
editor.func().nodes[callsite.idx()].try_call().unwrap();
let dc_args = dc_args.clone();
let args = args.clone();
let success = editor.edit(|mut edit| {
let mut new_args = vec![];
for (idx, (data_id, update_info)) in
args.iter().zip(old_param_type_map.iter()).enumerate()
{
if let IndexTree::Leaf(new_idx) = update_info {
// Unchanged parameter value
assert!(new_args.len() == *new_idx);
new_args.push(*data_id);
} else {
// SROA'd parameter value
let reads = generate_reads_edit(&mut edit, param_types[idx], *data_id);
reads.zip(update_info).for_each(|_, (read_id, ret_idx)| {
assert!(new_args.len() == **ret_idx);
new_args.push(*read_id);
});
}
}
let new_call = edit.add_node(Node::Call {
control,
function: callee,
dynamic_constants: dc_args,
args: new_args.into_boxed_slice(),
});
edit = edit.replace_all_uses(callsite, new_call)?;
edit = edit.delete_node(callsite)?;
Ok(edit)
});
assert!(success);
} }
} }
} }
......
...@@ -447,7 +447,7 @@ pub fn sroa( ...@@ -447,7 +447,7 @@ pub fn sroa(
field_map.insert(node, generate_reads(editor, types[&node], node)); field_map.insert(node, generate_reads(editor, types[&node], node));
} }
Node::Constant { id } => { Node::Constant { id } => {
field_map.insert(node, generate_constant_fields(editor, id)); field_map.insert(node, generate_constant_fields(editor, id, node));
to_delete.push(node); to_delete.push(node);
} }
_ => { _ => {
...@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID ...@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID
// Given a constant cnst adds node to the function which are the constant values of each field and // Given a constant cnst adds node to the function which are the constant values of each field and
// returns a list of pairs of indices and the node that holds that index // returns a list of pairs of indices and the node that holds that index
fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> { fn generate_constant_fields(
editor: &mut FunctionEditor,
cnst: ConstantID,
old_node: NodeID,
) -> IndexTree<NodeID> {
let cs: Option<Vec<ConstantID>> = let cs: Option<Vec<ConstantID>> =
if let Some(cs) = editor.get_constant(cnst).try_product_fields() { if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
Some(cs.into()) Some(cs.into())
...@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In ...@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
if let Some(cs) = cs { if let Some(cs) = cs {
let mut fields = vec![]; let mut fields = vec![];
for c in cs { for c in cs {
fields.push(generate_constant_fields(editor, c)); fields.push(generate_constant_fields(editor, c, old_node));
} }
IndexTree::Node(fields) IndexTree::Node(fields)
} else { } else {
let mut node = None; let mut node = None;
editor.edit(|mut edit| { editor.edit(|mut edit| {
node = Some(edit.add_node(Node::Constant { id: cnst })); node = Some(edit.add_node(Node::Constant { id: cnst }));
edit.sub_edit(old_node, node.unwrap());
Ok(edit) Ok(edit)
}); });
IndexTree::Leaf(node.expect("Add node cannot fail")) IndexTree::Leaf(node.expect("Add node cannot fail"))
......
...@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( ...@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@image_loop for r = 0 to row { @image_loop for r = 0 to row {
for c = 0 to col { for c = 0 to col {
@l2 let l2_dist : f32[num_ctrl_pts]; @l2 let l2_dist : f32[num_ctrl_pts];
for cp = 0 to num_ctrl_pts { @cp_loop for cp = 0 to num_ctrl_pts {
let v1 = input[0, r, c] - ctrl_pts[cp, 0]; let v1 = input[0, r, c] - ctrl_pts[cp, 0];
let v2 = input[1, r, c] - ctrl_pts[cp, 1]; let v2 = input[1, r, c] - ctrl_pts[cp, 1];
let v3 = input[2, r, c] - ctrl_pts[cp, 2]; let v3 = input[2, r, c] - ctrl_pts[cp, 2];
...@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( ...@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@channel_loop for chan = 0 to CHAN { @channel_loop for chan = 0 to CHAN {
let chan_val : f32 = 0.0; let chan_val : f32 = 0.0;
for cp = 0 to num_ctrl_pts { @cp_loop for cp = 0 to num_ctrl_pts {
chan_val += l2_dist[cp] * weights[cp, chan]; chan_val += l2_dist[cp] * weights[cp, chan];
} }
......
...@@ -115,7 +115,7 @@ array-slf(fuse4); ...@@ -115,7 +115,7 @@ array-slf(fuse4);
simpl!(fuse4); simpl!(fuse4);
let par = fuse4@image_loop \ fuse4@channel_loop; let par = fuse4@image_loop \ fuse4@channel_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let fuse4_body = outline(split.cava_3.fj2); let fuse4_body = outline(split.cava_3.fj2);
......
...@@ -117,9 +117,9 @@ fixpoint { ...@@ -117,9 +117,9 @@ fixpoint {
simpl!(fuse4); simpl!(fuse4);
array-slf(fuse4); array-slf(fuse4);
simpl!(fuse4); simpl!(fuse4);
//fork-tile[2, 0, false, true](fuse4@channel_loop); fork-tile[2, 0, false, true](fuse4@channel_loop);
//fork-split(fuse4@channel_loop); let out = fork-split(fuse4@channel_loop);
//clean-monoid-reduces(fuse4); fork-unroll(out.cava_3.fj1);
unforkify(fuse4@channel_loop); unforkify(fuse4@channel_loop);
no-memset(fuse5@res1); no-memset(fuse5@res1);
...@@ -133,6 +133,13 @@ simpl!(fuse5); ...@@ -133,6 +133,13 @@ simpl!(fuse5);
array-slf(fuse5); array-slf(fuse5);
simpl!(fuse5); simpl!(fuse5);
fork-tile[4, 1, false, true](fuse4);
fork-tile[8, 0, false, true](fuse4);
fork-interchange[1, 2](fuse4);
let split = fork-split(fuse4);
fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
fork-coalesce(split.cava_3.fj2);
delete-uncalled(*); delete-uncalled(*);
simpl!(*); simpl!(*);
......
...@@ -124,9 +124,9 @@ pub struct CavaInputs { ...@@ -124,9 +124,9 @@ pub struct CavaInputs {
#[clap(long = "output-verify", value_name = "PATH")] #[clap(long = "output-verify", value_name = "PATH")]
pub output_verify: Option<String>, pub output_verify: Option<String>,
pub cam_model: String, pub cam_model: String,
#[clap(short, long)] #[clap(long)]
pub crop_rows: Option<usize>, pub crop_rows: Option<usize>,
#[clap(short, long)] #[clap(long)]
pub crop_cols: Option<usize>, pub crop_cols: Option<usize>,
} }
......
...@@ -26,14 +26,14 @@ predication(gaussian_smoothing); ...@@ -26,14 +26,14 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing); simpl!(gaussian_smoothing);
let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
simpl!(gaussian_smoothing, gaussian_smoothing_body); simpl!(gaussian_smoothing, gaussian_smoothing_body);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); no-memset(laplacian_estimate@res);
fixpoint { fixpoint {
forkify(laplacian_estimate); forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate); fork-guard-elim(laplacian_estimate);
...@@ -42,15 +42,15 @@ fixpoint { ...@@ -42,15 +42,15 @@ fixpoint {
simpl!(laplacian_estimate); simpl!(laplacian_estimate);
let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2; let body = split._1_laplacian_estimate.fj2;
let laplacian_estimate_body = outline(body); let laplacian_estimate_body = outline(body);
fork-coalesce(laplacian_estimate, laplacian_estimate_body); fork-coalesce(laplacian_estimate, laplacian_estimate_body);
simpl!(laplacian_estimate, laplacian_estimate_body); simpl!(laplacian_estimate, laplacian_estimate_body);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); no-memset(zero_crossings@res);
fixpoint { fixpoint {
forkify(zero_crossings); forkify(zero_crossings);
fork-guard-elim(zero_crossings); fork-guard-elim(zero_crossings);
...@@ -59,10 +59,10 @@ fixpoint { ...@@ -59,10 +59,10 @@ fixpoint {
simpl!(zero_crossings); simpl!(zero_crossings);
let par = zero_crossings@image_loop \ zero_crossings@filter_loop; let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2; let body = split._2_zero_crossings.fj2;
let zero_crossings_body = outline(body); let zero_crossings_body = outline(body);
fork-coalesce(zero_crossings, zero_crossings_body); fork-coalesce(zero_crossings, zero_crossings_body);
simpl!(zero_crossings, zero_crossings_body); simpl!(zero_crossings, zero_crossings_body);
...@@ -86,7 +86,7 @@ fixpoint { ...@@ -86,7 +86,7 @@ fixpoint {
simpl!(max_gradient); simpl!(max_gradient);
fork-dim-merge(max_gradient); fork-dim-merge(max_gradient);
simpl!(max_gradient); simpl!(max_gradient);
fork-tile[16, 0, false, false](max_gradient); fork-tile[32, 0, false, false](max_gradient);
let split = fork-split(max_gradient); let split = fork-split(max_gradient);
clean-monoid-reduces(max_gradient); clean-monoid-reduces(max_gradient);
let out = outline(split._4_max_gradient.fj1); let out = outline(split._4_max_gradient.fj1);
...@@ -105,7 +105,7 @@ fixpoint { ...@@ -105,7 +105,7 @@ fixpoint {
predication(reject_zero_crossings); predication(reject_zero_crossings);
simpl!(reject_zero_crossings); simpl!(reject_zero_crossings);
fork-tile[4, 1, false, false](reject_zero_crossings); fork-tile[4, 1, false, false](reject_zero_crossings);
fork-tile[4, 0, false, false](reject_zero_crossings); fork-tile[8, 0, false, false](reject_zero_crossings);
fork-interchange[1, 2](reject_zero_crossings); fork-interchange[1, 2](reject_zero_crossings);
let split = fork-split(reject_zero_crossings); let split = fork-split(reject_zero_crossings);
let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
......
...@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>( ...@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(
@image_loop for row = 0 to n { @image_loop for row = 0 to n {
for col = 0 to m { for col = 0 to m {
// Copy data for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else input[row + i - r, col + j - r];
}
}
// Compute pixel of dilated image // Compute pixel of dilated image
let dilated = MIN_BR; let dilated = MIN_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
} || row + i - r > n - 1
} || col + j < r
|| col + j - r > m - 1 then MIN_BR
// Data copy for erotion filter else input[row + i - r, col + j - r];
@shr2 let imageArea : f32[sz, sz]; dilated = max!(dilated, filter * structure[i, j]);
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
} }
} }
...@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>( ...@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
let eroded = MAX_BR; let eroded = MAX_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
eroded = min!(eroded, filter * structure[i, j]);
} }
} }
...@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>( ...@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(
@image_loop for row = 0 to n { @image_loop for row = 0 to n {
for col = 0 to m { for col = 0 to m {
// Data copy for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
}
}
// Compute the pixel of dilated image // Compute the pixel of dilated image
let dilated = MIN_BR; let dilated = MIN_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
} || row + i - r > n - 1
} || col + j < r
|| col + j - r > m - 1 then MIN_BR
// Data copy for erotion filter else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
@shr2 let imageArea : f32[sz, sz]; else MIN_BR;
@filter_loop for i = 0 to sz { dilated = max!(dilated, filter * structure[i, j]);
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
} }
} }
...@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>( ...@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
let eroded = MAX_BR; let eroded = MAX_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
eroded = min!(eroded, filter * structure[i, j]);
} }
} }
...@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>( ...@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>(
let gx = 0; let gx = 0;
let gy = 0; let gy = 0;
for i = 0 to sb { @filter_loop for i = 0 to sb {
for j = 0 to sb { for j = 0 to sb {
let val = input[if row + i < sbr then 0 let val = input[if row + i < sbr then 0
else if row + i - sbr > n - 1 then n - 1 else if row + i - sbr > n - 1 then n - 1
......
...@@ -26,22 +26,49 @@ predication(gaussian_smoothing); ...@@ -26,22 +26,49 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing); simpl!(gaussian_smoothing);
predication(gaussian_smoothing); predication(gaussian_smoothing);
simpl!(gaussian_smoothing); simpl!(gaussian_smoothing);
fork-dim-merge(gaussian_smoothing@filter_loop);
unforkify(gaussian_smoothing@filter_loop);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); fork-dim-merge(gaussian_smoothing);
fork-tile[32, 0, false, true](gaussian_smoothing);
simpl!(gaussian_smoothing);
fork-split(gaussian_smoothing);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res);
fixpoint { fixpoint {
forkify(laplacian_estimate); forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate); fork-guard-elim(laplacian_estimate);
fork-coalesce(laplacian_estimate); fork-coalesce(laplacian_estimate);
} }
simpl!(laplacian_estimate); simpl!(laplacian_estimate);
fork-dim-merge(laplacian_estimate@filter_loop);
unforkify(laplacian_estimate@filter_loop);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); fork-dim-merge(laplacian_estimate);
fork-tile[32, 0, false, true](laplacian_estimate);
simpl!(laplacian_estimate);
fork-split(laplacian_estimate);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res);
fixpoint { fixpoint {
forkify(zero_crossings); forkify(zero_crossings);
fork-guard-elim(zero_crossings); fork-guard-elim(zero_crossings);
fork-coalesce(zero_crossings); fork-coalesce(zero_crossings);
} }
simpl!(zero_crossings); simpl!(zero_crossings);
fork-dim-merge(zero_crossings@filter_loop);
unforkify(zero_crossings@filter_loop);
simpl!(zero_crossings);
fork-dim-merge(zero_crossings);
fork-tile[32, 0, false, true](zero_crossings);
simpl!(zero_crossings);
fork-split(zero_crossings);
simpl!(zero_crossings);
no-memset(gradient@res); no-memset(gradient@res);
fixpoint { fixpoint {
...@@ -53,6 +80,15 @@ predication(gradient); ...@@ -53,6 +80,15 @@ predication(gradient);
simpl!(gradient); simpl!(gradient);
predication(gradient); predication(gradient);
simpl!(gradient); simpl!(gradient);
fork-dim-merge(gradient@filter_loop);
unforkify(gradient@filter_loop);
simpl!(gradient);
fork-dim-merge(gradient);
fork-tile[32, 0, false, true](gradient);
simpl!(gradient);
fork-split(gradient);
simpl!(gradient);
fixpoint { fixpoint {
forkify(max_gradient); forkify(max_gradient);
...@@ -88,6 +124,12 @@ fixpoint { ...@@ -88,6 +124,12 @@ fixpoint {
predication(reject_zero_crossings); predication(reject_zero_crossings);
simpl!(reject_zero_crossings); simpl!(reject_zero_crossings);
fork-dim-merge(reject_zero_crossings);
fork-tile[32, 0, false, true](reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-split(reject_zero_crossings);
simpl!(reject_zero_crossings);
async-call(edge_detection@le, edge_detection@zc); async-call(edge_detection@le, edge_detection@zc);
simpl!(*); simpl!(*);
......
use juno_build::JunoCompiler; use juno_build::JunoCompiler;
fn main() { fn main() {
#[cfg(not(feature = "cuda"))] JunoCompiler::new()
{ .file_in_src("matmul.jn")
JunoCompiler::new() .unwrap()
.file_in_src("matmul.jn") .schedule_in_src("matmul.sch")
.unwrap() .unwrap()
.schedule_in_src("cpu.sch") .build()
.unwrap() .unwrap();
.build()
.unwrap();
}
#[cfg(feature = "cuda")]
{
JunoCompiler::new()
.file_in_src("matmul.jn")
.unwrap()
.schedule_in_src("gpu.sch")
.unwrap()
.build()
.unwrap();
}
} }
phi-elim(*);
forkify(*);
fork-guard-elim(*);
dce(*);
fixpoint {
reduce-slf(*);
slf(*);
infer-schedules(*);
}
fork-coalesce(*);
infer-schedules(*);
dce(*);
rewrite(*);
fixpoint {
simplify-cfg(*);
dce(*);
}
ip-sroa(*);
sroa(*);
dce(*);
float-collections(*);
gcm(*);
...@@ -41,21 +41,41 @@ macro unforkify!(X) { ...@@ -41,21 +41,41 @@ macro unforkify!(X) {
optimize!(*); optimize!(*);
forkify!(*); forkify!(*);
associative(matmul@outer);
// Parallelize by computing output array as 16 chunks if feature("cuda") {
let par = matmul@outer \ matmul@inner; fixpoint {
fork-tile![4](par); reduce-slf(*);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par); slf(*);
parallelize!(outer \ inner); infer-schedules(*);
}
fork-coalesce(*);
infer-schedules(*);
dce(*);
rewrite(*);
fixpoint {
simplify-cfg(*);
dce(*);
}
let body = outline(inner); optimize!(*);
cpu(body); codegen-prep!(*);
} else {
associative(matmul@outer);
// Tile for cache, assuming 64B cache lines // Parallelize by computing output array as 16 chunks
fork-tile![16](body); let par = matmul@outer \ matmul@inner;
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body); fork-tile![4](par);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
parallelize!(outer \ inner);
reduce-slf(inner); let body = outline(inner);
unforkify!(body); cpu(body);
codegen-prep!(*);
// Tile for cache, assuming 64B cache lines
fork-tile![16](body);
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
reduce-slf(inner);
unforkify!(body);
codegen-prep!(*);
}
...@@ -28,40 +28,55 @@ fn cfd_bench(c: &mut Criterion) { ...@@ -28,40 +28,55 @@ fn cfd_bench(c: &mut Criterion) {
elements_surrounding_elements, elements_surrounding_elements,
normals, normals,
} = read_domain_geometry(data_file, block_size); } = read_domain_geometry(data_file, block_size);
let mut variables = initialize_variables(nelr, ff_variable.as_slice()); let mut variables = initialize_variables(nelr, &ff_variable);
let mut variables = HerculesMutBox::from(variables.as_mut_slice());
let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
let areas = HerculesImmBox::from(areas.as_slice()); let areas = HerculesImmBox::from(areas.as_slice());
let elements_surrounding_elements = let elements_surrounding_elements =
HerculesImmBox::from(elements_surrounding_elements.as_slice()); HerculesImmBox::from(elements_surrounding_elements.as_slice());
let normals = HerculesImmBox::from(normals.as_slice());
let ff_variable = HerculesImmBox::from(ff_variable.as_slice()); let normals_x = HerculesImmBox::from(normals.x.as_slice());
let ff_fc_density_energy = vec![ let normals_y = HerculesImmBox::from(normals.y.as_slice());
ff_fc_density_energy.x, let normals_z = HerculesImmBox::from(normals.z.as_slice());
ff_fc_density_energy.y,
ff_fc_density_energy.z,
];
let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
group.bench_function("cfd bench euler", |b| { group.bench_function("cfd bench euler", |b| {
b.iter(|| { b.iter(|| {
async_std::task::block_on(async { async_std::task::block_on(async {
r.run( r.run(
nelr as u64, nelr as u64,
iterations as u64, iterations as u64,
variables.to(), v_density.to(),
v_momentum_x.to(),
v_momentum_y.to(),
v_momentum_z.to(),
v_energy.to(),
areas.to(), areas.to(),
elements_surrounding_elements.to(), elements_surrounding_elements.to(),
normals.to(), normals_x.to(),
ff_variable.to(), normals_y.to(),
ff_fc_density_energy.to(), normals_z.to(),
ff_fc_momentum_x.to(), ff_variable.density,
ff_fc_momentum_y.to(), ff_variable.momentum.x,
ff_fc_momentum_z.to(), ff_variable.momentum.y,
ff_variable.momentum.z,
ff_variable.energy,
ff_fc_density_energy.x,
ff_fc_density_energy.y,
ff_fc_density_energy.z,
ff_fc_momentum_x.x,
ff_fc_momentum_x.y,
ff_fc_momentum_x.z,
ff_fc_momentum_y.x,
ff_fc_momentum_y.y,
ff_fc_momentum_y.z,
ff_fc_momentum_z.x,
ff_fc_momentum_z.y,
ff_fc_momentum_z.z,
) )
.await .await
}); });
...@@ -85,40 +100,55 @@ fn cfd_bench(c: &mut Criterion) { ...@@ -85,40 +100,55 @@ fn cfd_bench(c: &mut Criterion) {
elements_surrounding_elements, elements_surrounding_elements,
normals, normals,
} = read_domain_geometry(data_file, block_size); } = read_domain_geometry(data_file, block_size);
let mut variables = initialize_variables(nelr, ff_variable.as_slice()); let mut variables = initialize_variables(nelr, &ff_variable);
let mut variables = HerculesMutBox::from(variables.as_mut_slice());
let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
let areas = HerculesImmBox::from(areas.as_slice()); let areas = HerculesImmBox::from(areas.as_slice());
let elements_surrounding_elements = let elements_surrounding_elements =
HerculesImmBox::from(elements_surrounding_elements.as_slice()); HerculesImmBox::from(elements_surrounding_elements.as_slice());
let normals = HerculesImmBox::from(normals.as_slice());
let ff_variable = HerculesImmBox::from(ff_variable.as_slice()); let normals_x = HerculesImmBox::from(normals.x.as_slice());
let ff_fc_density_energy = vec![ let normals_y = HerculesImmBox::from(normals.y.as_slice());
ff_fc_density_energy.x, let normals_z = HerculesImmBox::from(normals.z.as_slice());
ff_fc_density_energy.y,
ff_fc_density_energy.z,
];
let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
group.bench_function("cfd bench pre-euler", |b| { group.bench_function("cfd bench pre-euler", |b| {
b.iter(|| { b.iter(|| {
async_std::task::block_on(async { async_std::task::block_on(async {
r.run( r.run(
nelr as u64, nelr as u64,
iterations as u64, iterations as u64,
variables.to(), v_density.to(),
v_momentum_x.to(),
v_momentum_y.to(),
v_momentum_z.to(),
v_energy.to(),
areas.to(), areas.to(),
elements_surrounding_elements.to(), elements_surrounding_elements.to(),
normals.to(), normals_x.to(),
ff_variable.to(), normals_y.to(),
ff_fc_density_energy.to(), normals_z.to(),
ff_fc_momentum_x.to(), ff_variable.density,
ff_fc_momentum_y.to(), ff_variable.momentum.x,
ff_fc_momentum_z.to(), ff_variable.momentum.y,
ff_variable.momentum.z,
ff_variable.energy,
ff_fc_density_energy.x,
ff_fc_density_energy.y,
ff_fc_density_energy.z,
ff_fc_momentum_x.x,
ff_fc_momentum_x.y,
ff_fc_momentum_x.z,
ff_fc_momentum_y.x,
ff_fc_momentum_y.y,
ff_fc_momentum_y.z,
ff_fc_momentum_z.x,
ff_fc_momentum_z.y,
ff_fc_momentum_z.z,
) )
.await .await
}); });
......
...@@ -12,10 +12,11 @@ macro simpl!(X) { ...@@ -12,10 +12,11 @@ macro simpl!(X) {
simpl!(*); simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
delete-uncalled(*); delete-uncalled(*);
simpl!(*); simpl!(*);
ip-sroa[false](*); ip-sroa[true](*);
sroa[false](*); sroa[true](*);
predication(*); predication(*);
const-inline(*); const-inline(*);
simpl!(*); simpl!(*);
...@@ -24,8 +25,31 @@ fixpoint { ...@@ -24,8 +25,31 @@ fixpoint {
fork-guard-elim(*); fork-guard-elim(*);
} }
simpl!(*); simpl!(*);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); unforkify(compute_flux@inner_loop);
parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._6_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._7_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._8_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(*); unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*); gcm(*);
...@@ -14,8 +14,8 @@ simpl!(*); ...@@ -14,8 +14,8 @@ simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
delete-uncalled(*); delete-uncalled(*);
simpl!(*); simpl!(*);
ip-sroa[false](*); ip-sroa[true](*);
sroa[false](*); sroa[true](*);
predication(*); predication(*);
const-inline(*); const-inline(*);
simpl!(*); simpl!(*);
...@@ -24,7 +24,38 @@ fixpoint { ...@@ -24,7 +24,38 @@ fixpoint {
fork-guard-elim(*); fork-guard-elim(*);
} }
simpl!(*); simpl!(*);
no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
unforkify(compute_flux@inner_loop);
unforkify(*); fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux_contributions);
let split = fork-split(compute_flux_contributions);
let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
simpl!(compute_flux_contributions, compute_flux_contributions_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._7_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._8_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._9_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*); gcm(*);
gvn(*); macro simpl!(X) {
dce(*); ccp(X);
phi-elim(*); simplify-cfg(X);
dce(*); lift-dc-math(X);
crc(*); gvn(X);
dce(*); phi-elim(X);
slf(*); crc(X);
dce(*); slf(X);
dce(X);
infer-schedules(X);
}
let auto = auto-outline(euler); simpl!(*);
gpu(auto.euler); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
inline(auto.euler);
inline(auto.euler);
delete-uncalled(*); delete-uncalled(*);
gpu(copy_vars, compute_step_factor, compute_flux, time_step);
sroa[false](auto.euler); simpl!(*);
dce(*); ip-sroa[true](*);
float-collections(*); sroa[true](*);
dce(*); predication(*);
const-inline(*);
simpl!(*);
fixpoint {
forkify(*);
fork-guard-elim(*);
}
simpl!(*);
unforkify(compute_flux@inner_loop);
gcm(*); fork-tile[32, 0, false, true](compute_step_factor);
fork-split(compute_step_factor);
fork-tile[32, 0, false, true](compute_flux);
fork-split(compute_flux);
fork-tile[32, 0, false, true](time_step);
fork-split(time_step);
fork-tile[32, 0, false, true](copy_vars);
fork-split(copy_vars);
gcm(*);