Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • llvm/hercules
1 result
Show changes
Commits on Source (2)
Showing
with 246 additions and 109 deletions
/target **/target
*.dot *.dot
!paper_resources/*.dot !paper_resources/*.dot
*.bc *.bc
...@@ -14,3 +14,4 @@ ...@@ -14,3 +14,4 @@
.vscode .vscode
*_env *_env
*.txt *.txt
*ncu-rep
\ No newline at end of file
...@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups; ...@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
// because Fork basic block's init section already does gating // because Fork basic block's init section already does gating
write!( write!(
w, w,
"{}{} = (threadIdx.x % {}) / {};\n", "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
tabs, tabs,
define_variable, define_variable,
use_thread_quota.unwrap(), use_thread_quota.unwrap(),
use_thread_quota.unwrap() / parallel_factor.unwrap() use_thread_quota.unwrap() / parallel_factor.unwrap(),
divide,
modulo,
)?; )?;
} }
} }
......
...@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge( ...@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Rem, op: BinaryOperator::Rem,
}); });
edit.sub_edit(tid, rem); edit.sub_edit(tid, rem);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, rem)?; edit = edit.replace_all_uses(tid, rem)?;
} else if tid_dim == inner_idx { } else if tid_dim == inner_idx {
let outer_tid = Node::ThreadID { let outer_tid = Node::ThreadID {
...@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge( ...@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
op: BinaryOperator::Div, op: BinaryOperator::Div,
}); });
edit.sub_edit(tid, div); edit.sub_edit(tid, div);
edit.sub_edit(tid, outer_tid);
edit = edit.replace_all_uses(tid, div)?; edit = edit.replace_all_uses(tid, div)?;
} }
} }
...@@ -1479,7 +1481,12 @@ fn fork_fusion( ...@@ -1479,7 +1481,12 @@ fn fork_fusion(
} }
// Perform the fusion. // Perform the fusion.
let bottom_tids: Vec<_> = editor
.get_users(bottom_fork)
.filter(|id| nodes[id.idx()].is_thread_id())
.collect();
editor.edit(|mut edit| { editor.edit(|mut edit| {
edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
if bottom_join_pred != bottom_fork { if bottom_join_pred != bottom_fork {
// If there is control flow in the bottom fork-join, stitch it into // If there is control flow in the bottom fork-join, stitch it into
// the top fork-join. // the top fork-join.
......
...@@ -85,7 +85,6 @@ pub fn interprocedural_sroa( ...@@ -85,7 +85,6 @@ pub fn interprocedural_sroa(
param_nodes[idx].push(id); param_nodes[idx].push(id);
} }
} }
println!("{}", editor.func().name);
let success = editor.edit(|mut edit| { let success = editor.edit(|mut edit| {
for (idx, ids) in param_nodes.into_iter().enumerate() { for (idx, ids) in param_nodes.into_iter().enumerate() {
let new_indices = &old_param_type_map[idx]; let new_indices = &old_param_type_map[idx];
......
...@@ -447,7 +447,7 @@ pub fn sroa( ...@@ -447,7 +447,7 @@ pub fn sroa(
field_map.insert(node, generate_reads(editor, types[&node], node)); field_map.insert(node, generate_reads(editor, types[&node], node));
} }
Node::Constant { id } => { Node::Constant { id } => {
field_map.insert(node, generate_constant_fields(editor, id)); field_map.insert(node, generate_constant_fields(editor, id, node));
to_delete.push(node); to_delete.push(node);
} }
_ => { _ => {
...@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID ...@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID
// Given a constant cnst adds node to the function which are the constant values of each field and // Given a constant cnst adds node to the function which are the constant values of each field and
// returns a list of pairs of indices and the node that holds that index // returns a list of pairs of indices and the node that holds that index
fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> { fn generate_constant_fields(
editor: &mut FunctionEditor,
cnst: ConstantID,
old_node: NodeID,
) -> IndexTree<NodeID> {
let cs: Option<Vec<ConstantID>> = let cs: Option<Vec<ConstantID>> =
if let Some(cs) = editor.get_constant(cnst).try_product_fields() { if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
Some(cs.into()) Some(cs.into())
...@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In ...@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
if let Some(cs) = cs { if let Some(cs) = cs {
let mut fields = vec![]; let mut fields = vec![];
for c in cs { for c in cs {
fields.push(generate_constant_fields(editor, c)); fields.push(generate_constant_fields(editor, c, old_node));
} }
IndexTree::Node(fields) IndexTree::Node(fields)
} else { } else {
let mut node = None; let mut node = None;
editor.edit(|mut edit| { editor.edit(|mut edit| {
node = Some(edit.add_node(Node::Constant { id: cnst })); node = Some(edit.add_node(Node::Constant { id: cnst }));
edit.sub_edit(old_node, node.unwrap());
Ok(edit) Ok(edit)
}); });
IndexTree::Leaf(node.expect("Add node cannot fail")) IndexTree::Leaf(node.expect("Add node cannot fail"))
......
...@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( ...@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@image_loop for r = 0 to row { @image_loop for r = 0 to row {
for c = 0 to col { for c = 0 to col {
@l2 let l2_dist : f32[num_ctrl_pts]; @l2 let l2_dist : f32[num_ctrl_pts];
for cp = 0 to num_ctrl_pts { @cp_loop for cp = 0 to num_ctrl_pts {
let v1 = input[0, r, c] - ctrl_pts[cp, 0]; let v1 = input[0, r, c] - ctrl_pts[cp, 0];
let v2 = input[1, r, c] - ctrl_pts[cp, 1]; let v2 = input[1, r, c] - ctrl_pts[cp, 1];
let v3 = input[2, r, c] - ctrl_pts[cp, 2]; let v3 = input[2, r, c] - ctrl_pts[cp, 2];
...@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( ...@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
@channel_loop for chan = 0 to CHAN { @channel_loop for chan = 0 to CHAN {
let chan_val : f32 = 0.0; let chan_val : f32 = 0.0;
for cp = 0 to num_ctrl_pts { @cp_loop for cp = 0 to num_ctrl_pts {
chan_val += l2_dist[cp] * weights[cp, chan]; chan_val += l2_dist[cp] * weights[cp, chan];
} }
......
...@@ -115,7 +115,7 @@ array-slf(fuse4); ...@@ -115,7 +115,7 @@ array-slf(fuse4);
simpl!(fuse4); simpl!(fuse4);
let par = fuse4@image_loop \ fuse4@channel_loop; let par = fuse4@image_loop \ fuse4@channel_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let fuse4_body = outline(split.cava_3.fj2); let fuse4_body = outline(split.cava_3.fj2);
......
...@@ -117,9 +117,9 @@ fixpoint { ...@@ -117,9 +117,9 @@ fixpoint {
simpl!(fuse4); simpl!(fuse4);
array-slf(fuse4); array-slf(fuse4);
simpl!(fuse4); simpl!(fuse4);
//fork-tile[2, 0, false, true](fuse4@channel_loop); fork-tile[2, 0, false, true](fuse4@channel_loop);
//fork-split(fuse4@channel_loop); let out = fork-split(fuse4@channel_loop);
//clean-monoid-reduces(fuse4); fork-unroll(out.cava_3.fj1);
unforkify(fuse4@channel_loop); unforkify(fuse4@channel_loop);
no-memset(fuse5@res1); no-memset(fuse5@res1);
...@@ -133,6 +133,13 @@ simpl!(fuse5); ...@@ -133,6 +133,13 @@ simpl!(fuse5);
array-slf(fuse5); array-slf(fuse5);
simpl!(fuse5); simpl!(fuse5);
fork-tile[4, 1, false, true](fuse4);
fork-tile[8, 0, false, true](fuse4);
fork-interchange[1, 2](fuse4);
let split = fork-split(fuse4);
fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
fork-coalesce(split.cava_3.fj2);
delete-uncalled(*); delete-uncalled(*);
simpl!(*); simpl!(*);
......
...@@ -124,9 +124,9 @@ pub struct CavaInputs { ...@@ -124,9 +124,9 @@ pub struct CavaInputs {
#[clap(long = "output-verify", value_name = "PATH")] #[clap(long = "output-verify", value_name = "PATH")]
pub output_verify: Option<String>, pub output_verify: Option<String>,
pub cam_model: String, pub cam_model: String,
#[clap(short, long)] #[clap(long)]
pub crop_rows: Option<usize>, pub crop_rows: Option<usize>,
#[clap(short, long)] #[clap(long)]
pub crop_cols: Option<usize>, pub crop_cols: Option<usize>,
} }
......
...@@ -26,14 +26,14 @@ predication(gaussian_smoothing); ...@@ -26,14 +26,14 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing); simpl!(gaussian_smoothing);
let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
simpl!(gaussian_smoothing, gaussian_smoothing_body); simpl!(gaussian_smoothing, gaussian_smoothing_body);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); no-memset(laplacian_estimate@res);
fixpoint { fixpoint {
forkify(laplacian_estimate); forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate); fork-guard-elim(laplacian_estimate);
...@@ -42,15 +42,15 @@ fixpoint { ...@@ -42,15 +42,15 @@ fixpoint {
simpl!(laplacian_estimate); simpl!(laplacian_estimate);
let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2; let body = split._1_laplacian_estimate.fj2;
let laplacian_estimate_body = outline(body); let laplacian_estimate_body = outline(body);
fork-coalesce(laplacian_estimate, laplacian_estimate_body); fork-coalesce(laplacian_estimate, laplacian_estimate_body);
simpl!(laplacian_estimate, laplacian_estimate_body); simpl!(laplacian_estimate, laplacian_estimate_body);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); no-memset(zero_crossings@res);
fixpoint { fixpoint {
forkify(zero_crossings); forkify(zero_crossings);
fork-guard-elim(zero_crossings); fork-guard-elim(zero_crossings);
...@@ -59,10 +59,10 @@ fixpoint { ...@@ -59,10 +59,10 @@ fixpoint {
simpl!(zero_crossings); simpl!(zero_crossings);
let par = zero_crossings@image_loop \ zero_crossings@filter_loop; let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
fork-tile[4, 1, false, false](par); fork-tile[4, 1, false, false](par);
fork-tile[4, 0, false, false](par); fork-tile[8, 0, false, false](par);
fork-interchange[1, 2](par); fork-interchange[1, 2](par);
let split = fork-split(par); let split = fork-split(par);
let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2; let body = split._2_zero_crossings.fj2;
let zero_crossings_body = outline(body); let zero_crossings_body = outline(body);
fork-coalesce(zero_crossings, zero_crossings_body); fork-coalesce(zero_crossings, zero_crossings_body);
simpl!(zero_crossings, zero_crossings_body); simpl!(zero_crossings, zero_crossings_body);
...@@ -86,7 +86,7 @@ fixpoint { ...@@ -86,7 +86,7 @@ fixpoint {
simpl!(max_gradient); simpl!(max_gradient);
fork-dim-merge(max_gradient); fork-dim-merge(max_gradient);
simpl!(max_gradient); simpl!(max_gradient);
fork-tile[16, 0, false, false](max_gradient); fork-tile[32, 0, false, false](max_gradient);
let split = fork-split(max_gradient); let split = fork-split(max_gradient);
clean-monoid-reduces(max_gradient); clean-monoid-reduces(max_gradient);
let out = outline(split._4_max_gradient.fj1); let out = outline(split._4_max_gradient.fj1);
...@@ -105,7 +105,7 @@ fixpoint { ...@@ -105,7 +105,7 @@ fixpoint {
predication(reject_zero_crossings); predication(reject_zero_crossings);
simpl!(reject_zero_crossings); simpl!(reject_zero_crossings);
fork-tile[4, 1, false, false](reject_zero_crossings); fork-tile[4, 1, false, false](reject_zero_crossings);
fork-tile[4, 0, false, false](reject_zero_crossings); fork-tile[8, 0, false, false](reject_zero_crossings);
fork-interchange[1, 2](reject_zero_crossings); fork-interchange[1, 2](reject_zero_crossings);
let split = fork-split(reject_zero_crossings); let split = fork-split(reject_zero_crossings);
let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
......
...@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>( ...@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(
@image_loop for row = 0 to n { @image_loop for row = 0 to n {
for col = 0 to m { for col = 0 to m {
// Copy data for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else input[row + i - r, col + j - r];
}
}
// Compute pixel of dilated image // Compute pixel of dilated image
let dilated = MIN_BR; let dilated = MIN_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
} || row + i - r > n - 1
} || col + j < r
|| col + j - r > m - 1 then MIN_BR
// Data copy for erotion filter else input[row + i - r, col + j - r];
@shr2 let imageArea : f32[sz, sz]; dilated = max!(dilated, filter * structure[i, j]);
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
} }
} }
...@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>( ...@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
let eroded = MAX_BR; let eroded = MAX_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else input[row + i - r, col + j - r];
eroded = min!(eroded, filter * structure[i, j]);
} }
} }
...@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>( ...@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(
@image_loop for row = 0 to n { @image_loop for row = 0 to n {
for col = 0 to m { for col = 0 to m {
// Data copy for dilation filter
@shr1 let imageArea : f32[sz, sz];
@filter_loop for i = 0 to sz {
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MIN_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
}
}
// Compute the pixel of dilated image // Compute the pixel of dilated image
let dilated = MIN_BR; let dilated = MIN_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
dilated = max!(dilated, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
} || row + i - r > n - 1
} || col + j < r
|| col + j - r > m - 1 then MIN_BR
// Data copy for erotion filter else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
@shr2 let imageArea : f32[sz, sz]; else MIN_BR;
@filter_loop for i = 0 to sz { dilated = max!(dilated, filter * structure[i, j]);
for j = 0 to sz {
imageArea[i, j] = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
} }
} }
...@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>( ...@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
let eroded = MAX_BR; let eroded = MAX_BR;
@filter_loop for i = 0 to sz { @filter_loop for i = 0 to sz {
for j = 0 to sz { for j = 0 to sz {
eroded = min!(eroded, imageArea[i, j] * structure[i, j]); let filter = if row + i < r
|| row + i - r > n - 1
|| col + j < r
|| col + j - r > m - 1 then MAX_BR
else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
else MIN_BR;
eroded = min!(eroded, filter * structure[i, j]);
} }
} }
...@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>( ...@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>(
let gx = 0; let gx = 0;
let gy = 0; let gy = 0;
for i = 0 to sb { @filter_loop for i = 0 to sb {
for j = 0 to sb { for j = 0 to sb {
let val = input[if row + i < sbr then 0 let val = input[if row + i < sbr then 0
else if row + i - sbr > n - 1 then n - 1 else if row + i - sbr > n - 1 then n - 1
......
...@@ -26,22 +26,49 @@ predication(gaussian_smoothing); ...@@ -26,22 +26,49 @@ predication(gaussian_smoothing);
simpl!(gaussian_smoothing); simpl!(gaussian_smoothing);
predication(gaussian_smoothing); predication(gaussian_smoothing);
simpl!(gaussian_smoothing); simpl!(gaussian_smoothing);
fork-dim-merge(gaussian_smoothing@filter_loop);
unforkify(gaussian_smoothing@filter_loop);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); fork-dim-merge(gaussian_smoothing);
fork-tile[32, 0, false, true](gaussian_smoothing);
simpl!(gaussian_smoothing);
fork-split(gaussian_smoothing);
simpl!(gaussian_smoothing);
no-memset(laplacian_estimate@res);
fixpoint { fixpoint {
forkify(laplacian_estimate); forkify(laplacian_estimate);
fork-guard-elim(laplacian_estimate); fork-guard-elim(laplacian_estimate);
fork-coalesce(laplacian_estimate); fork-coalesce(laplacian_estimate);
} }
simpl!(laplacian_estimate); simpl!(laplacian_estimate);
fork-dim-merge(laplacian_estimate@filter_loop);
unforkify(laplacian_estimate@filter_loop);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); fork-dim-merge(laplacian_estimate);
fork-tile[32, 0, false, true](laplacian_estimate);
simpl!(laplacian_estimate);
fork-split(laplacian_estimate);
simpl!(laplacian_estimate);
no-memset(zero_crossings@res);
fixpoint { fixpoint {
forkify(zero_crossings); forkify(zero_crossings);
fork-guard-elim(zero_crossings); fork-guard-elim(zero_crossings);
fork-coalesce(zero_crossings); fork-coalesce(zero_crossings);
} }
simpl!(zero_crossings); simpl!(zero_crossings);
fork-dim-merge(zero_crossings@filter_loop);
unforkify(zero_crossings@filter_loop);
simpl!(zero_crossings);
fork-dim-merge(zero_crossings);
fork-tile[32, 0, false, true](zero_crossings);
simpl!(zero_crossings);
fork-split(zero_crossings);
simpl!(zero_crossings);
no-memset(gradient@res); no-memset(gradient@res);
fixpoint { fixpoint {
...@@ -53,6 +80,15 @@ predication(gradient); ...@@ -53,6 +80,15 @@ predication(gradient);
simpl!(gradient); simpl!(gradient);
predication(gradient); predication(gradient);
simpl!(gradient); simpl!(gradient);
fork-dim-merge(gradient@filter_loop);
unforkify(gradient@filter_loop);
simpl!(gradient);
fork-dim-merge(gradient);
fork-tile[32, 0, false, true](gradient);
simpl!(gradient);
fork-split(gradient);
simpl!(gradient);
fixpoint { fixpoint {
forkify(max_gradient); forkify(max_gradient);
...@@ -88,6 +124,12 @@ fixpoint { ...@@ -88,6 +124,12 @@ fixpoint {
predication(reject_zero_crossings); predication(reject_zero_crossings);
simpl!(reject_zero_crossings); simpl!(reject_zero_crossings);
fork-dim-merge(reject_zero_crossings);
fork-tile[32, 0, false, true](reject_zero_crossings);
simpl!(reject_zero_crossings);
fork-split(reject_zero_crossings);
simpl!(reject_zero_crossings);
async-call(edge_detection@le, edge_detection@zc); async-call(edge_detection@le, edge_detection@zc);
simpl!(*); simpl!(*);
......
...@@ -12,6 +12,7 @@ macro simpl!(X) { ...@@ -12,6 +12,7 @@ macro simpl!(X) {
simpl!(*); simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
delete-uncalled(*); delete-uncalled(*);
simpl!(*); simpl!(*);
ip-sroa[true](*); ip-sroa[true](*);
...@@ -24,8 +25,31 @@ fixpoint { ...@@ -24,8 +25,31 @@ fixpoint {
fork-guard-elim(*); fork-guard-elim(*);
} }
simpl!(*); simpl!(*);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); unforkify(compute_flux@inner_loop);
parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._6_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._7_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._8_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(*); unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*); gcm(*);
...@@ -24,7 +24,38 @@ fixpoint { ...@@ -24,7 +24,38 @@ fixpoint {
fork-guard-elim(*); fork-guard-elim(*);
} }
simpl!(*); simpl!(*);
no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
unforkify(compute_flux@inner_loop);
unforkify(*); fork-tile[32, 0, false, false](compute_step_factor);
let split = fork-split(compute_step_factor);
let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
fork-coalesce(compute_step_factor, compute_step_factor_body);
simpl!(compute_step_factor, compute_step_factor_body);
fork-tile[32, 0, false, false](compute_flux_contributions);
let split = fork-split(compute_flux_contributions);
let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
simpl!(compute_flux_contributions, compute_flux_contributions_body);
fork-tile[32, 0, false, false](compute_flux);
let split = fork-split(compute_flux);
let compute_flux_body = outline(split._7_compute_flux.fj1);
fork-coalesce(compute_flux, compute_flux_body);
simpl!(compute_flux, compute_flux_body);
fork-tile[32, 0, false, false](time_step);
let split = fork-split(time_step);
let time_step_body = outline(split._8_time_step.fj1);
fork-coalesce(time_step, time_step_body);
simpl!(time_step, time_step_body);
fork-tile[32, 0, false, false](copy_vars);
let split = fork-split(copy_vars);
let copy_vars_body = outline(split._9_copy_vars.fj1);
fork-coalesce(copy_vars, copy_vars_body);
simpl!(copy_vars, copy_vars_body);
unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
gcm(*); gcm(*);
...@@ -12,6 +12,7 @@ macro simpl!(X) { ...@@ -12,6 +12,7 @@ macro simpl!(X) {
simpl!(*); simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
delete-uncalled(*); delete-uncalled(*);
gpu(copy_vars, compute_step_factor, compute_flux, time_step); gpu(copy_vars, compute_step_factor, compute_flux, time_step);
...@@ -26,9 +27,18 @@ fixpoint { ...@@ -26,9 +27,18 @@ fixpoint {
fork-guard-elim(*); fork-guard-elim(*);
} }
simpl!(*); simpl!(*);
no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); unforkify(compute_flux@inner_loop);
parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
fork-tile[32, 0, false, true](compute_step_factor);
fork-split(compute_step_factor);
fork-tile[32, 0, false, true](compute_flux);
fork-split(compute_flux);
fork-tile[32, 0, false, true](time_step);
fork-split(time_step);
fork-tile[32, 0, false, true](copy_vars);
fork-split(copy_vars);
unforkify(*);
float-collections(*);
gcm(*); gcm(*);
...@@ -12,6 +12,7 @@ macro simpl!(X) { ...@@ -12,6 +12,7 @@ macro simpl!(X) {
simpl!(*); simpl!(*);
inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
delete-uncalled(*); delete-uncalled(*);
gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step); gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step);
...@@ -26,7 +27,21 @@ fixpoint { ...@@ -26,7 +27,21 @@ fixpoint {
fork-guard-elim(*); fork-guard-elim(*);
} }
simpl!(*); simpl!(*);
unforkify(compute_flux@inner_loop);
fork-tile[32, 0, false, true](compute_step_factor);
fork-split(compute_step_factor);
fork-tile[32, 0, false, true](compute_flux_contributions);
fork-split(compute_flux_contributions);
fork-tile[32, 0, false, true](compute_flux);
fork-split(compute_flux);
fork-tile[32, 0, false, true](time_step);
fork-split(time_step);
fork-tile[32, 0, false, true](copy_vars);
fork-split(copy_vars);
unforkify(*);
float-collections(*);
gcm(*); gcm(*);
...@@ -48,8 +48,7 @@ fn run_euler( ...@@ -48,8 +48,7 @@ fn run_euler(
let normals_z = HerculesImmBox::from(normals.z.as_slice()); let normals_z = HerculesImmBox::from(normals.z.as_slice());
let mut runner = runner!(euler); let mut runner = runner!(euler);
let (density, momentum_x, momentum_y, momentum_z, energy) = let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
async_std::task::block_on(async {
runner runner
.run( .run(
nelr as u64, nelr as u64,
...@@ -123,8 +122,7 @@ fn run_pre_euler( ...@@ -123,8 +122,7 @@ fn run_pre_euler(
let normals_z = HerculesImmBox::from(normals.z.as_slice()); let normals_z = HerculesImmBox::from(normals.z.as_slice());
let mut runner = runner!(pre_euler); let mut runner = runner!(pre_euler);
let (density, momentum_x, momentum_y, momentum_z, energy) = let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
async_std::task::block_on(async {
runner runner
.run( .run(
nelr as u64, nelr as u64,
...@@ -189,15 +187,30 @@ fn compare_floats(xs: &Variables, ys: &Variables) -> bool { ...@@ -189,15 +187,30 @@ fn compare_floats(xs: &Variables, ys: &Variables) -> bool {
let ys_energy = ys.energy.as_slice(); let ys_energy = ys.energy.as_slice();
xs_density.len() == ys_density.len() xs_density.len() == ys_density.len()
&& xs_density.iter().zip(ys_density.iter()).all(|(x, y)| compare_float(*x, *y)) && xs_density
.iter()
.zip(ys_density.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_x.len() == ys_momentum_x.len() && xs_momentum_x.len() == ys_momentum_x.len()
&& xs_momentum_x.iter().zip(ys_momentum_x.iter()).all(|(x, y)| compare_float(*x, *y)) && xs_momentum_x
.iter()
.zip(ys_momentum_x.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_y.len() == ys_momentum_y.len() && xs_momentum_y.len() == ys_momentum_y.len()
&& xs_momentum_y.iter().zip(ys_momentum_y.iter()).all(|(x, y)| compare_float(*x, *y)) && xs_momentum_y
.iter()
.zip(ys_momentum_y.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_momentum_z.len() == ys_momentum_z.len() && xs_momentum_z.len() == ys_momentum_z.len()
&& xs_momentum_z.iter().zip(ys_momentum_z.iter()).all(|(x, y)| compare_float(*x, *y)) && xs_momentum_z
.iter()
.zip(ys_momentum_z.iter())
.all(|(x, y)| compare_float(*x, *y))
&& xs_energy.len() == ys_energy.len() && xs_energy.len() == ys_energy.len()
&& xs_energy.iter().zip(ys_energy.iter()).all(|(x, y)| compare_float(*x, *y)) && xs_energy
.iter()
.zip(ys_energy.iter())
.all(|(x, y)| compare_float(*x, *y))
} }
pub fn cfd_harness(args: CFDInputs) { pub fn cfd_harness(args: CFDInputs) {
...@@ -224,6 +237,7 @@ pub fn cfd_harness(args: CFDInputs) { ...@@ -224,6 +237,7 @@ pub fn cfd_harness(args: CFDInputs) {
} = read_domain_geometry(data_file, block_size); } = read_domain_geometry(data_file, block_size);
let variables = initialize_variables(nelr, &ff_variable); let variables = initialize_variables(nelr, &ff_variable);
println!("Running CFD with nelr = {}.", nelr);
let res_juno = if pre_euler { let res_juno = if pre_euler {
run_pre_euler( run_pre_euler(
......
...@@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 { ...@@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
} }
fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] { fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] {
let step_factors : f32[nelr]; @res let step_factors : f32[nelr];
for i in 0..nelr { for i in 0..nelr {
let density = variables.density[i]; let density = variables.density[i];
...@@ -109,10 +109,10 @@ fn compute_flux_contribution( ...@@ -109,10 +109,10 @@ fn compute_flux_contribution(
fn compute_flux_contributions<nelr: usize>( fn compute_flux_contributions<nelr: usize>(
variables: Variables::<nelr>, variables: Variables::<nelr>,
) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) { ) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) {
let fc_momentum_x: Momentum::<nelr>; @res let fc_momentum_x: Momentum::<nelr>;
let fc_momentum_y: Momentum::<nelr>; @res let fc_momentum_y: Momentum::<nelr>;
let fc_momentum_z: Momentum::<nelr>; @res let fc_momentum_z: Momentum::<nelr>;
let fc_density_energy: Momentum::<nelr>; @res let fc_density_energy: Momentum::<nelr>;
for i in 0..nelr { for i in 0..nelr {
let density_i = variables.density[i]; let density_i = variables.density[i];
...@@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>( ...@@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>(
ff_fc_momentum_z: float3, ff_fc_momentum_z: float3,
) -> Variables::<nelr> { ) -> Variables::<nelr> {
const smoothing_coefficient : f32 = 0.2; const smoothing_coefficient : f32 = 0.2;
let fluxes: Variables::<nelr>; @res let fluxes: Variables::<nelr>;
for i in 0..nelr { @outer_loop for i in 0..nelr {
let density_i = variables.density[i]; let density_i = variables.density[i];
let momentum_i = float3 { x: variables.momentum.x[i], let momentum_i = float3 { x: variables.momentum.x[i],
...@@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>( ...@@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>(
let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 }; let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 };
let flux_i_density_energy : f32 = 0.0; let flux_i_density_energy : f32 = 0.0;
for j in 0..NNB { @inner_loop for j in 0..NNB {
let nb = elements_surrounding_elements[j, i]; let nb = elements_surrounding_elements[j, i];
let normal = float3 { let normal = float3 {
x: normals.x[j, i], x: normals.x[j, i],
...@@ -328,7 +328,7 @@ fn time_step<nelr: usize>( ...@@ -328,7 +328,7 @@ fn time_step<nelr: usize>(
} }
fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> { fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> {
let result : Variables::<nelr>; @res let result : Variables::<nelr>;
for i in 0..nelr { for i in 0..nelr {
result.density[i] = variables.density[i]; result.density[i] = variables.density[i];
......
...@@ -54,4 +54,10 @@ ip-sroa(*); ...@@ -54,4 +54,10 @@ ip-sroa(*);
sroa(*); sroa(*);
simpl!(*); simpl!(*);
fork-dim-merge(main_loops);
fork-tile[32, 0, false, true](main_loops);
dce(main_loops);
fork-split(main_loops);
simpl!(main_loops);
gcm(*); gcm(*);
...@@ -1090,7 +1090,9 @@ impl PassManager { ...@@ -1090,7 +1090,9 @@ impl PassManager {
let mut nvcc_process = Command::new("nvcc") let mut nvcc_process = Command::new("nvcc")
.arg("-c") .arg("-c")
.arg("-Xptxas")
.arg("-O3") .arg("-O3")
.arg("-use_fast_math")
.arg("-diag-suppress") .arg("-diag-suppress")
.arg("177") .arg("177")
.arg("-o") .arg("-o")
......