Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • llvm/hercules
1 result
Show changes
Commits on Source (10)
Showing with 200 additions and 51 deletions
...@@ -1515,6 +1515,10 @@ fn fork_fusion( ...@@ -1515,6 +1515,10 @@ fn fork_fusion(
* Looks for would-be monoid reduces, if not for a gate on the reduction. * Looks for would-be monoid reduces, if not for a gate on the reduction.
* Partially predicate the gated reduction to allow for a proper monoid * Partially predicate the gated reduction to allow for a proper monoid
* reduction. * reduction.
*
* Looks for monoid reduces that occur through a gated write to a single
* location, and lifts them to a proper monoid reduction followed by a single
* gated write.
*/ */
pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
for id in editor.node_ids() { for id in editor.node_ids() {
...@@ -1676,6 +1680,121 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { ...@@ -1676,6 +1680,121 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
}); });
} }
} }
// Second pass over every node ID: try to match each reduce node against the
// pattern below and, on a match, rewrite it. Any failed check simply moves on
// to the next node.
for id in editor.node_ids() {
// Identify reduce/write/phi cycle through which a sparse AND reduction
// is occurring.
let nodes = &editor.func().nodes;
// Only reduce nodes are candidates; the tuple is bound here as
// (join, init, reduct).
let Some((join, init, reduct)) = nodes[id.idx()].try_reduce() else {
continue;
};
// Panics if the reduce's first component is not a join node.
// NOTE(review): `join_pred` is presumably the join's control predecessor;
// confirm against `try_join`.
let join_pred = nodes[join.idx()].try_join().unwrap();
// First user of the join that is a control node. Panics if the join has no
// control user.
let join_succ = editor
.get_users(join)
.filter(|id| nodes[id.idx()].is_control())
.next()
.unwrap();
// The reduct must be a phi with exactly two data inputs...
let Some((_, phi_data)) = nodes[reduct.idx()].try_phi() else {
continue;
};
if phi_data.len() != 2 {
continue;
}
// ...one of which is the reduce itself; `phi_other_use` is the other one.
let phi_other_use = if phi_data[0] == id {
phi_data[1]
} else if phi_data[1] == id {
phi_data[0]
} else {
continue;
};
// The other phi input must be a write whose collection is the reduce,
// which closes the reduce -> write -> phi -> reduce cycle.
let Some((collect, data, indices)) = nodes[phi_other_use.idx()].try_write() else {
continue;
};
if collect != id {
continue;
}
// Reject writes where any index reports `is_position()`.
if indices.into_iter().any(|idx| idx.is_position()) {
continue;
}
// The written data must satisfy `is_false`.
// NOTE(review): helper defined elsewhere; presumably checks for a constant
// boolean false. Confirm.
if !is_false(editor, data) {
continue;
}
// `join_pred` must be a region with exactly two predecessors, both of them
// control projections of the same if node.
let Some(preds) = nodes[join_pred.idx()].try_region() else {
continue;
};
if preds.len() != 2 {
continue;
}
let Some((if1, _)) = nodes[preds[0].idx()].try_control_proj() else {
continue;
};
// `sel` is the selection of the region's second predecessor.
let Some((if2, sel)) = nodes[preds[1].idx()].try_control_proj() else {
continue;
};
if if1 != if2 {
continue;
}
// `cond` is mutable because it may be wrapped in a Not node below.
let Some((_, mut cond)) = nodes[if1.idx()].try_if() else {
continue;
};
// Transform to a monoid reduction and a single gated write.
// The condition is negated when the phi input at position `sel` is the write.
let negated = phi_other_use == phi_data[sel];
// Take an owned copy of the indices so the closure below can move it into
// the new write node.
let indices = indices.to_vec().into_boxed_slice();
editor.edit(|mut edit| {
// Constant nodes for boolean true (initial value of the new reduction) and
// boolean false (the value stored by the new write).
let t = edit.add_constant(Constant::Boolean(true));
let t = edit.add_node(Node::Constant { id: t });
let f = edit.add_constant(Constant::Boolean(false));
let f = edit.add_node(Node::Constant { id: f });
if negated {
cond = edit.add_node(Node::Unary {
op: UnaryOperator::Not,
input: cond,
});
}
// The new reduce and the new AND refer to each other, so their IDs are
// predicted up front from the current node count.
// NOTE(review): this assumes `add_node` hands out IDs sequentially starting
// at `num_node_ids()`, and that the reduce and the AND are the next two
// nodes added, in that order. Confirm.
let reduce_id = NodeID::new(edit.num_node_ids());
let and_id = NodeID::new(edit.num_node_ids() + 1);
// New reduction on the same join: starts at true and ANDs in `cond`.
edit.add_node(Node::Reduce {
control: join,
init: t,
reduct: and_id,
});
edit.add_node(Node::Binary {
op: BinaryOperator::And,
left: cond,
right: reduce_id,
});
// A new if node, controlled by the join, branches on the reduced value; its
// two projections are merged again by a new region.
let new_if = edit.add_node(Node::If {
control: join,
cond: reduce_id,
});
let cpj1 = edit.add_node(Node::ControlProjection {
control: new_if,
selection: 0,
});
let cpj2 = edit.add_node(Node::ControlProjection {
control: new_if,
selection: 1,
});
let region = edit.add_node(Node::Region {
preds: Box::new([cpj1, cpj2]),
});
// A single write of false into the original `init` collection at the
// original indices.
let write = edit.add_node(Node::Write {
collect: init,
data: f,
indices,
});
// The phi yields the written collection for the selection-0 predecessor and
// the untouched `init` for the selection-1 predecessor.
// NOTE(review): presumably selection 0 is the false branch, so the write
// takes effect only when the AND reduction is false. Confirm.
let phi = edit.add_node(Node::Phi {
control: region,
data: Box::new([write, init]),
});
// Redirect every use of the old reduce to the new phi, except the uses
// inside the old write and the old phi (the matched cycle itself).
edit = edit.replace_all_uses_where(id, phi, |other_id| {
*other_id != phi_other_use && *other_id != reduct
})?;
// Only the join's chosen control successor is rewired to the new region.
edit.replace_all_uses_where(join, region, |id| *id == join_succ)
});
}
} }
/* /*
...@@ -1741,6 +1860,7 @@ fn extend_fork(editor: &mut FunctionEditor, fork: NodeID, join: NodeID, multiple ...@@ -1741,6 +1860,7 @@ fn extend_fork(editor: &mut FunctionEditor, fork: NodeID, join: NodeID, multiple
control: new_fork, control: new_fork,
dimension: idx, dimension: idx,
}); });
edit.sub_edit(fork, tid);
let old_bound = edit.add_node(Node::DynamicConstant { id: *old_factor }); let old_bound = edit.add_node(Node::DynamicConstant { id: *old_factor });
edit.add_node(Node::Binary { edit.add_node(Node::Binary {
op: BinaryOperator::LT, op: BinaryOperator::LT,
......
...@@ -124,6 +124,8 @@ if !feature("seq") { ...@@ -124,6 +124,8 @@ if !feature("seq") {
fork-coalesce(fuse4, fuse4_body); fork-coalesce(fuse4, fuse4_body);
simpl!(fuse4, fuse4_body); simpl!(fuse4, fuse4_body);
fuse4 = fuse4_body; fuse4 = fuse4_body;
} else {
fork-tile[6, 0, false, true](fuse4@channel_loop);
} }
no-memset(fuse5@res1); no-memset(fuse5@res1);
......
...@@ -87,9 +87,6 @@ fn edge_detection_bench(c: &mut Criterion) { ...@@ -87,9 +87,6 @@ fn edge_detection_bench(c: &mut Criterion) {
r.run( r.run(
height as u64, height as u64,
width as u64, width as u64,
gs as u64,
sz as u64,
sb as u64,
input_h.to(), input_h.to(),
gaussian_filter_h, gaussian_filter_h,
structure_h, structure_h,
......
...@@ -134,7 +134,9 @@ if !feature("seq") { ...@@ -134,7 +134,9 @@ if !feature("seq") {
reject_zero_crossings = reject_zero_crossings_body; reject_zero_crossings = reject_zero_crossings_body;
} }
async-call(edge_detection@le, edge_detection@zc); if !feature("seq") {
async-call(edge_detection@le, edge_detection@zc);
}
fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
......
fn gaussian_smoothing<n, m, gs : usize>( const gs : usize = 7;
const sz : usize = 3;
const sb : usize = 3;
fn gaussian_smoothing<n, m : usize>(
input: f32[n, m], input: f32[n, m],
filter: f32[gs, gs], filter: f32[7, 7],
) -> f32[n, m] { ) -> f32[n, m] {
@res let result : f32[n, m]; @res let result : f32[n, m];
...@@ -13,12 +17,9 @@ fn gaussian_smoothing<n, m, gs : usize>( ...@@ -13,12 +17,9 @@ fn gaussian_smoothing<n, m, gs : usize>(
@filter_loop for i = 0 to gs { @filter_loop for i = 0 to gs {
for j = 0 to gs { for j = 0 to gs {
let val = input[if row + i < gr then 0 let br = min!(max!(row + i, gr) - gr, n - 1);
else if row + i - gr > n - 1 then n - 1 let bc = min!(max!(col + j, gr) - gr, m - 1);
else row + i - gr, let val = input[br, bc];
if col + j < gr then 0
else if col + j - gr > m - 1 then m - 1
else col + j - gr];
smoothed += val * filter[i, j]; smoothed += val * filter[i, j];
} }
} }
...@@ -33,9 +34,9 @@ fn gaussian_smoothing<n, m, gs : usize>( ...@@ -33,9 +34,9 @@ fn gaussian_smoothing<n, m, gs : usize>(
const MIN_BR : f32 = 0; const MIN_BR : f32 = 0;
const MAX_BR : f32 = 1; const MAX_BR : f32 = 1;
fn laplacian_estimate<n, m, sz: usize>( fn laplacian_estimate<n, m : usize>(
input: f32[n, m], input: f32[n, m],
structure: f32[sz, sz], structure: f32[3, 3],
) -> f32[n, m] { ) -> f32[n, m] {
const r = sz / 2; const r = sz / 2;
...@@ -77,9 +78,9 @@ fn laplacian_estimate<n, m, sz: usize>( ...@@ -77,9 +78,9 @@ fn laplacian_estimate<n, m, sz: usize>(
return result; return result;
} }
fn zero_crossings<n, m, sz: usize>( fn zero_crossings<n, m : usize>(
input: f32[n, m], input: f32[n, m],
structure: f32[sz, sz], structure: f32[3, 3],
) -> f32[n, m] { ) -> f32[n, m] {
const r = sz / 2; const r = sz / 2;
...@@ -123,10 +124,10 @@ fn zero_crossings<n, m, sz: usize>( ...@@ -123,10 +124,10 @@ fn zero_crossings<n, m, sz: usize>(
return result; return result;
} }
fn gradient<n, m, sb: usize>( fn gradient<n, m : usize>(
input: f32[n, m], input: f32[n, m],
sx: f32[sb, sb], sx: f32[3, 3],
sy: f32[sb, sb], sy: f32[3, 3],
) -> f32[n, m] { ) -> f32[n, m] {
const sbr = sb / 2; const sbr = sb / 2;
...@@ -191,18 +192,18 @@ fn reject_zero_crossings<n, m: usize>( ...@@ -191,18 +192,18 @@ fn reject_zero_crossings<n, m: usize>(
} }
#[entry] #[entry]
fn edge_detection<n, m, gs, sz, sb: usize>( fn edge_detection<n, m : usize>(
input: f32[n, m], input: f32[n, m],
gaussian_filter: f32[gs, gs], gaussian_filter: f32[gs, gs],
structure: f32[sz, sz], structure: f32[3, 3],
sx: f32[sb, sb], sx: f32[3, 3],
sy: f32[sb, sb], sy: f32[3, 3],
theta: f32, theta: f32,
) -> f32[n, m] { ) -> f32[n, m] {
let smoothed = gaussian_smoothing::<n, m, gs>(input, gaussian_filter); let smoothed = gaussian_smoothing::<n, m>(input, gaussian_filter);
@le let laplacian = laplacian_estimate::<n, m, sz>(smoothed, structure); @le let laplacian = laplacian_estimate::<n, m>(smoothed, structure);
@zc let zcs = zero_crossings::<n, m, sz>(laplacian, structure); @zc let zcs = zero_crossings::<n, m>(laplacian, structure);
let gradient = gradient::<n, m, sb>(smoothed, sx, sy); let gradient = gradient::<n, m>(smoothed, sx, sy);
let maxgrad = max_gradient::<n, m>(gradient); let maxgrad = max_gradient::<n, m>(gradient);
return reject_zero_crossings::<n, m>(zcs, gradient, maxgrad, theta); return reject_zero_crossings::<n, m>(zcs, gradient, maxgrad, theta);
} }
...@@ -194,9 +194,6 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) { ...@@ -194,9 +194,6 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) {
r.run( r.run(
height as u64, height as u64,
width as u64, width as u64,
gs as u64,
sz as u64,
sb as u64,
input_h.to(), input_h.to(),
gaussian_filter_h.to(), gaussian_filter_h.to(),
structure_h.to(), structure_h.to(),
......
...@@ -25,6 +25,10 @@ macro forkify!(X) { ...@@ -25,6 +25,10 @@ macro forkify!(X) {
} }
} }
macro fork-chunk![n](X) {
fork-tile[n, 0, false, false](X);
}
macro fork-tile![n](X) { macro fork-tile![n](X) {
fork-tile[n, 0, false, true](X); fork-tile[n, 0, false, true](X);
} }
...@@ -66,8 +70,8 @@ if feature("cuda") { ...@@ -66,8 +70,8 @@ if feature("cuda") {
// Parallelize by computing output array as 16 chunks // Parallelize by computing output array as 16 chunks
let par = matmul@outer \ matmul@inner; let par = matmul@outer \ matmul@inner;
fork-tile![4](par); fork-chunk![4](par);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par); let (outer, inner, _) = fork-reshape[[0, 2], [1], [3]](par);
parallelize!(outer \ inner); parallelize!(outer \ inner);
let body = outline(inner); let body = outline(inner);
......
...@@ -38,8 +38,8 @@ let forward_input = outline(backprop@forward_input); ...@@ -38,8 +38,8 @@ let forward_input = outline(backprop@forward_input);
let forward_hidden = outline(backprop@forward_hidden); let forward_hidden = outline(backprop@forward_hidden);
if !feature("seq") { if !feature("seq") {
fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop); fork-tile[16, 0, false, false](forward_input@outer_loop \ forward_input@inner_loop);
let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop); let (outer, inner) = fork-reshape[[0], [1]](forward_input@outer_loop \ forward_input@inner_loop);
forward_input = outline(inner); forward_input = outline(inner);
inline(backprop@forward_input); inline(backprop@forward_input);
} }
...@@ -53,8 +53,8 @@ let adjust_hidden = outline(backprop@adjust_hidden); ...@@ -53,8 +53,8 @@ let adjust_hidden = outline(backprop@adjust_hidden);
let adjust_input = outline(backprop@adjust_input); let adjust_input = outline(backprop@adjust_input);
if !feature("seq") { if !feature("seq") {
fork-tile[16, 0, false, true](adjust_input); fork-tile[16, 0, false, false](adjust_input);
let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input); let (outer, inner) = fork-reshape[[0], [1, 2]](adjust_input);
adjust_input = outline(inner); adjust_input = outline(inner);
inline(backprop@adjust_input); inline(backprop@adjust_input);
} }
......
...@@ -41,14 +41,14 @@ parallel-fork(traverse, collect); ...@@ -41,14 +41,14 @@ parallel-fork(traverse, collect);
parallel-reduce(traverse, collect); parallel-reduce(traverse, collect);
if !feature("seq") { if !feature("seq") {
fork-tile[32, 0, false, true](traverse, collect); fork-tile[32, 0, false, false](traverse, collect);
let (outer, inner) = fork-reshape[[1], [0]](traverse); let (outer, inner) = fork-reshape[[0], [1]](traverse);
traverse = outline(inner); traverse = outline(inner);
let (outer, inner) = fork-reshape[[1], [0]](collect); let (outer, inner) = fork-reshape[[0], [1]](collect);
collect = outline(inner); collect = outline(inner);
fork-tile[32, 0, false, true](init); fork-tile[32, 0, false, false](init);
let (outer, inner) = fork-reshape[[1], [0]](init); let (outer, inner) = fork-reshape[[0], [1]](init);
let init_body = outline(inner); let init_body = outline(inner);
inline(bfs@cost_init, bfs@loop1, bfs@loop2); inline(bfs@cost_init, bfs@loop1, bfs@loop2);
...@@ -56,8 +56,14 @@ if !feature("seq") { ...@@ -56,8 +56,14 @@ if !feature("seq") {
} }
delete-uncalled(*); delete-uncalled(*);
const-inline(*); const-inline(*);
clean-monoid-reduces(collect);
simpl!(*); simpl!(*);
fork-tile[8, 0, false, true](init, traverse, collect);
clean-monoid-reduces(collect);
simpl!(*);
fork-split(init, traverse, collect);
unforkify(init, traverse, collect); unforkify(init, traverse, collect);
simpl!(*); simpl!(*);
gcm(*); gcm(*);
\ No newline at end of file
...@@ -15,7 +15,7 @@ let traverse = outline(bfs@loop1); ...@@ -15,7 +15,7 @@ let traverse = outline(bfs@loop1);
let collect = outline(bfs@loop2); let collect = outline(bfs@loop2);
parallel-reduce(traverse, collect); parallel-reduce(traverse, collect);
no-memset(make_stop_prod); no-memset(make_stop_prod);
gpu(traverse, make_stop_prod, collect); gpu(init, traverse, make_stop_prod, collect);
simpl!(*); simpl!(*);
predication(*); predication(*);
...@@ -38,12 +38,8 @@ fixpoint { ...@@ -38,12 +38,8 @@ fixpoint {
} }
simpl!(collect); simpl!(collect);
fork-tile[32, 0, false, true](init); fork-tile[1024, 0, false, true](init, traverse, collect);
let (outer, inner) = fork-reshape[[1], [0]](init); let out = fork-split(init, traverse, collect);
let init_body = outline(inner); simpl!(*);
fork-tile[1024, 0, false, true](traverse, collect);
fork-split(traverse, collect);
unforkify(init_body);
gcm(*); gcm(*);
...@@ -57,5 +57,10 @@ if !feature("seq") { ...@@ -57,5 +57,10 @@ if !feature("seq") {
copy_vars = copy_vars_body; copy_vars = copy_vars_body;
} }
const-inline[false](*);
simpl!(*);
fork-split(compute_step_factor, compute_flux, time_step, copy_vars);
unforkify(compute_step_factor, compute_flux, time_step, copy_vars); unforkify(compute_step_factor, compute_flux, time_step, copy_vars);
simpl!(*);
gcm(*); gcm(*);
...@@ -64,5 +64,10 @@ if !feature("seq") { ...@@ -64,5 +64,10 @@ if !feature("seq") {
copy_vars = copy_vars_body; copy_vars = copy_vars_body;
} }
const-inline[false](*);
simpl!(*);
fork-split(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars);
unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars); unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars);
simpl!(*);
gcm(*); gcm(*);
...@@ -28,14 +28,28 @@ fixpoint { ...@@ -28,14 +28,28 @@ fixpoint {
fork-guard-elim(*); fork-guard-elim(*);
fork-coalesce(*); fork-coalesce(*);
} }
fork-dim-merge(loop1);
simpl!(*); simpl!(*);
fork-interchange[0, 1](loop1);
reduce-slf(*); reduce-slf(*);
simpl!(*); simpl!(*);
slf(*); slf(*);
simpl!(*); simpl!(*);
if !feature("seq") { if !feature("seq") {
fork-tile[32, 0, false, false](loop1);
simpl!(loop1);
let split = fork-split(loop1);
simpl!(loop1);
clean-monoid-reduces(loop1);
let loop1_body = outline(split.srad_0.fj1);
simpl!(loop1, loop1_body);
unforkify(loop1_body);
let fission = fork-fission[split.srad_0.fj0](loop1);
simpl!(loop1, loop1_body);
unforkify(fission.srad_0.fj_bottom);
simpl!(loop1, loop1_body);
loop1 = loop1_body;
fork-tile[32, 0, false, false](loop2); fork-tile[32, 0, false, false](loop2);
let split = fork-split(loop2); let split = fork-split(loop2);
let loop2_body = outline(split.srad_1.fj1); let loop2_body = outline(split.srad_1.fj1);
......