Compare revisions

Target project: llvm/hercules
Commits on Source (10)
Showing 200 additions and 51 deletions
......@@ -1515,6 +1515,10 @@ fn fork_fusion(
* Looks for reductions that would be monoid reduces if not for a gate on the
* reduction. Partially predicates the gated reduction to allow for a proper
* monoid reduction.
*
* Also looks for monoid reduces that occur through a gated write to a single
* location, and lifts them to a proper monoid reduction followed by a single
* gated write.
*/
pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
for id in editor.node_ids() {
......@@ -1676,6 +1680,121 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
});
}
}
for id in editor.node_ids() {
// Identify reduce/write/phi cycle through which a sparse AND reduction
// is occurring.
let nodes = &editor.func().nodes;
let Some((join, init, reduct)) = nodes[id.idx()].try_reduce() else {
continue;
};
let join_pred = nodes[join.idx()].try_join().unwrap();
let join_succ = editor
.get_users(join)
.find(|id| nodes[id.idx()].is_control())
.unwrap();
let Some((_, phi_data)) = nodes[reduct.idx()].try_phi() else {
continue;
};
if phi_data.len() != 2 {
continue;
}
let phi_other_use = if phi_data[0] == id {
phi_data[1]
} else if phi_data[1] == id {
phi_data[0]
} else {
continue;
};
let Some((collect, data, indices)) = nodes[phi_other_use.idx()].try_write() else {
continue;
};
if collect != id {
continue;
}
if indices.into_iter().any(|idx| idx.is_position()) {
continue;
}
if !is_false(editor, data) {
continue;
}
let Some(preds) = nodes[join_pred.idx()].try_region() else {
continue;
};
if preds.len() != 2 {
continue;
}
let Some((if1, _)) = nodes[preds[0].idx()].try_control_proj() else {
continue;
};
let Some((if2, sel)) = nodes[preds[1].idx()].try_control_proj() else {
continue;
};
if if1 != if2 {
continue;
}
let Some((_, mut cond)) = nodes[if1.idx()].try_if() else {
continue;
};
// Transform to a monoid reduction and a single gated write.
let negated = phi_other_use == phi_data[sel];
let indices = indices.to_vec().into_boxed_slice();
editor.edit(|mut edit| {
let t = edit.add_constant(Constant::Boolean(true));
let t = edit.add_node(Node::Constant { id: t });
let f = edit.add_constant(Constant::Boolean(false));
let f = edit.add_node(Node::Constant { id: f });
if negated {
cond = edit.add_node(Node::Unary {
op: UnaryOperator::Not,
input: cond,
});
}
// Reserve the next two node IDs up front so the Reduce and the And nodes
// can reference each other and close the reduction cycle.
let reduce_id = NodeID::new(edit.num_node_ids());
let and_id = NodeID::new(edit.num_node_ids() + 1);
edit.add_node(Node::Reduce {
control: join,
init: t,
reduct: and_id,
});
edit.add_node(Node::Binary {
op: BinaryOperator::And,
left: cond,
right: reduce_id,
});
let new_if = edit.add_node(Node::If {
control: join,
cond: reduce_id,
});
let cpj1 = edit.add_node(Node::ControlProjection {
control: new_if,
selection: 0,
});
let cpj2 = edit.add_node(Node::ControlProjection {
control: new_if,
selection: 1,
});
let region = edit.add_node(Node::Region {
preds: Box::new([cpj1, cpj2]),
});
let write = edit.add_node(Node::Write {
collect: init,
data: f,
indices,
});
let phi = edit.add_node(Node::Phi {
control: region,
data: Box::new([write, init]),
});
edit = edit.replace_all_uses_where(id, phi, |other_id| {
*other_id != phi_other_use && *other_id != reduct
})?;
edit.replace_all_uses_where(join, region, |id| *id == join_succ)
});
}
}
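In source terms, the pattern handled by the new loop above is a reduction loop that conditionally writes false into a single non-positional location; the rewrite accumulates the gate through a boolean AND monoid and performs one gated write after the join. A minimal sketch of the before/after semantics, in plain Rust with illustrative names (stop, updated) that do not appear in the compiler:

// Before: the gated write inside the loop prevents treating this as a
// monoid reduce.
fn sparse_and_before(stop: &mut bool, updated: &[bool]) {
    for i in 0..updated.len() {
        if updated[i] {
            *stop = false; // gated write into a single location
        }
    }
}

// After: a proper AND monoid reduction over the negated gate, followed by a
// single gated write, mirroring the Reduce/Binary/If/Write nodes built above.
fn sparse_and_after(stop: &mut bool, updated: &[bool]) {
    let mut acc = true; // boolean AND identity
    for i in 0..updated.len() {
        acc = acc && !updated[i]; // monoid reduction
    }
    if !acc {
        *stop = false; // one gated write after the loop
    }
}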
/*
......@@ -1741,6 +1860,7 @@ fn extend_fork(editor: &mut FunctionEditor, fork: NodeID, join: NodeID, multiple
control: new_fork,
dimension: idx,
});
edit.sub_edit(fork, tid);
let old_bound = edit.add_node(Node::DynamicConstant { id: *old_factor });
edit.add_node(Node::Binary {
op: BinaryOperator::LT,
......
......@@ -124,6 +124,8 @@ if !feature("seq") {
fork-coalesce(fuse4, fuse4_body);
simpl!(fuse4, fuse4_body);
fuse4 = fuse4_body;
} else {
fork-tile[6, 0, false, true](fuse4@channel_loop);
}
no-memset(fuse5@res1);
......
......@@ -87,9 +87,6 @@ fn edge_detection_bench(c: &mut Criterion) {
r.run(
height as u64,
width as u64,
gs as u64,
sz as u64,
sb as u64,
input_h.to(),
gaussian_filter_h,
structure_h,
......
......@@ -134,7 +134,9 @@ if !feature("seq") {
reject_zero_crossings = reject_zero_crossings_body;
}
async-call(edge_detection@le, edge_detection@zc);
if !feature("seq") {
async-call(edge_detection@le, edge_detection@zc);
}
fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
......
fn gaussian_smoothing<n, m, gs : usize>(
const gs : usize = 7;
const sz : usize = 3;
const sb : usize = 3;
fn gaussian_smoothing<n, m : usize>(
input: f32[n, m],
filter: f32[gs, gs],
filter: f32[7, 7],
) -> f32[n, m] {
@res let result : f32[n, m];
......@@ -13,12 +17,9 @@ fn gaussian_smoothing<n, m, gs : usize>(
@filter_loop for i = 0 to gs {
for j = 0 to gs {
let val = input[if row + i < gr then 0
else if row + i - gr > n - 1 then n - 1
else row + i - gr,
if col + j < gr then 0
else if col + j - gr > m - 1 then m - 1
else col + j - gr];
let br = min!(max!(row + i, gr) - gr, n - 1);
let bc = min!(max!(col + j, gr) - gr, m - 1);
let val = input[br, bc];
smoothed += val * filter[i, j];
}
}
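The rewritten bounds clamp relies on the fact that, for unsigned indices, max!(x, gr) - gr is 0 when x < gr and x - gr otherwise, and the outer min! with n - 1 clamps the upper side, so it computes the same index as the old nested conditional. A small Rust sketch of the equivalence (gr stands for the filter radius, as in the surrounding code):

// Old form: branchy clamp of a shifted index into [0, n - 1].
fn clamp_branchy(x: usize, gr: usize, n: usize) -> usize {
    if x < gr {
        0
    } else if x - gr > n - 1 {
        n - 1
    } else {
        x - gr
    }
}

// New form: the same clamp written with max/min, as in the Juno rewrite.
fn clamp_minmax(x: usize, gr: usize, n: usize) -> usize {
    (x.max(gr) - gr).min(n - 1)
}

For example, with gr = 3 and n = 10, both forms map x = 1 to 0, x = 5 to 2, and x = 15 to 9.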
......@@ -33,9 +34,9 @@ fn gaussian_smoothing<n, m, gs : usize>(
const MIN_BR : f32 = 0;
const MAX_BR : f32 = 1;
fn laplacian_estimate<n, m, sz: usize>(
fn laplacian_estimate<n, m : usize>(
input: f32[n, m],
structure: f32[sz, sz],
structure: f32[3, 3],
) -> f32[n, m] {
const r = sz / 2;
......@@ -77,9 +78,9 @@ fn laplacian_estimate<n, m, sz: usize>(
return result;
}
fn zero_crossings<n, m, sz: usize>(
fn zero_crossings<n, m : usize>(
input: f32[n, m],
structure: f32[sz, sz],
structure: f32[3, 3],
) -> f32[n, m] {
const r = sz / 2;
......@@ -123,10 +124,10 @@ fn zero_crossings<n, m, sz: usize>(
return result;
}
fn gradient<n, m, sb: usize>(
fn gradient<n, m : usize>(
input: f32[n, m],
sx: f32[sb, sb],
sy: f32[sb, sb],
sx: f32[3, 3],
sy: f32[3, 3],
) -> f32[n, m] {
const sbr = sb / 2;
......@@ -191,18 +192,18 @@ fn reject_zero_crossings<n, m: usize>(
}
#[entry]
fn edge_detection<n, m, gs, sz, sb: usize>(
fn edge_detection<n, m : usize>(
input: f32[n, m],
gaussian_filter: f32[gs, gs],
structure: f32[sz, sz],
sx: f32[sb, sb],
sy: f32[sb, sb],
structure: f32[3, 3],
sx: f32[3, 3],
sy: f32[3, 3],
theta: f32,
) -> f32[n, m] {
let smoothed = gaussian_smoothing::<n, m, gs>(input, gaussian_filter);
@le let laplacian = laplacian_estimate::<n, m, sz>(smoothed, structure);
@zc let zcs = zero_crossings::<n, m, sz>(laplacian, structure);
let gradient = gradient::<n, m, sb>(smoothed, sx, sy);
let smoothed = gaussian_smoothing::<n, m>(input, gaussian_filter);
@le let laplacian = laplacian_estimate::<n, m>(smoothed, structure);
@zc let zcs = zero_crossings::<n, m>(laplacian, structure);
let gradient = gradient::<n, m>(smoothed, sx, sy);
let maxgrad = max_gradient::<n, m>(gradient);
return reject_zero_crossings::<n, m>(zcs, gradient, maxgrad, theta);
}
......@@ -194,9 +194,6 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) {
r.run(
height as u64,
width as u64,
gs as u64,
sz as u64,
sb as u64,
input_h.to(),
gaussian_filter_h.to(),
structure_h.to(),
......
......@@ -25,6 +25,10 @@ macro forkify!(X) {
}
}
macro fork-chunk![n](X) {
fork-tile[n, 0, false, false](X);
}
macro fork-tile![n](X) {
fork-tile[n, 0, false, true](X);
}
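fork-chunk![n] and fork-tile![n] differ only in the final boolean handed to fork-tile; together with the fork-reshape index swaps later in this revision ([[1], [0]] becoming [[0], [1]]), that flag appears to control where the newly created tile dimension lands relative to the quotient dimension. As an illustration only (which shape the flag selects is an assumption, not confirmed by this diff), the two loop-nest orders a factor-n split can produce look like this in plain Rust:

// Splitting a trip count of len into n pieces can place either the quotient
// (len / n) dimension or the size-n tile dimension outermost; which one the
// last fork-tile flag picks is assumed here, not confirmed.
fn quotient_outer(len: usize, n: usize, mut body: impl FnMut(usize)) {
    for chunk in 0..len / n {
        for off in 0..n {
            body(chunk * n + off); // contiguous chunks
        }
    }
}

fn tile_outer(len: usize, n: usize, mut body: impl FnMut(usize)) {
    for off in 0..n {
        for chunk in 0..len / n {
            body(chunk * n + off); // strided across chunks
        }
    }
}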
......@@ -66,8 +70,8 @@ if feature("cuda") {
// Parallelize by computing output array as 16 chunks
let par = matmul@outer \ matmul@inner;
fork-tile![4](par);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
fork-chunk![4](par);
let (outer, inner, _) = fork-reshape[[0, 2], [1], [3]](par);
parallelize!(outer \ inner);
let body = outline(inner);
......
......@@ -38,8 +38,8 @@ let forward_input = outline(backprop@forward_input);
let forward_hidden = outline(backprop@forward_hidden);
if !feature("seq") {
fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop);
let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop);
fork-tile[16, 0, false, false](forward_input@outer_loop \ forward_input@inner_loop);
let (outer, inner) = fork-reshape[[0], [1]](forward_input@outer_loop \ forward_input@inner_loop);
forward_input = outline(inner);
inline(backprop@forward_input);
}
......@@ -53,8 +53,8 @@ let adjust_hidden = outline(backprop@adjust_hidden);
let adjust_input = outline(backprop@adjust_input);
if !feature("seq") {
fork-tile[16, 0, false, true](adjust_input);
let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input);
fork-tile[16, 0, false, false](adjust_input);
let (outer, inner) = fork-reshape[[0], [1, 2]](adjust_input);
adjust_input = outline(inner);
inline(backprop@adjust_input);
}
......
......@@ -41,14 +41,14 @@ parallel-fork(traverse, collect);
parallel-reduce(traverse, collect);
if !feature("seq") {
fork-tile[32, 0, false, true](traverse, collect);
let (outer, inner) = fork-reshape[[1], [0]](traverse);
fork-tile[32, 0, false, false](traverse, collect);
let (outer, inner) = fork-reshape[[0], [1]](traverse);
traverse = outline(inner);
let (outer, inner) = fork-reshape[[1], [0]](collect);
let (outer, inner) = fork-reshape[[0], [1]](collect);
collect = outline(inner);
fork-tile[32, 0, false, true](init);
let (outer, inner) = fork-reshape[[1], [0]](init);
fork-tile[32, 0, false, false](init);
let (outer, inner) = fork-reshape[[0], [1]](init);
let init_body = outline(inner);
inline(bfs@cost_init, bfs@loop1, bfs@loop2);
......@@ -56,8 +56,14 @@ if !feature("seq") {
}
delete-uncalled(*);
const-inline(*);
clean-monoid-reduces(collect);
simpl!(*);
fork-tile[8, 0, false, true](init, traverse, collect);
clean-monoid-reduces(collect);
simpl!(*);
fork-split(init, traverse, collect);
unforkify(init, traverse, collect);
simpl!(*);
gcm(*);
\ No newline at end of file
gcm(*);
......@@ -15,7 +15,7 @@ let traverse = outline(bfs@loop1);
let collect = outline(bfs@loop2);
parallel-reduce(traverse, collect);
no-memset(make_stop_prod);
gpu(traverse, make_stop_prod, collect);
gpu(init, traverse, make_stop_prod, collect);
simpl!(*);
predication(*);
......@@ -38,12 +38,8 @@ fixpoint {
}
simpl!(collect);
fork-tile[32, 0, false, true](init);
let (outer, inner) = fork-reshape[[1], [0]](init);
let init_body = outline(inner);
fork-tile[1024, 0, false, true](traverse, collect);
fork-split(traverse, collect);
fork-tile[1024, 0, false, true](init, traverse, collect);
let out = fork-split(init, traverse, collect);
simpl!(*);
unforkify(init_body);
gcm(*);
......@@ -57,5 +57,10 @@ if !feature("seq") {
copy_vars = copy_vars_body;
}
const-inline[false](*);
simpl!(*);
fork-split(compute_step_factor, compute_flux, time_step, copy_vars);
unforkify(compute_step_factor, compute_flux, time_step, copy_vars);
simpl!(*);
gcm(*);
......@@ -64,5 +64,10 @@ if !feature("seq") {
copy_vars = copy_vars_body;
}
const-inline[false](*);
simpl!(*);
fork-split(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars);
unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars);
simpl!(*);
gcm(*);
......@@ -28,14 +28,28 @@ fixpoint {
fork-guard-elim(*);
fork-coalesce(*);
}
fork-dim-merge(loop1);
simpl!(*);
fork-interchange[0, 1](loop1);
reduce-slf(*);
simpl!(*);
slf(*);
simpl!(*);
if !feature("seq") {
fork-tile[32, 0, false, false](loop1);
simpl!(loop1);
let split = fork-split(loop1);
simpl!(loop1);
clean-monoid-reduces(loop1);
let loop1_body = outline(split.srad_0.fj1);
simpl!(loop1, loop1_body);
unforkify(loop1_body);
let fission = fork-fission[split.srad_0.fj0](loop1);
simpl!(loop1, loop1_body);
unforkify(fission.srad_0.fj_bottom);
simpl!(loop1, loop1_body);
loop1 = loop1_body;
fork-tile[32, 0, false, false](loop2);
let split = fork-split(loop2);
let loop2_body = outline(split.srad_1.fj1);
......