Compare revisions — llvm/hercules
Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (2)
Showing with 209 additions and 125 deletions
......@@ -9,11 +9,16 @@ use crate::*;
* c) no domination by any other fork that's also dominated by F, where we do count self-domination
* Here too we include the non-fork start node, as the key for all controls outside any fork.
*/
pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
pub fn fork_control_map(
fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
) -> HashMap<NodeID, HashSet<NodeID>> {
let mut fork_control_map = HashMap::new();
for (control, forks) in fork_join_nesting {
let fork = forks.first().copied().unwrap_or(NodeID::new(0));
fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control);
fork_control_map
.entry(fork)
.or_insert_with(HashSet::new)
.insert(*control);
}
fork_control_map
}
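
For orientation, a minimal sketch of the mapping this computes (NodeID values are illustrative; the real nesting map comes from the fork-join analysis, and HashMap, NodeID, and fork_control_map are assumed in scope via use crate::*):

// fork_join_nesting maps each control node to the forks containing it, innermost first.
let mut nesting: HashMap<NodeID, Vec<NodeID>> = HashMap::new();
nesting.insert(NodeID::new(2), vec![NodeID::new(2)]); // fork 2 counts as dominating itself
nesting.insert(NodeID::new(5), vec![NodeID::new(2)]); // control 5 sits inside fork 2
nesting.insert(NodeID::new(7), vec![]);               // control 7 lies outside any fork
let map = fork_control_map(&nesting);
assert!(map[&NodeID::new(2)].contains(&NodeID::new(5)));
assert!(map[&NodeID::new(0)].contains(&NodeID::new(7))); // start node keys top-level controls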
......@@ -24,13 +29,19 @@ pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> Has
* c) no domination by any other fork that's also dominated by F, where we don't count self-domination
* Note that the fork_tree also includes the non-fork start node, as the unique root node.
*/
pub fn fork_tree(function: &Function, fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
pub fn fork_tree(
function: &Function,
fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
) -> HashMap<NodeID, HashSet<NodeID>> {
let mut fork_tree = HashMap::new();
for (control, forks) in fork_join_nesting {
if function.nodes[control.idx()].is_fork() {
fork_tree.entry(*control).or_insert_with(HashSet::new);
let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0));
fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(*control);
fork_tree
.entry(nesting_fork)
.or_insert_with(HashSet::new)
.insert(*control);
}
}
fork_tree
......
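
Correspondingly, a worked sketch of fork_tree's shape (again with illustrative NodeIDs), where fork 3 nests directly inside fork 2:

// fork_join_nesting[3] = [3, 2]  // fork 3 first (self-domination), then its parent
// fork_join_nesting[2] = [2]     // fork 2 is top-level
// fork_tree then holds 2 -> {3} and NodeID::new(0) -> {2}: forks.get(1) picks the
// parent fork, and the start node (NodeID 0) serves as the unique root.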
......@@ -1521,6 +1521,8 @@ extern \"C\" {} {}(",
let collect_variable = self.get_value(*collect, false, false);
write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
}
// Undef nodes never need to be assigned to.
Node::Undef { ty: _ } => {}
_ => {
panic!(
"Unsupported data node type: {:?}",
......
......@@ -77,13 +77,16 @@ fn guarded_fork(
};
// Filter out any terms which are just 1s
let non_ones = xs.iter().filter(|i| {
if let DynamicConstant::Constant(1) = editor.get_dynamic_constant(**i).deref() {
false
} else {
true
}
}).collect::<Vec<_>>();
let non_ones = xs
.iter()
.filter(|i| {
if let DynamicConstant::Constant(1) = editor.get_dynamic_constant(**i).deref() {
false
} else {
true
}
})
.collect::<Vec<_>>();
// If we're left with just one term x, we had max { 1, x }
if non_ones.len() == 1 {
Factor::Max(idx, *non_ones[0])
......
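
As a concrete instance of the simplification above: max { 1, x } filters down to non_ones = [x], so the guard factor becomes Factor::Max(idx, x); terms equal to 1 never constrain the maximum.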
......@@ -181,8 +181,13 @@ fn preliminary_fixups(
reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
) -> bool {
let nodes = &editor.func().nodes;
let schedules = &editor.func().schedules;
// Sequentialize non-parallel forks that contain problematic reduce cycles.
for (reduce, cycle) in reduce_cycles {
if cycle.into_iter().any(|id| nodes[id.idx()].is_reduce()) {
if !schedules[reduce.idx()].contains(&Schedule::ParallelReduce)
&& cycle.into_iter().any(|id| nodes[id.idx()].is_reduce())
{
let join = nodes[reduce.idx()].try_reduce().unwrap().0;
let fork = fork_join_map
.into_iter()
......@@ -198,6 +203,31 @@ fn preliminary_fixups(
return true;
}
}
// Get rid of the backward edge on parallel reduces in fork-joins.
for (_, join) in fork_join_map {
let parallel_reduces: Vec<_> = editor
.get_users(*join)
.filter(|id| {
nodes[id.idx()].is_reduce()
&& schedules[id.idx()].contains(&Schedule::ParallelReduce)
})
.collect();
for reduce in parallel_reduces {
if reduce_cycles[&reduce].is_empty() {
continue;
}
let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap();
// Replace uses of the reduce in its cycle with the init.
let success = editor.edit(|edit| {
edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id))
});
assert!(success);
return true;
}
}
false
}
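
A hedged before/after sketch of the second fixup (names illustrative): for a reduce r = reduce(join, init, reduct) carrying the ParallelReduce schedule, every node in r's cycle that reads r is rewritten to read init instead, so the backward edge disappears:

// before:  reduct = op(r, x)     // cycle reads the running reduction
// after:   reduct = op(init, x)  // cycle reads the init; r still collects at the join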
......@@ -511,7 +541,8 @@ fn basic_blocks(
// outside of reduce loops. Nodes that do need to be in a reduce
// loop use the reduce node forming the loop, so the dominator chain
// will consist of one block, and this loop won't ever iterate.
let currently_at_join = function.nodes[location.idx()].is_join();
let currently_at_join = function.nodes[location.idx()].is_join()
&& !function.nodes[control_node.idx()].is_join();
if (!is_constant_or_undef || is_gpu_returned)
&& (shallower_nest || currently_at_join)
......@@ -811,7 +842,14 @@ fn spill_clones(
.into_iter()
.any(|u| *u == *b)
&& (editor.func().nodes[a.idx()].is_phi()
|| editor.func().nodes[a.idx()].is_reduce()))
|| editor.func().nodes[a.idx()].is_reduce())
&& !editor.func().nodes[a.idx()]
.try_reduce()
.map(|(_, init, _)| {
init == *b
&& editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
})
.unwrap_or(false))
});
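// Note on the added clause: a ParallelReduce's init -> reduce edge is exempted from
// spilling, matching the liveness change below where a parallel reduce no longer
// keeps its init live around the loop.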
// Step 3: if there is a spill edge, spill it and return true. Otherwise,
......@@ -989,15 +1027,16 @@ fn liveness_dataflow(
}
let mut num_phis_reduces = vec![0; function.nodes.len()];
let mut has_phi = vec![false; function.nodes.len()];
let mut has_reduce = vec![false; function.nodes.len()];
let mut has_seq_reduce = vec![false; function.nodes.len()];
for (node_idx, bb) in bbs.0.iter().enumerate() {
let node = &function.nodes[node_idx];
if node.is_phi() || node.is_reduce() {
num_phis_reduces[bb.idx()] += 1;
}
has_phi[bb.idx()] = node.is_phi();
has_reduce[bb.idx()] = node.is_reduce();
assert!(!has_phi[bb.idx()] || !has_reduce[bb.idx()]);
has_seq_reduce[bb.idx()] =
node.is_reduce() && !function.schedules[node_idx].contains(&Schedule::ParallelReduce);
assert!(!node.is_phi() || !node.is_reduce());
}
let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty();
......@@ -1009,11 +1048,14 @@ fn liveness_dataflow(
let last_pt = bbs.1[bb.idx()].len();
let old_value = &liveness[&bb][last_pt];
let mut new_value = BTreeSet::new();
for succ in control_subgraph.succs(*bb).chain(if has_reduce[bb.idx()] {
Either::Left(once(*bb))
} else {
Either::Right(empty())
}) {
for succ in control_subgraph
.succs(*bb)
.chain(if has_seq_reduce[bb.idx()] {
Either::Left(once(*bb))
} else {
Either::Right(empty())
})
{
// The liveness at the bottom of a basic block is the union of:
// 1. The liveness of each successor right after its phis and
// reduces.
......@@ -1041,7 +1083,9 @@ fn liveness_dataflow(
assert_eq!(control, succ);
if succ == *bb {
new_value.insert(reduct);
} else {
} else if !function.schedules[id.idx()]
.contains(&Schedule::ParallelReduce)
{
new_value.insert(init);
}
}
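
In dataflow terms, the meet implemented above is, as a sketch:

live_bot(b) = U over s in succ(b) ∪ self(b) of live_after_phis(s)

where self(b) is {b} exactly when b contains a sequential reduce (the implicit back edge), and where a reduce at s contributes its reduct when s = b and, unless scheduled ParallelReduce, its init when s != b.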
......@@ -1058,6 +1102,7 @@ fn liveness_dataflow(
let mut new_value = liveness[&bb][pt + 1].clone();
let id = bbs.1[bb.idx()][pt];
let uses = get_uses(&function.nodes[id.idx()]);
let is_obj = |id: &NodeID| is_obj(*id);
new_value.remove(&id);
new_value.extend(
if let Node::Write {
......@@ -1070,14 +1115,19 @@ fn liveness_dataflow(
// If this write is a cloning write, the `collect` input
// isn't actually live, because its value doesn't
// matter.
Either::Left(once(data).filter(|id| is_obj(*id)))
Either::Left(once(data).filter(is_obj))
} else if let Node::Reduce {
control: _,
init: _,
reduct,
} = function.nodes[id.idx()]
&& function.schedules[id.idx()].contains(&Schedule::ParallelReduce)
{
// If this reduce is a parallel reduce, the `init` input
// isn't actually live.
Either::Left(once(reduct).filter(is_obj))
} else {
Either::Right(
uses.as_ref()
.into_iter()
.map(|id| *id)
.filter(|id| is_obj(*id)),
)
Either::Right(uses.as_ref().into_iter().map(|id| *id).filter(is_obj))
},
);
changed |= *old_value != new_value;
......
......@@ -4,7 +4,11 @@ fn main() {
JunoCompiler::new()
.ir_in_src("dot.hir")
.unwrap()
.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
.schedule_in_src(if cfg!(feature = "cuda") {
"gpu.sch"
} else {
"cpu.sch"
})
.unwrap()
.build()
.unwrap();
......
#![feature(concat_idents)]
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
juno_build::juno!("dot");
......
......@@ -4,7 +4,11 @@ fn main() {
JunoCompiler::new()
.ir_in_src("matmul.hir")
.unwrap()
.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
.schedule_in_src(if cfg!(feature = "cuda") {
"gpu.sch"
} else {
"cpu.sch"
})
.unwrap()
.build()
.unwrap();
......
......@@ -2,9 +2,9 @@
use rand::random;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
juno_build::juno!("matmul");
......@@ -36,7 +36,9 @@ fn main() {
let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
let mut r = runner!(matmul);
let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
let c = r
.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
.await;
let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
c.to_cpu_ref(&mut c_cpu);
assert_eq!(&*c_cpu, &*correct_c);
......
......@@ -69,18 +69,18 @@ pub fn dyn_const_value(
match dc {
DynamicConstant::Constant(v) => *v,
DynamicConstant::Parameter(v) => dyn_const_params[*v],
DynamicConstant::Add(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(0, |s, v| s + v)
}
DynamicConstant::Add(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(0, |s, v| s + v),
DynamicConstant::Sub(a, b) => {
dyn_const_value(a, dyn_const_values, dyn_const_params)
- dyn_const_value(b, dyn_const_values, dyn_const_params)
}
DynamicConstant::Mul(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(1, |p, v| p * v)
}
DynamicConstant::Mul(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(1, |p, v| p * v),
DynamicConstant::Div(a, b) => {
dyn_const_value(a, dyn_const_values, dyn_const_params)
/ dyn_const_value(b, dyn_const_values, dyn_const_params)
......@@ -89,28 +89,28 @@ pub fn dyn_const_value(
dyn_const_value(a, dyn_const_values, dyn_const_params)
% dyn_const_value(b, dyn_const_values, dyn_const_params)
}
DynamicConstant::Max(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(max(m, v))
} else {
Some(v)
}
})
.unwrap()
}
DynamicConstant::Min(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(min(m, v))
} else {
Some(v)
}
})
.unwrap()
}
DynamicConstant::Max(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(max(m, v))
} else {
Some(v)
}
})
.unwrap(),
DynamicConstant::Min(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(min(m, v))
} else {
Some(v)
}
})
.unwrap(),
}
}
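
A worked sketch of the evaluator on small inputs, with dyn_const_params = [4], i.e. one parameter bound to 4:

// Add [ Constant(3), Parameter(0) ]  ==> 3 + 4 = 7    (fold starts at 0)
// Mul [ Constant(2), Constant(5) ]   ==> 2 * 5 = 10   (fold starts at 1)
// Max [ Constant(7), Constant(9) ]   ==> 9            (fold over Option, then unwrap)
// Min [] would panic on the unwrap, so empty Min/Max lists are assumed not to occur.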
......
......@@ -35,9 +35,7 @@ fn alternate_bounds_use_after_loop_no_tid() {
println!("result: {:?}", result_1);
let schedule = default_schedule![
Forkify,
];
let schedule = default_schedule![Forkify,];
let module = run_schedule_on_hercules(module, Some(schedule)).unwrap();
......@@ -61,9 +59,7 @@ fn alternate_bounds_use_after_loop() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
Forkify,
]);
let schedule = Some(default_schedule![Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -108,10 +104,7 @@ fn do_while_separate_body() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
PhiElim,
Forkify,
]);
let schedule = Some(default_schedule![PhiElim, Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -131,10 +124,7 @@ fn alternate_bounds_internal_control() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
PhiElim,
Forkify,
]);
let schedule = Some(default_schedule![PhiElim, Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -155,10 +145,7 @@ fn alternate_bounds_internal_control2() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
PhiElim,
Forkify,
]);
let schedule = Some(default_schedule![PhiElim, Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -366,16 +353,13 @@ fn look_at_local() {
"/home/xavierrouth/dev/hercules/hercules_test/hercules_tests/save_me.hbin",
);
let schedule = Some(default_schedule![
]);
let schedule = Some(default_schedule![]);
let result_1 = interp_module!(module, 0, dyn_consts, a.clone(), b.clone());
let module = run_schedule_on_hercules(module.clone(), schedule).unwrap();
let schedule = Some(default_schedule![
Unforkify, Verify,
]);
let schedule = Some(default_schedule![Unforkify, Verify,]);
let module = run_schedule_on_hercules(module.clone(), schedule).unwrap();
......
......@@ -752,7 +752,16 @@ fn analyze_program(
}
arg_info.push((ty, inout.is_some(), var));
match process_irrefutable_pattern(pattern, false, var, ty, lexer, &mut stringtab, &mut env, &mut types) {
match process_irrefutable_pattern(
pattern,
false,
var,
ty,
lexer,
&mut stringtab,
&mut env,
&mut types,
) {
Ok(prep) => {
stmts.extend(prep);
}
......
......@@ -8,9 +8,9 @@ use self::camera_model::*;
use self::cava_rust::CHAN;
use self::image_proc::*;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
use image::ImageError;
......@@ -31,7 +31,6 @@ fn run_cava(
coefs: &[f32],
tonemap: &[f32],
) -> Box<[u8]> {
assert_eq!(image.len(), CHAN * rows * cols);
assert_eq!(tstw.len(), CHAN * CHAN);
assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN);
......@@ -47,21 +46,24 @@ fn run_cava(
let weights = HerculesCPURef::from_slice(weights);
let coefs = HerculesCPURef::from_slice(coefs);
let tonemap = HerculesCPURef::from_slice(tonemap);
let mut r = runner!(cava);
async_std::task::block_on(async {
r.run(
rows as u64,
cols as u64,
num_ctrl_pts as u64,
image,
tstw,
ctrl_pts,
weights,
coefs,
tonemap,
)
.await
}).as_slice::<u8>().to_vec().into_boxed_slice()
let mut r = runner!(cava);
async_std::task::block_on(async {
r.run(
rows as u64,
cols as u64,
num_ctrl_pts as u64,
image,
tstw,
ctrl_pts,
weights,
coefs,
tonemap,
)
.await
})
.as_slice::<u8>()
.to_vec()
.into_boxed_slice()
}
#[cfg(feature = "cuda")]
......@@ -72,8 +74,8 @@ fn run_cava(
let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
let mut r = runner!(cava);
let res = async_std::task::block_on(async {
let mut r = runner!(cava);
let res = async_std::task::block_on(async {
r.run(
rows as u64,
cols as u64,
......@@ -86,7 +88,7 @@ fn run_cava(
tonemap.get_ref(),
)
.await
});
});
let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() };
let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
res.to_cpu_ref(&mut res_cpu);
......@@ -204,7 +206,8 @@ fn cava_harness(args: CavaInputs) {
.expect("Error saving verification image");
}
let max_diff = result.iter()
let max_diff = result
.iter()
.zip(cpu_result.iter())
.map(|(a, b)| (*a as i16 - *b as i16).abs())
.max()
......
#![feature(concat_idents)]
use hercules_rt::runner;
use hercules_rt::HerculesCPURef;
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::HerculesCPURef;
juno_build::juno!("concat");
......@@ -20,7 +20,7 @@ fn main() {
assert_eq!(output, 42);
const N: usize = 3;
let arr : Box<[i32]> = (2..=4).collect();
let arr: Box<[i32]> = (2..=4).collect();
let arr = HerculesCPURef::from_slice(&arr);
let mut r = runner!(concat_switch);
......
......@@ -2,9 +2,9 @@
mod edge_detection_rust;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
use std::slice::from_raw_parts;
......@@ -228,9 +228,9 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
});
#[cfg(not(feature = "cuda"))]
let result : Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
#[cfg(feature = "cuda")]
let result : Box<[f32]> = {
let result: Box<[f32]> = {
let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
result.to_cpu_ref(&mut res_cpu);
......@@ -261,7 +261,10 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
theta,
);
assert_eq!(result.as_ref(), <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result));
assert_eq!(
result.as_ref(),
<Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result)
);
println!("Frames {} match", i);
if display_verify {
......
......@@ -62,7 +62,7 @@ fn test4(input : i32) -> i32[4, 4] {
#[entry]
fn test5(input : i32) -> i32[4] {
let arr1 : i32[4];
@cons let arr1 : i32[4];
for i = 0 to 4 {
let red = arr1[i];
for k = 0 to 3 {
......
no-memset(test5@cons);
parallel-reduce(test5@reduce);
gvn(*);
......
......@@ -46,6 +46,6 @@ fn main() {
}
#[test]
fn implicit_clone_test() {
fn fork_join_test() {
main();
}
......@@ -2,9 +2,9 @@
use rand::random;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
juno_build::juno!("matmul");
......@@ -28,10 +28,14 @@ fn main() {
let a = HerculesCPURef::from_slice(&a);
let b = HerculesCPURef::from_slice(&b);
let mut r = runner!(matmul);
let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
let c = r
.run(I as u64, J as u64, K as u64, a.clone(), b.clone())
.await;
assert_eq!(c.as_slice::<i32>(), &*correct_c);
let mut r = runner!(tiled_64_matmul);
let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
let tiled_c = r
.run(I as u64, J as u64, K as u64, a.clone(), b.clone())
.await;
assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
}
#[cfg(feature = "cuda")]
......@@ -39,12 +43,16 @@ fn main() {
let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
let mut r = runner!(matmul);
let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
let c = r
.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
.await;
let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
c.to_cpu_ref(&mut c_cpu);
assert_eq!(&*c_cpu, &*correct_c);
let mut r = runner!(tiled_64_matmul);
let tiled_c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
let tiled_c = r
.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
.await;
let mut tiled_c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
tiled_c.to_cpu_ref(&mut tiled_c_cpu);
assert_eq!(&*tiled_c_cpu, &*correct_c);
......
#![feature(concat_idents)]
use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
juno_build::juno!("nested_ccp");
......
#![feature(concat_idents)]
use hercules_rt::{runner};
use hercules_rt::runner;
juno_build::juno!("patterns");
......