GPU backend

Merged prathi3 requested to merge gpu-cg into main
1 file, +43 −48
@@ -47,12 +47,17 @@ pub fn gpu_codegen<W: Write>(
.map(NodeID::new)
.collect();
// The fork-reduce map should contain every reduce under some fork's key
let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new();
// The reduct-reduce map should contain all non-parallel, non-associative
// reduces under some key. Unlike the fork map, the reduct map is not involved
// in any assertions; it lives here for convenience but could be moved.
let reduct_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new();
for reduce_node in &reduce_nodes {
if let Node::Reduce {
control,
init: _,
reduct: _,
reduct,
} = &function.nodes[reduce_node.idx()]
{
match function.nodes[control.idx()] {
@@ -71,6 +76,13 @@ pub fn gpu_codegen<W: Write>(
panic!("Reduce's control must be a join or region node");
}
}
if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce)
&& !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) {
reduct_reduce_map
.entry(*reduct)
.or_default()
.push(*reduce_node);
}
}
}
for idx in 0..function.nodes.len() {
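For illustration, a minimal runnable sketch of the entry/or_default pattern used to build reduct_reduce_map above (NodeID here is a simplified stand-in for the project's type, and the (reduce, reduct) pairs are hypothetical):

use std::collections::HashMap;

// Simplified stand-in for the project's NodeID type.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct NodeID(usize);

fn main() {
    // Hypothetical (reduce, reduct) pairs: each reduce is keyed by its reduct.
    let reduces = [(NodeID(5), NodeID(9)), (NodeID(6), NodeID(9))];
    let mut reduct_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new();
    for (reduce, reduct) in reduces {
        // entry().or_default() creates an empty Vec the first time a key is
        // seen, then appends; multiple reduces can share one reduct.
        reduct_reduce_map.entry(reduct).or_default().push(reduce);
    }
    assert_eq!(reduct_reduce_map[&NodeID(9)], vec![NodeID(5), NodeID(6)]);
}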
@@ -160,6 +172,7 @@ pub fn gpu_codegen<W: Write>(
bbs,
kernel_params,
fork_reduce_map,
reduct_reduce_map,
label_data_for_phi,
return_type_id,
};
@@ -187,6 +200,7 @@ struct GPUContext<'a> {
bbs: &'a Vec<NodeID>,
kernel_params: &'a GPUKernelParams,
fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>,
reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>,
label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>,
return_type_id: &'a TypeID,
}
@@ -367,7 +381,9 @@ impl GPUContext<'_> {
}
// Emit helper registers that are used throughout the kernel: alignment
// is for proper dynamic shared memory allocation
// is for proper dynamic shared memory allocation, and max_variant_size is
// for selecting a variant during read/write copies, since we don't keep the
// tag (it isn't needed, and it can double a sum's memory usage due to alignment)
fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
write!(w, "\tsize_t alignment;\n")?;
write!(w, "\tsize_t max_variant_size;\n")?;
@@ -555,45 +571,6 @@ impl GPUContext<'_> {
}
}
// /*
// * For each parallel reduce with a reduct write, meaning it's at the end of
// * a potential parallel reduction chain, we walk back to beginning of chain
// * and update the write's collect to be the beginning's init.
// */
// fn update_write_collects(&self) -> HashMap<NodeID, NodeID> {
// let mut write_collect_map = HashMap::new();
// let mut parallel_reduces: HashSet<NodeID> = (0..self.function.nodes.len())
// .map(NodeID::new)
// .filter(|&node_id| {
// self.function.schedules[node_id.idx()].contains(&Schedule::ParallelReduce)
// })
// .collect();
// for reduce in parallel_reduces.clone() {
// if let Node::Reduce {
// control: _,
// init,
// reduct,
// } = &self.function.nodes[reduce.idx()]
// && let Node::Write { .. } = &self.function.nodes[reduct.idx()]
// {
// parallel_reduces.remove(&reduce);
// while parallel_reduces.contains(&init) {
// let Node::Reduce {
// control: _,
// init,
// reduct: _,
// } = &self.function.nodes[init.idx()]
// else {
// panic!("Expected reduce node");
// };
// parallel_reduces.remove(&init);
// }
// write_collect_map.insert(*reduct, *init);
// }
// }
// write_collect_map
// }
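For context on what the removed helper did, a minimal sketch of the chain walk it described (simplified types; the prev map standing in for reduce-to-init edges is hypothetical):

use std::collections::HashMap;

// Simplified stand-in for walking a reduction chain back to its head:
// follow each node's init predecessor until no predecessor remains.
fn chain_head(start: usize, prev: &HashMap<usize, usize>) -> usize {
    let mut cur = start;
    while let Some(&p) = prev.get(&cur) {
        cur = p;
    }
    cur
}

fn main() {
    // Chain 3 -> 2 -> 1: the write's collect would be updated to the
    // chain head's init, i.e. node 1's.
    let prev = HashMap::from([(3, 2), (2, 1)]);
    assert_eq!(chain_head(3, &prev), 1);
}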
fn codegen_data_control(
&self,
root_forks: &Vec<NodeID>,
@@ -684,10 +661,8 @@ impl GPUContext<'_> {
_ => { panic!("Unsupported state for ThreadID") }
}
}
Node::Reduce { control: _, init, reduct: _ } => {
let init_val = self.get_value(*init, false, false);
write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?;
}
// The fork initializes the reduce, and the reduct updates it
Node::Reduce { control: _, init: _, reduct: _ } => {}
// Parameters emitted at top
Node::Parameter { index: _ } => {}
Node::Constant { id: cons_id } => {
@@ -889,16 +864,25 @@ impl GPUContext<'_> {
}
}
if let Some(phis) = self.label_data_for_phi.get(&id) {
let val = self.get_value(id, false, false);
for phi in phis {
let phi_val = self.get_value(*phi, false, false);
write!(
w,
"{}{} = {};\n",
tabs,
self.get_value(*phi, false, false),
self.get_value(id, false, false),
phi_val,
val,
)?;
}
}
if let Some(reduces) = self.reduct_reduce_map.get(&id) {
let val = self.get_value(id, true, false);
for reduce in reduces {
let reduce_val = self.get_value(*reduce, false, false);
write!(w, "{}{} = {};\n", tabs, reduce_val, val)?;
}
}
Ok(())
}
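To make the emitted shape concrete, a small self-contained sketch of the reduct-update emission above, with a String in place of the real writer (names like add_7 and reduce_3 are hypothetical):

use std::fmt::Write;

// Each reduce that names this node as its reduct gets reassigned to the
// node's freshly computed value, mirroring the write! loop above.
fn emit_reduct_updates(
    w: &mut String,
    tabs: &str,
    val: &str,
    reduce_vals: &[&str],
) -> std::fmt::Result {
    for reduce_val in reduce_vals {
        write!(w, "{}{} = {};\n", tabs, reduce_val, val)?;
    }
    Ok(())
}

fn main() {
    let mut w = String::new();
    // Hypothetical names: the add node "add_7" is the reduct of "reduce_3".
    emit_reduct_updates(&mut w, "\t", "add_7", &["reduce_3"]).unwrap();
    assert_eq!(w, "\treduce_3 = add_7;\n");
}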
@@ -937,7 +921,18 @@ impl GPUContext<'_> {
Node::Fork {
control: _,
factors: _,
} => {}
} => {
// Emitting reduces before the fork allows the reduce to be
// used outside of the fork.
for &reduce in self.fork_reduce_map.get(&id).unwrap() {
let reduce_val = self.get_value(reduce, true, false);
let Node::Reduce { control: _, init, reduct: _ } = &self.function.nodes[reduce.idx()] else {
panic!("Expected reduce node");
};
let init_val = self.get_value(*init, true, false);
write!(w, "{}{} = {};\n", tabs, reduce_val, init_val)?;
}
}
Node::Join { control: _ } => {}
Node::Return { control: _, data } => {
if self.types[self.typing[data.idx()].idx()].is_primitive() {
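A plain-Rust analogy for the invariant above (the fork initializes the reduce, the reduct updates it): initializing the accumulator before the loop is what makes it usable after the loop, which is why the reduce's initialization is emitted at the Fork rather than inside the body.

fn main() {
    let data = [1, 2, 3, 4];
    // "Fork" point: the reduce variable is initialized from init here,
    // before the parallel region, so it stays in scope afterwards.
    let mut reduce = 0; // init
    for x in data {
        // "Reduct" point: each iteration overwrites the reduce variable.
        reduce = reduce + x;
    }
    // Because initialization happened before the loop/fork, the final
    // value is usable outside of it.
    assert_eq!(reduce, 10);
}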