Skip to content
Snippets Groups Projects
Commit d3f74c3d authored by Aaron Councilman's avatar Aaron Councilman
Browse files

Fix GPU addressing bug

parent 61274600
No related branches found
No related tags found
1 merge request!160Fix GPU addressing bug
......@@ -1209,6 +1209,16 @@ dependencies = [
"with_builtin_macros",
]
[[package]]
name = "juno_product_read"
version = "0.1.0"
dependencies = [
"async-std",
"hercules_rt",
"juno_build",
"with_builtin_macros",
]
[[package]]
name = "juno_schedule_test"
version = "0.1.0"
......
......@@ -33,4 +33,5 @@ members = [
"juno_samples/edge_detection",
"juno_samples/fork_join_tests",
"juno_samples/multi_device",
"juno_samples/product_read",
]
......@@ -284,11 +284,7 @@ impl GPUContext<'_> {
// If there are no forks, fast forward to single-block, single-thread codegen
let (num_blocks, num_threads) = if self.fork_join_map.is_empty() {
self.codegen_data_control_no_forks(
&HashSet::new(),
&mut dynamic_shared_offset,
&mut gotos,
)?;
self.codegen_data_control_no_forks(&mut dynamic_shared_offset, &mut gotos)?;
("1".to_string(), "1".to_string())
} else {
// Create structures and determine block and thread parallelization strategy
......@@ -298,10 +294,6 @@ impl GPUContext<'_> {
self.get_thread_root_forks(&root_forks, self.fork_tree, is_block_parallel);
let (fork_thread_quota_map, num_threads) =
self.get_thread_quotas(self.fork_tree, thread_root_root_fork);
// TODO: Uncomment and adjust once we know logic of extra dim. This will affect constant
// collections, reads, and writes.
// let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map);
let extra_dim_collects = HashSet::new();
// Core function for the CUDA code of all data and control nodes.
self.codegen_data_control(
......@@ -312,7 +304,6 @@ impl GPUContext<'_> {
},
&thread_root_forks,
&fork_thread_quota_map,
&extra_dim_collects,
&mut dynamic_shared_offset,
is_block_parallel,
num_threads,
......@@ -859,25 +850,8 @@ extern \"C\" {} {}(",
}
}
/*
* All non reduced-over collections used in fork joins have an extra dimension.
* However, this is only useful if ThreadIDs run in parallel not serially,
* otherwise it's unnecessarily consuming shared memory. This function returns
* the set of collections that have an unnecessary extra dimension.
*/
fn get_extra_dim_collects(
&self,
fork_control_map: &HashMap<NodeID, HashSet<NodeID>>,
fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>,
) -> HashSet<TypeID> {
// Determine which fork each collection is used in, and check if it's
// parallelized via the fork_thread_quota_map.
todo!()
}
fn codegen_data_control_no_forks(
&self,
extra_dim_collects: &HashSet<TypeID>,
dynamic_shared_offset: &mut String,
gotos: &mut BTreeMap<NodeID, CudaGoto>,
) -> Result<(), Error> {
......@@ -901,7 +875,6 @@ extern \"C\" {} {}(",
None,
None,
false,
extra_dim_collects,
dynamic_shared_offset,
body,
&mut tabs,
......@@ -919,7 +892,6 @@ extern \"C\" {} {}(",
block_fork: Option<NodeID>,
thread_root_forks: &HashSet<NodeID>,
fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>,
extra_dim_collects: &HashSet<TypeID>,
dynamic_shared_offset: &mut String,
is_block_parallel: bool,
num_threads: usize,
......@@ -945,7 +917,6 @@ extern \"C\" {} {}(",
None,
None,
false,
extra_dim_collects,
dynamic_shared_offset,
body,
&mut tabs,
......@@ -979,7 +950,6 @@ extern \"C\" {} {}(",
None,
Some(block_fork.unwrap()),
false,
extra_dim_collects,
dynamic_shared_offset,
body,
&mut tabs,
......@@ -996,7 +966,6 @@ extern \"C\" {} {}(",
fork_thread_quota_map,
1,
num_threads,
extra_dim_collects,
dynamic_shared_offset,
gotos,
)?;
......@@ -1017,7 +986,6 @@ extern \"C\" {} {}(",
fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>,
parent_quota: usize,
num_threads: usize,
extra_dim_collections: &HashSet<TypeID>,
dynamic_shared_offset: &mut String,
gotos: &mut BTreeMap<NodeID, CudaGoto>,
) -> Result<(), Error> {
......@@ -1068,7 +1036,6 @@ extern \"C\" {} {}(",
parallel_factor,
Some(curr_fork),
reducts.contains(data),
extra_dim_collections,
dynamic_shared_offset,
body,
&mut tabs,
......@@ -1082,7 +1049,6 @@ extern \"C\" {} {}(",
fork_thread_quota_map,
use_thread_quota,
num_threads,
extra_dim_collections,
dynamic_shared_offset,
gotos,
)?;
......@@ -1099,7 +1065,6 @@ extern \"C\" {} {}(",
parallel_factor: Option<usize>,
nesting_fork: Option<NodeID>,
is_special_reduct: bool,
extra_dim_collects: &HashSet<TypeID>,
dynamic_shared_offset: &mut String,
w: &mut String,
num_tabs: &mut usize,
......@@ -1206,7 +1171,6 @@ extern \"C\" {} {}(",
define_variable.clone(),
*cons_id,
true,
Some(extra_dim_collects),
dynamic_shared_offset,
w,
*num_tabs,
......@@ -1232,8 +1196,7 @@ extern \"C\" {} {}(",
if !is_primitive
&& (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
{
let data_size =
self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
let data_size = self.get_size(self.typing[id.idx()], None);
write!(
w,
"{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n",
......@@ -1453,8 +1416,7 @@ extern \"C\" {} {}(",
}
// Read of primitive requires load after pointer math.
Node::Read { collect, indices } => {
let collect_with_indices =
self.codegen_collect(*collect, indices, extra_dim_collects);
let collect_with_indices = self.codegen_collect(*collect, indices);
let data_type_id = self.typing[id.idx()];
if self.types[data_type_id.idx()].is_primitive() {
let type_name = self.get_type(data_type_id, true);
......@@ -1478,8 +1440,7 @@ extern \"C\" {} {}(",
data,
indices,
} => {
let collect_with_indices =
self.codegen_collect(*collect, indices, extra_dim_collects);
let collect_with_indices = self.codegen_collect(*collect, indices);
let data_variable = self.get_value(*data, false, false);
let data_type_id = self.typing[data.idx()];
let cg_tile = match state {
......@@ -1498,7 +1459,7 @@ extern \"C\" {} {}(",
)?;
write!(w, "{}}}\n", tabs)?;
} else {
let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects));
let data_size = self.get_size(data_type_id, None);
write!(
w,
"{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n",
......@@ -1754,31 +1715,31 @@ extern \"C\" {} {}(",
* This function emits collection name + pointer math for the provided indices.
* All collection types use char pointers.
*/
fn codegen_collect(
&self,
collect: NodeID,
indices: &[Index],
extra_dim_collects: &HashSet<TypeID>,
) -> String {
fn codegen_collect(&self, collect: NodeID, indices: &[Index]) -> String {
let mut index_ptr = "0".to_string();
let type_id = self.typing[collect.idx()];
let mut type_id = self.typing[collect.idx()];
for index in indices {
match index {
Index::Field(field) => {
self.get_size(type_id, Some(*field), Some(extra_dim_collects));
index_ptr.push_str(&format!(" + ({})", self.get_size(type_id, Some(*field))));
type_id = if let Type::Product(fields) = &self.types[type_id.idx()] {
fields[*field]
} else {
panic!("Expected product type")
};
}
// Variants of summations have zero offset
Index::Variant(_) => {}
Index::Variant(index) => {
type_id = if let Type::Summation(variants) = &self.types[type_id.idx()] {
variants[*index]
} else {
panic!("Expected summation type")
};
}
// Convert multi-d array index to 1-d index, and optionally
// convert to single-byte index by multiplying by element size
Index::Position(array_indices) => {
let has_extra_dim = extra_dim_collects.contains(&self.typing[collect.idx()]);
if has_extra_dim {
continue;
}
let Type::Array(element_type, extents) =
&self.types[self.typing[collect.idx()].idx()]
else {
let Type::Array(element_type, extents) = &self.types[type_id.idx()] else {
panic!("Expected array type")
};
let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]);
......@@ -1800,8 +1761,9 @@ extern \"C\" {} {}(",
cumulative_offset,
")".repeat(array_indices.len())
));
let element_size = self.get_size(*element_type, None, Some(extra_dim_collects));
index_ptr.push_str(&format!(" * {}", element_size));
let element_size = self.get_size(*element_type, None);
index_ptr.push_str(&format!(" * ({})", element_size));
type_id = *element_type;
}
}
}
......@@ -1825,7 +1787,6 @@ extern \"C\" {} {}(",
name: String,
cons_id: ConstantID,
allow_allocate: bool,
extra_dim_collects: Option<&HashSet<TypeID>>,
dynamic_shared_offset: &mut String,
w: &mut String,
num_tabs: usize,
......@@ -1850,7 +1811,7 @@ extern \"C\" {} {}(",
Constant::Product(type_id, constant_fields) => {
if allow_allocate {
let alignment = self.get_alignment(*type_id);
let size = self.get_size(*type_id, None, extra_dim_collects);
let size = self.get_size(*type_id, None);
*dynamic_shared_offset = format!(
"(({} + {} - 1) / {}) * {}",
dynamic_shared_offset, alignment, alignment, alignment
......@@ -1872,7 +1833,7 @@ extern \"C\" {} {}(",
};
for i in 0..constant_fields.len() {
// For each field update offset and issue recursive call
let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects);
let offset = self.get_size(type_fields[i], Some(i));
let field_constant = &self.constants[constant_fields[i].idx()];
if field_constant.is_scalar() {
let field_type = self.get_type(type_fields[i], true);
......@@ -1880,7 +1841,6 @@ extern \"C\" {} {}(",
format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset),
constant_fields[i],
false,
None,
dynamic_shared_offset,
w,
num_tabs,
......@@ -1890,7 +1850,6 @@ extern \"C\" {} {}(",
format!("{}+{}", name, offset),
constant_fields[i],
false,
extra_dim_collects,
dynamic_shared_offset,
w,
num_tabs,
......@@ -1901,7 +1860,7 @@ extern \"C\" {} {}(",
Constant::Summation(type_id, variant, field) => {
if allow_allocate {
let alignment = self.get_alignment(*type_id);
let size = self.get_size(*type_id, None, extra_dim_collects);
let size = self.get_size(*type_id, None);
*dynamic_shared_offset = format!(
"(({} + {} - 1) / {}) * {}",
dynamic_shared_offset, alignment, alignment, alignment
......@@ -1930,21 +1889,12 @@ extern \"C\" {} {}(",
format!("*reinterpret_cast<{}>({})", variant_type, name),
*field,
false,
extra_dim_collects,
dynamic_shared_offset,
w,
num_tabs,
)?;
} else if !variant_constant.is_array() {
self.codegen_constant(
name,
*field,
false,
extra_dim_collects,
dynamic_shared_offset,
w,
num_tabs,
)?;
self.codegen_constant(name, *field, false, dynamic_shared_offset, w, num_tabs)?;
};
}
Constant::Array(type_id) => {
......@@ -1952,7 +1902,7 @@ extern \"C\" {} {}(",
panic!("Nested array constant should not be re-allocated");
}
let alignment = self.get_alignment(*type_id);
let size = self.get_size(*type_id, None, extra_dim_collects);
let size = self.get_size(*type_id, None);
*dynamic_shared_offset = format!(
"(({} + {} - 1) / {}) * {}",
dynamic_shared_offset, alignment, alignment, alignment
......@@ -1979,35 +1929,19 @@ extern \"C\" {} {}(",
* and offset to 2nd field. This is useful for constant initialization and read/write
* index math.
*/
fn get_size(
&self,
type_id: TypeID,
num_fields: Option<usize>,
extra_dim_collects: Option<&HashSet<TypeID>>,
) -> String {
fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String {
match &self.types[type_id.idx()] {
Type::Array(element_type, extents) => {
let array_size = if extra_dim_collects.is_some()
&& extra_dim_collects.unwrap().contains(&type_id)
{
"1".to_string()
} else {
multiply_dcs(extents)
};
format!("{} * {}", self.get_alignment(*element_type), array_size)
assert!(num_fields.is_none());
let array_size = multiply_dcs(extents);
format!("{} * {}", self.get_size(*element_type, None), array_size)
}
Type::Product(fields) => {
let num_fields = &num_fields.unwrap_or(fields.len());
let with_field = fields
let num_fields = num_fields.unwrap_or(fields.len());
fields
.iter()
.enumerate()
.filter(|(i, _)| i < num_fields)
.map(|(_, id)| {
(
self.get_size(*id, None, extra_dim_collects),
self.get_alignment(*id),
)
})
.take(num_fields)
.map(|id| (self.get_size(*id, None), self.get_alignment(*id)))
.fold(String::from("0"), |acc, (size, align)| {
if acc == "0" {
size
......@@ -2017,31 +1951,23 @@ extern \"C\" {} {}(",
acc, align, align, align, size
)
}
});
if num_fields < &fields.len() {
format!(
"{} - {}",
with_field,
self.get_size(fields[*num_fields], None, extra_dim_collects)
)
} else {
with_field
}
})
}
Type::Summation(variants) => {
assert!(num_fields.is_none());
// The argmax variant by size is not guaranteed to be same as
// argmax variant by alignment, eg product of 3 4-byte primitives
// vs 1 8-byte primitive, so we need to calculate both.
let max_size = variants
.iter()
.map(|id| self.get_size(*id, None, extra_dim_collects))
.fold(String::from("0"), |acc, x| {
let max_size = variants.iter().map(|id| self.get_size(*id, None)).fold(
String::from("0"),
|acc, x| {
if acc == "0" {
x
} else {
format!("umax({}, {})", acc, x)
}
});
},
);
let max_alignment = variants
.iter()
.map(|id| self.get_alignment(*id))
......@@ -2052,7 +1978,10 @@ extern \"C\" {} {}(",
max_size, max_alignment, max_alignment, max_alignment
)
}
_ => format!("{}", self.get_alignment(type_id)),
_ => {
assert!(num_fields.is_none());
format!("{}", self.get_alignment(type_id))
}
}
}
......
# Package manifest for the `product_read` Juno sample, which exercises
# reading fields out of product-typed (tuple) collections.
[package]
name = "juno_product_read"
version = "0.1.0"
authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
edition = "2021"
# Binary target: the sample's runtime driver (src/main.rs).
[[bin]]
name = "juno_product_read"
path = "src/main.rs"
# The `cuda` feature forwards to the build tool and runtime so the sample
# can be compiled for the GPU backend (see build.rs, which selects gpu.sch).
[features]
cuda = ["juno_build/cuda", "hercules_rt/cuda"]
# juno_build is needed at build time to compile the .jn source.
[build-dependencies]
juno_build = { path = "../../juno_build" }
[dependencies]
juno_build = { path = "../../juno_build" }
hercules_rt = { path = "../../hercules_rt" }
with_builtin_macros = "0.1.0"
async-std = "*"
use juno_build::JunoCompiler;
/// Build script: compiles `product_read.jn` with the Juno compiler.
///
/// When the `cuda` feature is enabled, the GPU schedule (`gpu.sch`) is
/// attached so the kernel is lowered through the CUDA backend; otherwise
/// the default (CPU) schedule is used.
fn main() {
    // Start from the Juno source shared by both backends.
    let compiler = JunoCompiler::new()
        .file_in_src("product_read.jn")
        .unwrap();

    // Only the CUDA build carries an explicit schedule file.
    #[cfg(feature = "cuda")]
    let compiler = compiler.schedule_in_src("gpu.sch").unwrap();

    compiler.build().unwrap();
}
// GPU schedule for the product_read sample.
// Initial cleanup: value numbering, phi elimination, dead code elimination.
gvn(*);
phi-elim(*);
dce(*);
// Outline the kernel body and send the outlined function to the GPU backend.
let out = auto-outline(*);
gpu(out.product_read);
// Scalar replacement of aggregates (interprocedural, then local),
// followed by constant/redundancy cleanup.
ip-sroa(*);
sroa(*);
crc(*);
dce(*);
gvn(*);
phi-elim(*);
dce(*);
// Final scheduling: infer schedules, place collections, and run global
// code motion before code generation.
infer-schedules(*);
float-collections(*);
gcm(*);
#![feature(concat_idents)]
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
juno_build::juno!("product_read");
/// Driver for the product_read sample: feeds an array of (i32, i32) pairs
/// to the compiled kernel and checks the flattened i32 output.
fn main() {
    async_std::task::block_on(async {
        // Two tuples; the kernel flattens them into four scalars.
        let pairs = vec![(0, 1), (2, 3)];
        let boxed_input: HerculesImmBox<(i32, i32)> = HerculesImmBox::from(pairs.as_slice());

        let mut runner = runner!(product_read);
        let raw_output = runner.run(boxed_input.to()).await;
        let result: Vec<i32> = HerculesMutBox::from(raw_output).as_slice().to_vec();

        // Expect the tuple fields in declaration order.
        assert_eq!(result, vec![0, 1, 2, 3]);
    });
}
// Test harness entry point: reuses main(), which asserts the kernel output.
#[test]
fn products_test() {
    main();
}
// Entry point: reads both fields of each (i32, i32) pair in the input
// array and flattens them into a 4-element i32 array, preserving order.
#[entry]
fn product_read(input: (i32, i32)[2]) -> i32[4] {
  let result : i32[4];
  result[0] = input[0].0;
  result[1] = input[0].1;
  result[2] = input[1].0;
  result[3] = input[1].1;
  return result;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment