From c4ef507b5b83a128942c2a8007d1e38bcc1ef486 Mon Sep 17 00:00:00 2001 From: rarbore2 <rarbore2@illinois.edu> Date: Mon, 6 May 2024 14:53:59 -0500 Subject: [PATCH] CPU partition respecting backend --- .gitignore | 2 +- Cargo.lock | 204 +++- Cargo.toml | 3 +- hercules_cg/Cargo.toml | 2 +- hercules_cg/src/common.rs | 525 +++++++++ hercules_cg/src/cpu.rs | 1001 +++++++++++++++++ hercules_cg/src/cpu_beta.rs | 1 + hercules_cg/src/lib.rs | 8 + hercules_cg/src/top.rs | 135 +++ hercules_ir/Cargo.toml | 3 +- hercules_ir/src/ir.rs | 54 +- hercules_ir/src/lib.rs | 2 +- hercules_ir/src/schedule.rs | 19 +- hercules_ir/src/subgraph.rs | 6 + hercules_opt/Cargo.toml | 3 + hercules_opt/src/pass.rs | 85 +- hercules_rt/Cargo.toml | 3 + hercules_rt/src/lib.rs | 37 +- hercules_rt/src/manifest.rs | 97 ++ hercules_samples/matmul/src/main.rs | 10 +- hercules_samples/sum_sample.hir | 6 +- hercules_samples/task_parallel.hir | 14 + .../Cargo.toml | 2 +- .../src/main.rs | 2 +- hercules_tools/hercules_driver/Cargo.toml | 10 + hercules_tools/hercules_driver/src/main.rs | 44 + 26 files changed, 2186 insertions(+), 92 deletions(-) create mode 100644 hercules_cg/src/common.rs create mode 100644 hercules_cg/src/cpu.rs create mode 100644 hercules_cg/src/top.rs create mode 100644 hercules_rt/src/manifest.rs create mode 100644 hercules_samples/task_parallel.hir rename hercules_tools/{hercules_cpu => hercules_cpu_beta}/Cargo.toml (91%) rename hercules_tools/{hercules_cpu => hercules_cpu_beta}/src/main.rs (94%) create mode 100644 hercules_tools/hercules_driver/Cargo.toml create mode 100644 hercules_tools/hercules_driver/src/main.rs diff --git a/.gitignore b/.gitignore index 959fc7f6..278d4690 100644 --- a/.gitignore +++ b/.gitignore @@ -5,5 +5,5 @@ *.ll *.c *.o - +*.hbin .*.swp diff --git a/Cargo.lock b/Cargo.lock index 1ec60bdb..c48af1ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] @@ -61,15 +61,30 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.80" +version = "1.0.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" + +[[package]] +name = "atomic-polyfill" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" +checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4" +dependencies = [ + "critical-section", +] [[package]] name = "autocfg" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bincode" @@ -86,6 +101,15 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +dependencies = [ + "serde", +] + [[package]] name = "bitvec" version = "1.0.1" @@ -98,6 +122,12 @@ dependencies = [ "wyz", ] +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cactus" version = "1.0.7" @@ -126,9 +156,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.2" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", "clap_derive", @@ -148,9 +178,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.0" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" dependencies = [ "heck", "proc-macro2", @@ -164,12 +194,24 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "cobs" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" + [[package]] name = "colorchoice" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "critical-section" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7059fff8937831a9ae6f0fe4d658ffabf58f2ca96aa9dec1c889f936f705f216" + [[package]] name = "deranged" version = "0.3.11" @@ -180,13 +222,10 @@ dependencies = [ ] [[package]] -name = "ena" -version = "0.14.2" +name = "embedded-io" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c533630cf40e9caa44bd91aadc88a75d75a4c3a12b4cfde353cbed41daa1e1f1" -dependencies = [ - "log", -] +checksum = 
"ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" [[package]] name = "equivalent" @@ -238,29 +277,52 @@ dependencies = [ "wasi", ] +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +[[package]] +name = "heapless" +version = "0.7.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version", + "serde", + "spin", + "stable_deref_trait", +] + [[package]] name = "heck" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hercules_cg" version = "0.1.0" dependencies = [ "bitvec", - "ena", "hercules_ir", + "hercules_rt", ] [[package]] -name = "hercules_cpu" +name = "hercules_cpu_beta" version = "0.1.0" dependencies = [ "clap", @@ -280,6 +342,16 @@ dependencies = [ "rand", ] +[[package]] +name = "hercules_driver" +version = "0.1.0" +dependencies = [ + "clap", + "hercules_ir", + "hercules_opt", + "ron", +] + [[package]] name = "hercules_ir" version = "0.1.0" @@ -288,6 +360,7 @@ dependencies = [ "nom", "ordered-float", "rand", + "serde", ] [[package]] @@ -304,8 +377,11 @@ name = "hercules_opt" version = "0.1.0" dependencies = [ "bitvec", + "hercules_cg", "hercules_ir", "ordered-float", + "postcard", + "serde", "take_mut", ] @@ -313,14 +389,17 @@ dependencies = [ name = "hercules_rt" version = "0.1.0" dependencies 
= [ + "hercules_ir", "libc", + "postcard", + "serde", ] [[package]] name = "indexmap" -version = "2.2.5" +version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", "hashbrown", @@ -328,9 +407,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "juno_frontend" @@ -358,10 +437,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] -name = "log" -version = "0.4.21" +name = "lock_api" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] [[package]] name = "lrlex" @@ -482,6 +565,18 @@ dependencies = [ "serde", ] +[[package]] +name = "postcard" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a55c51ee6c0db07e68448e336cf8ea4131a620edefebf9893e759b2d793420f8" +dependencies = [ + "cobs", + "embedded-io", + "heapless", + "serde", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -496,9 +591,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = 
"e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] @@ -554,19 +649,19 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.10.3" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax 0.8.2", + "regex-syntax 0.8.3", ] [[package]] @@ -577,7 +672,7 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.2", + "regex-syntax 0.8.3", ] [[package]] @@ -588,9 +683,21 @@ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" + +[[package]] +name = "ron" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b91f7eff05f748767f183df4320a63d6936e9c6107d97c9e6bdd9784f4289c94" +dependencies = [ + "base64", + "bitflags 2.5.0", + "serde", + "serde_derive", +] [[package]] name = "rustc_version" @@ -607,6 +714,12 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.22" @@ -645,6 +758,21 @@ dependencies = [ "vob", ] +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -659,9 +787,9 @@ checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index d625efb3..bded2dcf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,8 +6,9 @@ members = [ "hercules_opt", "hercules_rt", + "hercules_tools/hercules_driver", "hercules_tools/hercules_dot", - "hercules_tools/hercules_cpu", + "hercules_tools/hercules_cpu_beta", "juno_frontend", diff --git a/hercules_cg/Cargo.toml b/hercules_cg/Cargo.toml index 8a4cf940..b5479669 100644 --- a/hercules_cg/Cargo.toml +++ b/hercules_cg/Cargo.toml @@ -5,5 +5,5 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>"] [dependencies] bitvec = "*" -ena = "*" hercules_ir = { path = "../hercules_ir" } +hercules_rt = { path = "../hercules_rt" } diff --git a/hercules_cg/src/common.rs b/hercules_cg/src/common.rs new file mode 100644 index 00000000..9bc91b57 --- /dev/null +++ b/hercules_cg/src/common.rs @@ -0,0 +1,525 @@ +extern crate hercules_ir; +extern crate hercules_rt; + +use 
std::collections::HashMap; + +use self::hercules_ir::*; +use self::hercules_rt::manifest::*; + +/* + * Pretty much all of the codegen functions need to take in some large subset of + * IR structures, analysis results, and global pieces of information. Package + * them all in this struct, and make all the codegen functions members of this + * struct to cut down on the number of function arguments. This structure + * shouldn't be modified after creation. + */ +pub(crate) struct FunctionContext<'a> { + pub(crate) function: &'a Function, + pub(crate) types: &'a Vec<Type>, + pub(crate) constants: &'a Vec<Constant>, + pub(crate) dynamic_constants: &'a Vec<DynamicConstant>, + pub(crate) def_use: &'a ImmutableDefUseMap, + pub(crate) reverse_postorder: &'a Vec<NodeID>, + pub(crate) typing: &'a Vec<TypeID>, + pub(crate) control_subgraph: &'a Subgraph, + pub(crate) fork_join_map: &'a HashMap<NodeID, NodeID>, + pub(crate) fork_join_nest: &'a HashMap<NodeID, Vec<NodeID>>, + pub(crate) antideps: &'a Vec<(NodeID, NodeID)>, + pub(crate) bbs: &'a Vec<NodeID>, + pub(crate) plan: &'a Plan, + pub(crate) llvm_types: &'a Vec<String>, + pub(crate) llvm_constants: &'a Vec<String>, + pub(crate) llvm_dynamic_constants: &'a Vec<String>, + pub(crate) partitions_inverted_map: Vec<Vec<NodeID>>, +} + +impl<'a> FunctionContext<'a> { + /* + * Find data inputs to a partition. + */ + pub(crate) fn partition_data_inputs(&self, partition_id: PartitionID) -> Vec<NodeID> { + let partition = &self.partitions_inverted_map[partition_id.idx()]; + + let mut data_inputs: Vec<NodeID> = partition + .iter() + .map(|id| { + // For each node in the partition, filter out the uses that are + // data nodes and are in a different partition. 
+ get_uses(&self.function.nodes[id.idx()]) + .as_ref() + .into_iter() + .filter(|id| { + // Filter out control nodes (just looking for data + // inputs here), check that it's in another partition, + // and ignore parameters, constants, and dynamic + // constants (those are each passed to partition + // functions using different mechanisms). + !self.function.nodes[id.idx()].is_control() + && self.plan.partitions[id.idx()] != partition_id + && !self.function.nodes[id.idx()].is_parameter() + && !self.function.nodes[id.idx()].is_constant() + && !self.function.nodes[id.idx()].is_dynamic_constant() + }) + .map(|x| *x) + .collect::<Vec<NodeID>>() + }) + // Collect all such uses across the whole partition. + .flatten() + .collect(); + + // Inputs and outputs of partitions need to be sorted so datums don't + // get mixed up. + data_inputs.sort(); + data_inputs + } + + /* + * Find data outputs of a partition. + */ + pub(crate) fn partition_data_outputs(&self, partition_id: PartitionID) -> Vec<NodeID> { + let partition = &self.partitions_inverted_map[partition_id.idx()]; + + let mut data_outputs: Vec<NodeID> = partition + .iter() + .filter(|id| { + // For each data node in the partition, check if it has any uses + // outside its partition. Users can be control or data nodes. + // Also, don't add parameter, constant, and dynamic constant + // nodes. These nodes are passed to partition mechanisms using + // different mechanism. + !self.function.nodes[id.idx()].is_control() + && !self.function.nodes[id.idx()].is_parameter() + && !self.function.nodes[id.idx()].is_constant() + && !self.function.nodes[id.idx()].is_dynamic_constant() + && self + .def_use + .get_users(**id) + .as_ref() + .into_iter() + .filter(|id| self.plan.partitions[id.idx()] != partition_id) + .map(|x| *x) + .count() + > 0 + }) + .map(|x| *x) + // If this partition contains a return node, the data input of that + // node is a data output. 
+ .chain(partition.iter().filter_map(|id| { + if let Node::Return { control: _, data } = self.function.nodes[id.idx()] { + Some(data) + } else { + None + } + })) + .collect(); + + // Inputs and outputs of partitions need to be sorted so datums don't + // get mixed up. + data_outputs.sort(); + data_outputs + } + + /* + * Find control nodes that will return from a partition. + */ + pub(crate) fn partition_control_returns(&self, partition_id: PartitionID) -> Vec<NodeID> { + let partition = &self.partitions_inverted_map[partition_id.idx()]; + + partition + .iter() + .filter(|id| { + // For each control node in the partition, check if it has any + // users outside its partition. Users can be control nodes - if + // a user in a different partition is a data node, then the + // partition is malformed. Return nodes are also unconditionally + // a control return of this partition. + let outside_user_count = self + .def_use + .get_users(**id) + .as_ref() + .into_iter() + .filter(|user_id| { + // Users of control nodes can only be data nodes + // if they are in the same partition as the + // control node. Only control users may be in a + // different partition. + assert!( + !self.function.nodes[id.idx()].is_control() + || self.function.nodes[user_id.idx()].is_control() + || self.plan.partitions[user_id.idx()] == partition_id + ); + self.plan.partitions[user_id.idx()] != partition_id + }) + .count(); + + // Just calculated for the below assert. + let control_user_count = self + .def_use + .get_users(**id) + .as_ref() + .into_iter() + .filter(|id| self.function.nodes[id.idx()].is_control()) + .count(); + + // A control node cannot have users inside and outside its own + // partition. This is because a well-formedness condition of if + // and match nodes (the only control nodes allowed to have + // multiple users) is their read successors must be in the same + // partition as them. 
+ assert!( + !self.function.nodes[id.idx()].is_control() + || outside_user_count == 0 + || outside_user_count == control_user_count + ); + self.function.nodes[id.idx()].is_control() + && (self.function.nodes[id.idx()].is_return() || outside_user_count > 0) + }) + .map(|x| *x) + .collect() + } + + /* + * Find control successors of a given partition. A partition cannot be a + * control successor of itself, since a self-cycle is represented as control + * flow within a partiion. In other words, the graph of control flow between + * partitions is free of self-loops (an edge connecting a partition to + * itself). + */ + pub(crate) fn partition_control_successors( + &self, + partition_id: PartitionID, + ) -> Vec<PartitionID> { + let partition = &self.partitions_inverted_map[partition_id.idx()]; + + let mut partitions: Vec<PartitionID> = partition + .iter() + // Only consider nodes in other partitions that are successors of + // control nodes. These are necessarily other control nodes. + .filter(|id| self.function.nodes[id.idx()].is_control()) + .map(|id| { + // Get the partitions (that are not this partition) of successor + // nodes of control nodes. + self.def_use + .get_users(*id) + .as_ref() + .into_iter() + .map(|id| self.plan.partitions[id.idx()]) + .filter(|id| *id != partition_id) + }) + // We want a flat list of all such partitions. + .flatten() + .collect(); + + // We only want one copy of the ID per partition. + partitions.dedup(); + partitions + } + + /* + * Calculate the reverse postorder of just this partition. + */ + pub(crate) fn partition_reverse_postorder(&self, partition_id: PartitionID) -> Vec<NodeID> { + self.reverse_postorder + .iter() + .filter(|id| self.plan.partitions[id.idx()] == partition_id) + .map(|x| *x) + .collect() + } + + /* + * Determine the array constant inputs to all partition functions. Get the + * constant IDs, and the array type IDs. Sort by constant ID for + * consistency. 
+ */ + pub(crate) fn partition_array_constant_inputs(&self) -> Vec<(ConstantID, TypeID)> { + let mut res = (0..self.constants.len()) + .filter_map(|idx| { + self.constants[idx] + .try_array_type(self.types) + .map(|ty_id| (ConstantID::new(idx), ty_id)) + }) + .collect::<Vec<_>>(); + + res.sort(); + res + } + + /* + * Determine the dynamic constant inputs to all partition functions. Just + * assemble the dynamic constant IDs, since the type is always u64. Sort the + * parameters for consistency. + */ + pub(crate) fn partition_dynamic_constant_inputs(&self) -> Vec<DynamicConstantID> { + let mut res = (0..self.dynamic_constants.len()) + .filter_map(|idx| { + if self.dynamic_constants[idx].is_parameter() { + Some(DynamicConstantID::new(idx)) + } else { + None + } + }) + .collect::<Vec<_>>(); + + res.sort(); + res + } +} + +/* + * When emitting individual nodes in the partition codegen functions, a bunch of + * partition analysis results are needed. Package them all in this struct, and + * make all of the subroutines of the top level partition codegen functions + * members of this struct to cut down on the number of function arguments. This + * structure shouldn't be modified after creation. This structure only holds per + * partition specific information - for example, global function parameters, + * constant parameters, and dynamic constant parameters are not stored, since + * those don't vary across partitions. 
+ */ +pub(crate) struct PartitionContext<'a> { + pub(crate) function: &'a FunctionContext<'a>, + pub(crate) partition_id: PartitionID, + pub(crate) top_node: NodeID, + pub(crate) data_inputs: Vec<NodeID>, + pub(crate) data_outputs: Vec<NodeID>, + pub(crate) control_returns: Vec<NodeID>, + pub(crate) reverse_postorder: Vec<NodeID>, + pub(crate) partition_input_types: Vec<TypeID>, + pub(crate) return_type: Type, + pub(crate) manifest: PartitionManifest, +} + +impl<'a> PartitionContext<'a> { + pub(crate) fn new( + function: &'a FunctionContext<'a>, + partition_id: PartitionID, + top_node: NodeID, + ) -> Self { + let data_inputs = function.partition_data_inputs(partition_id); + let data_outputs = function.partition_data_outputs(partition_id); + let control_returns = function.partition_control_returns(partition_id); + let control_successors = function.partition_control_successors(partition_id); + let reverse_postorder = function.partition_reverse_postorder(partition_id); + + // The data input types are just the types of data nodes used by this + // partition, originating in another partition. + let partition_input_types = data_inputs + .iter() + .map(|id| function.typing[id.idx()]) + .collect(); + + // The return struct contains all of the data outputs, plus control + // information if there are multiple successor partitions. The control + // information is used by the Hercules runtime to implement control flow + // between partitions. + let multiple_control_successors = control_successors.len() > 1; + let output_data_types = data_outputs.iter().map(|id| function.typing[id.idx()]); + let return_type = if multiple_control_successors { + let u64_ty_id = TypeID::new( + function + .types + .iter() + .position(|ty| *ty == Type::UnsignedInteger64) + .unwrap(), + ); + Type::Product( + output_data_types + .chain(std::iter::once(u64_ty_id)) + .collect(), + ) + } else { + Type::Product(output_data_types.collect()) + }; + + // Assemble the manifest. 
+ let mut manifest = PartitionManifest::default(); + manifest.top_node = top_node.idx() as u32; + + // The first inputs are the data inputs, from other partitions. + manifest.inputs.extend( + data_inputs + .iter() + .map(|x| PartitionInput::DataInput(x.idx() as u32)), + ); + + // The next inputs are the function parameters, all in order. + manifest.inputs.extend( + (0..function.function.param_types.len()) + .map(|x| PartitionInput::FunctionArgument(x as u32)), + ); + + // The next inputs are the array constants, all in order. + manifest.inputs.extend( + (0..(function + .constants + .iter() + .filter(|cons| cons.try_array_type(function.types).is_some()) + .count())) + .map(|x| PartitionInput::ArrayConstant(x as u32)), + ); + + // The last inputs are the dynamic constants, all in order. + manifest.inputs.extend( + (0..function.function.num_dynamic_constants) + .map(|x| PartitionInput::DynamicConstant(x as u32)), + ); + + // The outputs are the data outputs of this partition. + manifest.outputs.extend( + data_outputs + .iter() + .map(|x| PartitionOutput::DataOutput(x.idx() as u32)), + ); + + // If there are multiple control returns, also output the node being + // returned from. + if multiple_control_successors { + manifest.outputs.push(PartitionOutput::ControlIndicator); + } + + PartitionContext { + function, + partition_id, + top_node, + data_inputs, + data_outputs, + control_returns, + reverse_postorder, + partition_input_types, + return_type, + manifest, + } + } +} + +/* + * Types, constants, and dynamic constants are fairly simple to translate into + * LLVM IR. + */ + +pub(crate) fn generate_type_string(ty: &Type, llvm_types: &Vec<String>) -> String { + match ty { + Type::Control(_) => { + // Later, we create virtual registers corresponding to fork nodes of + // type i64, so we need the "type" of the fork node to be i64. 
+ "i64".to_string() + } + Type::Boolean => "i1".to_string(), + Type::Integer8 | Type::UnsignedInteger8 => "i8".to_string(), + Type::Integer16 | Type::UnsignedInteger16 => "i16".to_string(), + Type::Integer32 | Type::UnsignedInteger32 => "i32".to_string(), + Type::Integer64 | Type::UnsignedInteger64 => "i64".to_string(), + Type::Float32 => "float".to_string(), + Type::Float64 => "double".to_string(), + // Because we traverse in bottom-up order, we can assume that the LLVM + // types for children types are already computed. + Type::Product(fields) => { + let mut iter = fields.iter(); + if let Some(first) = iter.next() { + iter.fold("{".to_string() + &llvm_types[first.idx()], |s, f| { + s + ", " + &llvm_types[f.idx()] + }) + "}" + } else { + "{}".to_string() + } + } + Type::Array(_, _) => { + // Array types becomes pointers. The element type and dynamic + // constant bounds characterize the access code we generate later, + // not the type itself. + "ptr".to_string() + } + Type::Summation(_) => todo!(), + } +} + +pub(crate) fn generate_type_strings(module: &Module) -> Vec<String> { + // Render types into LLVM IR. This requires translating from our interning + // structures to LLVM types. We can't just blow through the types vector, + // since a type may reference a type ID ahead of it in the vector. Instead, + // iterate types in a bottom up order with respect to the type intern DAGs. 
+ let mut llvm_types = vec!["".to_string(); module.types.len()]; + for id in module.types_bottom_up() { + llvm_types[id.idx()] = generate_type_string(&module.types[id.idx()], &llvm_types); + } + + llvm_types +} + +pub(crate) fn generate_constant_string( + cons_id: ConstantID, + cons: &Constant, + tys: &Vec<Type>, + llvm_constants: &Vec<String>, +) -> String { + match cons { + Constant::Boolean(val) => { + if *val { + "true".to_string() + } else { + "false".to_string() + } + } + Constant::Integer8(val) => format!("{}", val), + Constant::Integer16(val) => format!("{}", val), + Constant::Integer32(val) => format!("{}", val), + Constant::Integer64(val) => format!("{}", val), + Constant::UnsignedInteger8(val) => format!("{}", val), + Constant::UnsignedInteger16(val) => format!("{}", val), + Constant::UnsignedInteger32(val) => format!("{}", val), + Constant::UnsignedInteger64(val) => format!("{}", val), + Constant::Float32(val) => { + if val.fract() == 0.0 { + format!("{}.0", val) + } else { + format!("{}", val) + } + } + Constant::Float64(val) => { + if val.fract() == 0.0 { + format!("{}.0", val) + } else { + format!("{}", val) + } + } + Constant::Product(_, _) | Constant::Summation(_, _, _) | Constant::Array(_, _) => { + format!("%cons.{}", cons_id.idx()) + } + Constant::Zero(ty_id) => match tys[ty_id.idx()] { + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => { + format!("%cons.{}", cons_id.idx()) + } + _ => "zeroinitializer".to_string(), + }, + } +} + +pub(crate) fn generate_constant_strings(module: &Module) -> Vec<String> { + // Render constants into LLVM IR. This is done in a very similar manner as + // types. 
+ let mut llvm_constants = vec!["".to_string(); module.constants.len()]; + for id in module.constants_bottom_up() { + llvm_constants[id.idx()] = generate_constant_string( + id, + &module.constants[id.idx()], + &module.types, + &llvm_constants, + ); + } + + llvm_constants +} + +pub(crate) fn generate_dynamic_constant_strings(module: &Module) -> Vec<String> { + // Render dynamic constants into LLVM IR. + let mut llvm_dynamic_constants = vec!["".to_string(); module.dynamic_constants.len()]; + for id in (0..module.dynamic_constants.len()).map(DynamicConstantID::new) { + match &module.dynamic_constants[id.idx()] { + DynamicConstant::Constant(val) => llvm_dynamic_constants[id.idx()] = format!("{}", val), + DynamicConstant::Parameter(_) => { + llvm_dynamic_constants[id.idx()] = format!("%dyn_cons.{}", id.idx()) + } + } + } + + llvm_dynamic_constants +} diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs new file mode 100644 index 00000000..4f4726a7 --- /dev/null +++ b/hercules_cg/src/cpu.rs @@ -0,0 +1,1001 @@ +extern crate bitvec; +extern crate hercules_ir; +extern crate hercules_rt; + +use std::collections::HashMap; +use std::collections::VecDeque; + +use std::iter::zip; + +use std::fmt::Write; + +use self::bitvec::prelude::*; + +use self::hercules_ir::*; +use self::hercules_rt::manifest::*; + +use crate::*; + +/* + * When assembling LLVM basic blocks, we traverse the nodes in a partition in an + * ad-hoc order. Thus, we cannot assume block terminators will be visited after + * data nodes, for example. However, textual LLVM IR requires that the + * terminator instruction is last. So, we emit nodes into separate strings of + * LLVM IR that will get stichted together when the block is complete. + */ +#[derive(Debug)] +struct LLVMBlock { + header: String, + phis: String, + data: String, + terminator: String, +} + +impl<'a> FunctionContext<'a> { + /* + * Top level function to generate code for a partition, targeting the CPU. 
+ */ + pub(crate) fn codegen_cpu_partition<W: Write>( + &self, + top_node: NodeID, + w: &mut W, + ) -> Result<PartitionManifest, std::fmt::Error> { + // Step 1: do some analysis to get a bunch of per-partition information. + let partition_id = self.plan.partitions[top_node.idx()]; + let partition_context = PartitionContext::new(self, partition_id, top_node); + + // Step 2: emit the function signature. The partition function + // parameters are the function parameters, the partition data inputs, + // the array constant pointers, and the dynamic constants. + let mut partition_function_parameters = partition_context + // The data inputs to this partition. These are the data values + // calculated in a different partition in the same function. + .partition_input_types + .iter() + .enumerate() + .map(|(idx, ty_id)| { + ( + self.llvm_types[ty_id.idx()].clone(), + format!("%part_arg.{}", idx), + ) + }) + // The input types of the overall function. + .chain( + self.function + .param_types + .iter() + .enumerate() + .map(|(idx, ty_id)| { + ( + self.llvm_types[ty_id.idx()].clone(), + format!("%func_arg.{}", idx), + ) + }), + ) + // Array constants are passed in, pre-initialized. + .chain( + self.partition_array_constant_inputs() + .into_iter() + .map(|(id, ty_id)| { + ( + self.llvm_types[ty_id.idx()].clone(), + format!("%cons.{}", id.idx()), + ) + }), + ) + // Dynamic constants are passed in, since they are only known right + // before runtime. 
+ .chain( + self.partition_dynamic_constant_inputs() + .into_iter() + .map(|id| ("i64".to_string(), format!("%dyn_cons.{}", id.idx()))), + ); + + write!( + w, + "define {} @{}_part_{}(", + generate_type_string(&partition_context.return_type, &self.llvm_types), + self.function.name, + partition_id.idx(), + )?; + let (first_ty, first_param) = partition_function_parameters.next().unwrap(); + write!(w, "{} {}", first_ty, first_param)?; + for (ty, param) in partition_function_parameters { + write!(w, ", {} {}", ty, param)?; + } + write!(w, ") {{\n")?; + + // Step 3: set up basic blocks. A node represents a basic block if its + // entry in the basic blocks vector points to itself. + let mut llvm_bbs = HashMap::new(); + for id in &self.partitions_inverted_map[partition_id.idx()] { + if self.bbs[id.idx()] == *id { + llvm_bbs.insert( + id, + LLVMBlock { + header: format!("bb_{}:\n", id.idx()), + phis: "".to_string(), + data: "".to_string(), + terminator: "".to_string(), + }, + ); + } + } + + // Step 4: emit nodes. Nodes are emitted into basic blocks separately as + // nodes are not necessarily emitted in order. Assemble worklist of + // nodes, starting as reverse post order of nodes. For non-phi and non- + // reduce nodes, only emit once all data uses are emitted. In addition, + // consider additional anti-dependence edges from read to write nodes. + let mut visited = bitvec![u8, Lsb0; 0; self.function.nodes.len()]; + let mut worklist = VecDeque::from(partition_context.reverse_postorder.clone()); + while let Some(id) = worklist.pop_front() { + if !(self.function.nodes[id.idx()].is_phi() + || self.function.nodes[id.idx()].is_reduce()) + && !get_uses(&self.function.nodes[id.idx()]) + .as_ref() + .into_iter() + // If this node isn't a phi or reduce, we need to check that + // all uses, as well as all reads we anti-depend with, have + // been emitted. 
+ .chain(self.antideps.iter().filter_map(|(read, write)| { + if id == *write { + Some(read) + } else { + None + } + })) + // Only data dependencies inside this partition need to have + // already been visited. + .all(|id| { + self.plan.partitions[id.idx()] != partition_id + || self.function.nodes[id.idx()].is_control() + || visited[id.idx()] + }) + { + // Skip emitting node if it's not a phi or reduce node and if + // its data uses are not emitted yet. + worklist.push_back(id); + } else { + // Once all of the data dependencies for this node are emitted, + // this node can be emitted. For reduce nodes specifically, we + // want to emit the phi in the fork's basic block, not the + // join's, so we handle that ugly case here. This is because + // there is a fundamental mismatch between Hercules' notion of + // reductions and LLVM's phi nodes. This is ok, since we can + // translate between the two. It's just a pain. + let bb = if let Node::Reduce { + control, + init: _, + reduct: _, + } = self.function.nodes[id.idx()] + { + // Figure out the fork corresponding to the associated join. + let fork_id = if let Node::Join { control } = self.function.nodes[control.idx()] + { + if let Type::Control(factors) = + &self.types[self.typing[control.idx()].idx()] + { + *factors.last().unwrap() + } else { + panic!("PANIC: Type of join node associated with reduce node is not a control type.") + } + } else { + panic!("PANIC: Node associated with reduce node isn't a join node.") + }; + + // Emit in the basic block of the fork. + llvm_bbs.get_mut(&self.bbs[fork_id.idx()]).unwrap() + } else { + // In the normal case, emit in the basic block the node has + // been actually assigned to. + llvm_bbs.get_mut(&self.bbs[id.idx()]).unwrap() + }; + partition_context.codegen_cpu_node(id, bb)?; + visited.set(id.idx(), true); + } + } + + // Step 5: emit the now completed basic blocks, in order. Emit a dummy + // header block to unconditionally jump to the "top" basic block. 
+ write!(w, "bb_header:\n br label %bb_{}\n", top_node.idx())?; + for id in partition_context.reverse_postorder { + if self.bbs[id.idx()] == id { + write!( + w, + "{}{}{}{}", + llvm_bbs[&id].header, + llvm_bbs[&id].phis, + llvm_bbs[&id].data, + llvm_bbs[&id].terminator + )?; + } + } + + // Step 6: close the partition function - we're done. The partition + // manifest is created by the partition context. + write!(w, "}}\n\n")?; + Ok(partition_context.manifest) + } +} + +impl<'a> PartitionContext<'a> { + /* + * Emit LLVM IR implementing a single node. + */ + fn codegen_cpu_node(&self, id: NodeID, bb: &mut LLVMBlock) -> std::fmt::Result { + // Helper to emit code to index a collection. All collections are + // pointers to some memory at the LLVM IR level. This memory is passed + // in as a parameter for anything involving arrays, and is alloca-ed for + // product and summation types. + let mut generate_index_code = |collect: NodeID, indices: &[Index]| -> std::fmt::Result { + // Step 1: calculate the list of collection types corresponding to + // each index. + let mut collection_ty_ids = vec![]; + let mut curr_ty_id = self.function.typing[collect.idx()]; + for index in indices { + match (index, &self.function.types[curr_ty_id.idx()]) { + (Index::Field(idx), Type::Product(ty_ids)) + | (Index::Variant(idx), Type::Summation(ty_ids)) => { + collection_ty_ids.push(curr_ty_id); + curr_ty_id = ty_ids[*idx]; + } + (Index::Position(_), Type::Array(elem_ty_id, _)) => { + collection_ty_ids.push(curr_ty_id); + curr_ty_id = *elem_ty_id; + } + _ => { + panic!("PANIC: Found unsupported combination of index and collection type.") + } + } + } + assert!( + self.function.types[curr_ty_id.idx()].is_primitive(), + "PANIC: Cannot generate partial indexing code." + ); + + // Step 2: calculate, as LLVM IR values, the stride and offset + // needed at each level of the collection. 
For products, the stride + // is calculated using a getelementptr hack (and is the size of the + // struct), and the offset corresponds to the field index (which is + // translated to an offset using another getelementptr hack). For + // arrays, the stride is the dynamic constant extent multiplied by + // the stride of the element type, and the offset is the position + // index multiplied by the stride of the element type. Additionally, + // emit code to add up all of the offsets to get a total offset into + // the collection. TODO: to support summations, and arrays in + // arbitrary places, we need to not use the hacky getelementptr + // technique, since LLVM IR can't represent arrays (in the Hercules + // sense) or summations as primitive types. Instead, we need to do + // collection memory layout entirely ourselves. + let elem_llvm_ty = &self.function.llvm_types[curr_ty_id.idx()]; + write!(bb.data, " %index{}.{}.total_offset = add i64 0, 0\n %index{}.{}.stride.ptrhack = getelementptr {}, ptr null, i64 1\n %index{}.{}.stride = ptrtoint ptr %index{}.{}.stride.ptrhack to i64\n", + id.idx(), indices.len(), id.idx(), indices.len(), elem_llvm_ty, id.idx(), indices.len(), id.idx(), indices.len() + )?; + for (idx, index) in indices.into_iter().enumerate().rev() { + match index { + Index::Field(field) => { + let product_llvm_ty = + &self.function.llvm_types[collection_ty_ids[idx].idx()]; + write!( + bb.data, + " %index{}.{}.stride.ptrhack = getelementptr {}, ptr null, i64 1\n %index{}.{}.stride = ptrtoint ptr %index{}.{}.stride.ptrhack to i64\n %index{}.{}.offset.ptrhack = getelementptr {}, ptr null, i64 0, i64 {}\n %index{}.{}.offset = ptrtoint ptr %index{}.{}.offset.ptrhack to i64\n", + id.idx(), idx, + product_llvm_ty, + id.idx(), idx, + id.idx(), idx, + id.idx(), idx, + product_llvm_ty, + field, + id.idx(), idx, + id.idx(), idx, + )?; + } + Index::Variant(_) => todo!(), + Index::Position(position) => { + let array_extents = 
self.function.types[collection_ty_ids[idx].idx()] + .try_extents() + .unwrap(); + + // TODO: calculate stride for arrays, needed for arrays + // nested in other collections. + write!(bb.data, " %index{}.{}.offset.add.0 = add ", id.idx(), idx)?; + self.cpu_emit_use_of_node(position[0], Some(id), true, &mut bb.data)?; + write!(bb.data, ", {}\n", 0)?; + for (dim_idx, (extent_dc_id, position_id)) in + zip(array_extents, position.into_iter()).enumerate().skip(1) + { + write!( + bb.data, + " %index{}.{}.offset.mul.{} = mul i64 {}, %index{}.{}.offset.add.{}\n", + id.idx(), idx, + dim_idx, + self.function.llvm_dynamic_constants[extent_dc_id.idx()], + id.idx(), idx, + dim_idx - 1 + )?; + write!( + bb.data, + " %index{}.{}.offset.add.{} = add ", + id.idx(), + idx, + dim_idx + )?; + self.cpu_emit_use_of_node(*position_id, Some(id), true, &mut bb.data)?; + write!( + bb.data, + ", %index{}.{}.offset.mul.{}\n", + id.idx(), + idx, + dim_idx + )?; + } + write!(bb.data, " %index{}.{}.offset = mul i64 %index{}.{}.stride, %index{}.{}.offset.add.{}\n", id.idx(), idx, id.idx(), idx + 1, id.idx(), idx, position.len() - 1)?; + } + Index::Control(_) => panic!( + "PANIC: Found control index when generating collection indexing code." + ), + } + write!( + bb.data, + " %index{}.{}.total_offset = add i64 %index{}.{}.total_offset, %index{}.{}.offset\n", + id.idx(), idx, + id.idx(), idx + 1, + id.idx(), idx + )?; + } + + // Step 3: emit the getelementptr using the total collection offset. + write!(bb.data, " %index{} = getelementptr i8, ", id.idx(),)?; + self.cpu_emit_use_of_node(collect, Some(id), true, &mut bb.data)?; + write!(bb.data, ", i64 %index{}.0.total_offset\n", id.idx())?; + + Ok(()) + }; + + // Helper to find the basic block corresponding to a particular control + // predecessor, for phi nodes. This is needed for when a predecessor + // basic block is in a different partition. In this case, the phi's + // control predecessor is set to the top block of the partition. 
+ let get_phi_predecessor = |pred_id: NodeID| { + if self.function.plan.partitions[pred_id.idx()] == self.partition_id { + format!("{}", self.function.bbs[pred_id.idx()].idx()) + } else { + format!("header") + } + }; + + // Emit the primary IR for each node. + match self.function.function.nodes[id.idx()] { + Node::Start | Node::Region { preds: _ } => { + // Basic blocks containing a start or region node branch + // unconditionally to their single successor. + let successor = self + .function + .def_use + .get_users(id) + .iter() + .filter(|id| self.function.function.nodes[id.idx()].is_strictly_control()) + .next() + .unwrap(); + bb.terminator = format!(" br label %bb_{}\n", successor.idx()); + } + Node::If { control: _, cond } => { + let successors = self.function.def_use.get_users(id); + + // Determine the order of the successors (true/false or false/ + // true) in the successors slice. + let rev = if let Node::Read { + collect: _, + indices, + } = &self.function.function.nodes[successors[0].idx()] + { + indices[0] != Index::Control(0) + } else { + panic!("PANIC: Successor of if node isn't a read node.") + }; + bb.terminator = " br ".to_string(); + self.cpu_emit_use_of_node(cond, Some(id), true, &mut bb.terminator)?; + write!( + bb.terminator, + ", label %bb_{}, label %bb_{}\n", + successors[(!rev) as usize].idx(), + successors[rev as usize].idx() + )?; + } + Node::Fork { control, factor: _ } => { + // Calculate the join and successor. + let join = self.function.fork_join_map[&id]; + let successor = self + .function + .def_use + .get_users(id) + .iter() + .filter(|id| self.function.function.nodes[id.idx()].is_strictly_control()) + .next() + .unwrap(); + + // Create the phi node for the loop index. This is used directly + // by any thread ID user nodes. The control predecessor basic + // blocks are the control node preceding the fork and the + // corresponding join. 
+ write!(bb.phis, " ")?; + self.cpu_emit_use_of_node(id, None, false, &mut bb.phis)?; + write!( + bb.phis, + " = phi i64 [ 0, %bb_{} ], [ %fork_inc{}, %bb_{} ]\n", + get_phi_predecessor(self.function.bbs[control.idx()]), + id.idx(), + get_phi_predecessor(self.function.bbs[join.idx()]), + )?; + + // Increment the loop index by one each iteration. + write!(bb.data, " %fork_inc{} = add i64 1, ", id.idx())?; + self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?; + write!(bb.data, "\n")?; + + // Branch to the successor basic block. + write!( + bb.terminator, + " br label %bb_{}\n", + self.function.bbs[successor.idx()].idx() + )?; + } + Node::Join { control } => { + // Get the fork, it's factor, and the successor to this join. + let fork_id = if let Type::Control(factors) = + &self.function.types[self.function.typing[control.idx()].idx()] + { + *factors.last().unwrap() + } else { + panic!("PANIC: The type of a join node is incorrect.") + }; + let factor = if let Node::Fork { control: _, factor } = + &self.function.function.nodes[fork_id.idx()] + { + *factor + } else { + panic!("PANIC: The node referenced by the control type of a join node is not a fork.") + }; + let successor = self + .function + .def_use + .get_users(id) + .iter() + .filter(|id| self.function.function.nodes[id.idx()].is_strictly_control()) + .next() + .unwrap(); + + // Form the bottom of the loop. Check if the loop is finished, + // and branch between the successor and the fork. The structure + // of this loop implies that fork-joins have to iterate at least + // once. Change the loop termination branch target if this is a + // control return (see comment below for more details). 
+ let is_control_return = self.control_returns.contains(&id); + write!( + bb.terminator, + " %join_cond{} = icmp ult i64 %fork_inc{}, {}\n", + id.idx(), + fork_id.idx(), + self.function.llvm_dynamic_constants[factor.idx()] + )?; + write!( + bb.terminator, + " br i1 %join_cond{}, label %bb_{}, label %bb_{}\n", + id.idx(), + self.function.bbs[fork_id.idx()].idx(), + if is_control_return { + format!("{}_join_cr", id.idx()) + } else { + format!("{}", self.function.bbs[successor.idx()].idx()) + } + )?; + + // Join nodes are the only node that can be a control return + // from a partition and generate a conditional branch. This + // means we have to do this really ugly hack where we insert + // another basic block to be the control return that we + // conditionally branch to. Other control nodes that may be + // control returns don't have this problem, because they always + // unconditionally branch to their destination. We add this LLVM + // IR text of a new basic block in the terminator of the current + // basic block, since we don't have mutable access here to the + // set of all LLVM basic blocks. + if is_control_return { + write!(bb.terminator, "bb_{}_join_cr:\n", id.idx())?; + } + } + Node::Phi { + control: _, + ref data, + } => { + // For each predecessor of the associated region, we determine + // if that predecessor is in this partition or not. If so, then + // the predecessor control is just the basic block of the + // predecessor control node. If not, the predecessor control is + // the first basic block of the partition. The corresponding + // datum also needs to be provided by argument to the partition, + // and this is handled by cpu_emit_use_of_node. 
+ let pred_ids = + get_uses(&self.function.function.nodes[self.function.bbs[id.idx()].idx()]); + let mut control_datum_pairs = zip(data.into_iter(), pred_ids.as_ref().iter()) + .map(|(datum, pred_id)| (*datum, get_phi_predecessor(*pred_id))); + + // TODO: this code burns my eyes to look at, it might be worth + // making this not carcinogenic. + write!(bb.phis, " ")?; + self.cpu_emit_use_of_node(id, None, false, &mut bb.phis)?; + write!( + bb.phis, + " = phi {} [ ", + self.function.llvm_types[self.function.typing[id.idx()].idx()] + )?; + let (first_data, first_control) = control_datum_pairs.next().unwrap(); + self.cpu_emit_use_of_node(first_data, Some(id), false, &mut bb.phis)?; + write!(bb.phis, ", %bb_{} ]", first_control)?; + for (data, control) in control_datum_pairs { + write!(bb.phis, ", [ ")?; + self.cpu_emit_use_of_node(data, Some(id), false, &mut bb.phis)?; + write!(bb.phis, ", %bb_{} ]", control)?; + } + write!(bb.phis, "\n")?; + } + Node::ThreadID { control } => { + // Just bitcast the loop index from the fork. The bitcast is a + // no-op, but we add it to copy the value from the virtual + // register the fork generates to the virtual register + // corresponding to this thread ID node. + assert!(self.function.function.nodes[control.idx()].is_fork()); + write!(bb.data, " ")?; + self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?; + write!(bb.data, " = bitcast i64 ",)?; + self.cpu_emit_use_of_node(control, Some(id), false, &mut bb.data)?; + write!(bb.data, " to i64\n",)?; + } + Node::Reduce { + control, + init, + reduct, + } => { + // Figure out the fork corresponding to the associated join. 
+ let fork_id = if let Node::Join { control } = + self.function.function.nodes[control.idx()] + { + if let Type::Control(factors) = + &self.function.types[self.function.typing[control.idx()].idx()] + { + *factors.last().unwrap() + } else { + panic!("PANIC: Type of join node associated with reduce node is not a control type.") + } + } else { + panic!("PANIC: Node associated with reduce node isn't a join node.") + }; + + // Figure out the fork's predecessor. + let pred = if let Node::Fork { control, factor: _ } = + self.function.function.nodes[fork_id.idx()] + { + control + } else { + panic!("PANIC: Node referenced in type of join node associated with a reduce node is not a fork node.") + }; + + // Reduce nodes just lower to phi nodes. We already did the ugly + // hack so that "bb" refers to the basic block of the fork, + // rather than the join. So, now we just need to emit the phi. + write!(bb.phis, " ")?; + self.cpu_emit_use_of_node(id, Some(id), false, &mut bb.phis)?; + write!( + bb.phis, + " = phi {} [ ", + self.function.llvm_types[self.function.typing[id.idx()].idx()] + )?; + self.cpu_emit_use_of_node(init, Some(id), false, &mut bb.phis)?; + write!( + bb.phis, + ", %bb_{} ], [ ", + get_phi_predecessor(self.function.bbs[pred.idx()]) + )?; + self.cpu_emit_use_of_node(reduct, Some(id), false, &mut bb.phis)?; + write!( + bb.phis, + ", %bb_{} ]\n", + get_phi_predecessor(self.function.bbs[control.idx()]) + )?; + } + // These nodes are handled by other mechanisms in the code lowering + // process. 
+ Node::Return { + control: _, + data: _, + } + | Node::Parameter { index: _ } + | Node::Constant { id: _ } + | Node::DynamicConstant { id: _ } => {} + Node::Binary { left, right, op } => { + let op = match op { + BinaryOperator::Add => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fadd" + } else { + "add" + } + } + BinaryOperator::Sub => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fsub" + } else { + "sub" + } + } + BinaryOperator::Mul => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fmul" + } else { + "mul" + } + } + BinaryOperator::Div => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fdiv" + } else if self.function.types[self.function.typing[left.idx()].idx()] + .is_unsigned() + { + "udiv" + } else { + "sdiv" + } + } + BinaryOperator::Rem => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "frem" + } else if self.function.types[self.function.typing[left.idx()].idx()] + .is_unsigned() + { + "urem" + } else { + "srem" + } + } + BinaryOperator::LT => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fcmp olt" + } else if self.function.types[self.function.typing[left.idx()].idx()] + .is_unsigned() + { + "icmp ult" + } else { + "icmp slt" + } + } + BinaryOperator::LTE => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fcmp ole" + } else if self.function.types[self.function.typing[left.idx()].idx()] + .is_unsigned() + { + "icmp ule" + } else { + "icmp sle" + } + } + BinaryOperator::GT => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fcmp ogt" + } else if self.function.types[self.function.typing[left.idx()].idx()] + .is_unsigned() + { + "icmp ugt" + } else { + "icmp sgt" + } + } + BinaryOperator::GTE => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + 
"fcmp oge" + } else if self.function.types[self.function.typing[left.idx()].idx()] + .is_unsigned() + { + "icmp uge" + } else { + "icmp sge" + } + } + BinaryOperator::EQ => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fcmp oeq" + } else { + "icmp eq" + } + } + BinaryOperator::NE => { + if self.function.types[self.function.typing[left.idx()].idx()].is_float() { + "fcmp one" + } else { + "icmp ne" + } + } + BinaryOperator::Or => "or", + BinaryOperator::And => "and", + BinaryOperator::Xor => "xor", + BinaryOperator::LSh => "lsh", + BinaryOperator::RSh => { + if self.function.types[self.function.typing[left.idx()].idx()].is_unsigned() + { + "lshr" + } else { + "ashr" + } + } + }; + write!(bb.data, " ")?; + self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?; + write!(bb.data, " = {} ", op)?; + self.cpu_emit_use_of_node(left, Some(id), true, &mut bb.data)?; + write!(bb.data, ", ")?; + self.cpu_emit_use_of_node(right, Some(id), false, &mut bb.data)?; + write!(bb.data, "\n")?; + } + Node::Read { + collect, + ref indices, + } => { + if self.function.function.nodes[collect.idx()].is_strictly_control() { + // Read nodes may be projection succesors of if or match + // nodes. 
+ let successor = self.function.def_use.get_users(id)[0]; + write!( + bb.terminator, + " br label %bb_{}\n", + self.function.bbs[successor.idx()].idx() + )?; + } else { + generate_index_code(collect, indices)?; + write!(bb.data, " ")?; + self.cpu_emit_use_of_node(id, Some(id), false, &mut bb.data)?; + write!( + bb.data, + " = load {}, ptr %index{}\n", + self.function.llvm_types[self.function.typing[id.idx()].idx()], + id.idx(), + )?; + } + } + Node::Write { + collect, + data, + ref indices, + } => { + generate_index_code(collect, indices)?; + write!( + bb.data, + " store {} ", + self.function.llvm_types[self.function.typing[data.idx()].idx()] + )?; + self.cpu_emit_use_of_node(data, Some(id), false, &mut bb.data)?; + write!(bb.data, ", ptr %index{}\n", id.idx())?; + + // We can't just "copy" in LLVM IR, but we want to forward the + // pointer, unchanged, as the "output" of this write node. The + // easiest way to do this is to insert a useless bitcast. + write!(bb.data, " ")?; + self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?; + write!(bb.data, " = bitcast ptr ")?; + self.cpu_emit_use_of_node(collect, Some(id), false, &mut bb.data)?; + write!(bb.data, " to ptr\n")?; + } + _ => { + eprintln!("TO LOWER: {:?}", self.function.function.nodes[id.idx()]); + } + } + + // If this node is a control return, we emit a return from this + // partition function. + if self.control_returns.contains(&id) { + // Get rid of the old terminator, replace with return. Don't do this + // if this node is a join node, since in that specific case we + // generate specific control return logic. See the join node codegen + // above for more details. + if !self.function.function.nodes[id.idx()].is_join() { + bb.terminator.clear(); + } + + // Making structs from the aggregated values in LLVM IR is a pain. + // We need to, one-by-one, insertvalue each element into the struct. 
+ let ret_ty_str = generate_type_string(&self.return_type, &self.function.llvm_types); + for (idx, data_output_id) in self.data_outputs.iter().enumerate() { + write!( + bb.terminator, + " %ret_agg{}.{} = insertvalue {} {}, ", + id.idx(), + idx, + ret_ty_str, + if idx == 0 { + "undef".to_string() + } else { + format!("%ret_agg{}.{}", id.idx(), idx - 1) + } + )?; + let mut data_output_id = *data_output_id; + + // Handle reduce specially here. Technically, the "user" here is + // the join node, so cpu_emit_use_of_node would normally emit + // the reduce node's virtual register directly. However, if a + // data output is the result of a reduce node, that is + // definitely outside for the corresponding fork-join. Thus, we + // actually need to use the reduction use of the reduce node. + // This all only applies if the reduce node is in the current + // partition. If not, then use the reduce node as the argument + // to cpu_emit_use_of_node as normal, so that the partition + // function argument is properly used. + while let Node::Reduce { + control: _, + init: _, + reduct, + } = self.function.function.nodes[data_output_id.idx()] + && self.partition_id == self.function.plan.partitions[data_output_id.idx()] + { + data_output_id = reduct; + } + self.cpu_emit_use_of_node(data_output_id, None, true, &mut bb.terminator)?; + write!(bb.terminator, ", {}\n", idx)?; + } + + // Now, we can return the aggregate value we calculated. + if self.data_outputs.is_empty() && self.control_returns.len() == 1 { + // If there are no data outputs, just return the empty struct. + write!(bb.terminator, " ret {} zeroinitializer\n", ret_ty_str)?; + } else if self.data_outputs.is_empty() { + // If there are multiple control returns, we need to return the + // node ID of the control return, so that the runtime can do + // control flow between partitions. In this case, there aren't + // any data outputs that also need to be returned. 
+ write!(bb.terminator, " %ret_agg{}.ctrl_pos = insertvalue {} undef, i64 {}, 0\n ret {} %ret_agg{}.ctrl_pos\n", + id.idx(), + ret_ty_str, + id.idx(), + ret_ty_str, + id.idx() + )?; + } else if self.control_returns.len() == 1 { + // In the normal case, we return the struct containing just the + // data outputs. + write!( + bb.terminator, + " ret {} %ret_agg{}.{}\n", + ret_ty_str, + id.idx(), + self.data_outputs.len() - 1, + )?; + } else { + // If there are multiple control returns from this partition and + // there are data outputs, we add the control return node ID to + // the return aggregate. + write!( + bb.terminator, + " %ret_agg{}.ctrl_pos = insertvalue {} %ret_agg{}.{}, i64 {}, {}\n ret {} %ret_agg{}.ctrl_pos\n", + id.idx(), + ret_ty_str, + id.idx(), + self.data_outputs.len() - 1, + id.idx(), + self.data_outputs.len(), + ret_ty_str, + id.idx(), + )?; + } + } + + Ok(()) + } + + /* + * Emit the LLVM value corresponding to a node. Optionally prefix with the + * LLVM type, which is required by textual LLVM IR in a few places. + * Optionally provide the node that will be using this emission. This is + * unused by all emitted node values except reduce nodes, which require the + * user argument to be given. We chose this interface because at the + * callsite of a cpu_emit_use_of_node, it is always known whether this thing + * being emitted could (or should) possibly be a reduce node. If not, then + * providing none gives a nice early panic when it is a reduce node, either + * because the developer misjudged or because there is a bug. + */ + fn cpu_emit_use_of_node<W: Write>( + &self, + id: NodeID, + user: Option<NodeID>, + emit_type: bool, + w: &mut W, + ) -> std::fmt::Result { + // First, emit the type before the value (if applicable). + if emit_type { + write!( + w, + "{} ", + self.function.llvm_types[self.function.typing[id.idx()].idx()] + )?; + } + + // Emitting the value can be surprisingly complicated, depending on what + // the node is. 
For example, partition arguments are emitted specially. + if let Some(input_idx) = self.data_inputs.iter().position(|inp_id| *inp_id == id) { + // If a use is in another partition, it needs to get passed to this + // partition's function as a parameter. + write!(w, "%part_arg.{}", input_idx)?; + } else { + match self.function.function.nodes[id.idx()] { + // Parameter nodes in this partition also represent parameters + // to this partition function. + Node::Parameter { index } => write!(w, "%func_arg.{}", index)?, + // Constants are pre-defined. + Node::Constant { id } => write!(w, "{}", self.function.llvm_constants[id.idx()])?, + Node::DynamicConstant { id } => { + write!(w, "{}", self.function.llvm_dynamic_constants[id.idx()])? + } + // Reduce nodes, as usual, are not nice to handle. We need to + // emit different LLVM depending on whether the user is inside + // or outside the reduce's corresponding fork-join nest. Inside, + // we emit as usual, since the user needs to use the phi node + // inside the reduction loop. Outside, we need to use the reduct + // use of the reduce node, so that we don't grab the reduction + // variable one loop iteration too early. + Node::Reduce { + control, + init: _, + reduct, + } => { + // Figure out the fork corresponding to the associated join. + let fork_id = if let Node::Join { control } = + self.function.function.nodes[control.idx()] + { + if let Type::Control(factors) = + &self.function.types[self.function.typing[control.idx()].idx()] + { + *factors.last().unwrap() + } else { + panic!() + } + } else { + panic!() + }; + + // Check if the basic block containing the user node is in + // the fork-join nest for this reduce node. We make the user + // node an optional argument as a debugging tool - if we + // exercise this code branch when generating the code for a + // node that absolutely should not be using the result of a + // reduce node, we would like to know! 
+ if self.function.fork_join_nest[&self.function.bbs[user.expect("PANIC: cpu_emit_use_of_node was called on a reduce node, but no user node ID was given.").idx()]] + .contains(&fork_id) + { + // If the user is inside the fork-join nest, then emit + // the reduce node directly. + assert_eq!(self.partition_id, self.function.plan.partitions[id.idx()]); + write!(w, "%virt.{}", id.idx())?; + } else { + // If the user is outside the fork-join nest, then + // recursively emit on the reduction input to the reduce + // node. This is needed when there is a reduce chain. + assert_eq!( + self.partition_id, + self.function.plan.partitions[reduct.idx()] + ); + self.cpu_emit_use_of_node(reduct, user, emit_type, w)?; + } + } + // Uses that are in this partition are just virtual registers. + // Clang is really annoying about numbering virtual registers, + // so to avoid that silliness we prepend all our virtual + // registers with a prefix indicating what kind of thing it is. + // For normal values, we use "virt" for virtual register. 
+ _ => { + assert_eq!(self.partition_id, self.function.plan.partitions[id.idx()]); + write!(w, "%virt.{}", id.idx())?; + } + } + } + + Ok(()) + } +} diff --git a/hercules_cg/src/cpu_beta.rs b/hercules_cg/src/cpu_beta.rs index e3974111..1a563143 100644 --- a/hercules_cg/src/cpu_beta.rs +++ b/hercules_cg/src/cpu_beta.rs @@ -3,6 +3,7 @@ extern crate hercules_ir; use std::collections::HashMap; use std::collections::VecDeque; + use std::fmt::Write; use std::iter::zip; diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index 2d1293de..280910dd 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -1,3 +1,11 @@ +#![feature(let_chains)] + +pub mod common; +pub mod cpu; pub mod cpu_beta; +pub mod top; +pub use crate::common::*; +pub use crate::cpu::*; pub use crate::cpu_beta::*; +pub use crate::top::*; diff --git a/hercules_cg/src/top.rs b/hercules_cg/src/top.rs new file mode 100644 index 00000000..b992344d --- /dev/null +++ b/hercules_cg/src/top.rs @@ -0,0 +1,135 @@ +extern crate hercules_ir; +extern crate hercules_rt; + +use std::collections::HashMap; +use std::fmt::Write; + +use self::hercules_ir::*; +use self::hercules_rt::manifest::*; + +use crate::*; + +/* + * Top level function to generate code for a module. Emits LLVM IR text. Calls + * out to backends to generate code for individual partitions. Creates a + * manifest describing the generated code. + */ +pub fn codegen<W: Write>( + module: &Module, + def_uses: &Vec<ImmutableDefUseMap>, + reverse_postorders: &Vec<Vec<NodeID>>, + typing: &ModuleTyping, + control_subgraphs: &Vec<Subgraph>, + fork_join_maps: &Vec<HashMap<NodeID, NodeID>>, + fork_join_nests: &Vec<HashMap<NodeID, Vec<NodeID>>>, + antideps: &Vec<Vec<(NodeID, NodeID)>>, + bbs: &Vec<Vec<NodeID>>, + plans: &Vec<Plan>, + w: &mut W, +) -> Result<ModuleManifest, std::fmt::Error> { + // Render types, constants, and dynamic constants into LLVM IR. 
+ let llvm_types = generate_type_strings(module); + let llvm_constants = generate_constant_strings(module); + let llvm_dynamic_constants = generate_dynamic_constant_strings(module); + + // Generate a dummy uninitialized global - this is needed so that there'll + // be a non-empty .bss section in the ELF object file. + write!(w, "@dummy = dso_local global i32 0, align 4\n")?; + + // Do codegen for each function individually. Get each function's manifest. + let mut manifests = vec![]; + for function_idx in 0..module.functions.len() { + // There's a bunch of per-function information we use. + let context = FunctionContext { + function: &module.functions[function_idx], + types: &module.types, + constants: &module.constants, + dynamic_constants: &module.dynamic_constants, + def_use: &def_uses[function_idx], + reverse_postorder: &reverse_postorders[function_idx], + typing: &typing[function_idx], + control_subgraph: &control_subgraphs[function_idx], + fork_join_map: &fork_join_maps[function_idx], + fork_join_nest: &fork_join_nests[function_idx], + antideps: &antideps[function_idx], + bbs: &bbs[function_idx], + plan: &plans[function_idx], + llvm_types: &llvm_types, + llvm_constants: &llvm_constants, + llvm_dynamic_constants: &llvm_dynamic_constants, + partitions_inverted_map: plans[function_idx].invert_partition_map(), + }; + + manifests.push(context.codegen_function(w)?); + } + + // Assemble the manifest for the whole module. + Ok(ModuleManifest { + functions: manifests, + types: module.types.clone(), + // TODO: populate array constants. + array_constants: vec![], + }) +} + +impl<'a> FunctionContext<'a> { + /* + * Each function gets codegened separately. + */ + fn codegen_function<W: Write>(&self, w: &mut W) -> Result<FunctionManifest, std::fmt::Error> { + // Find the "top" control node of each partition. One well-formedness + // condition of partitions is that there is exactly one "top" control + // node. 
+ let top_nodes: Vec<NodeID> = self + .partitions_inverted_map + .iter() + .enumerate() + .map(|(part_idx, part)| { + // For each partition, find the "top" node. + *part + .iter() + .filter(move |id| { + // The "top" node is a control node having at least one + // control predecessor in another partition, or is a + // start node. Every predecessor in the control subgraph + // is a control node. + self.function.nodes[id.idx()].is_start() + || (self.function.nodes[id.idx()].is_control() + && self + .control_subgraph + .preds(**id) + .filter(|pred_id| { + self.plan.partitions[pred_id.idx()].idx() != part_idx + }) + .count() + > 0) + }) + .next() + .unwrap() + }) + .collect(); + + // Generate code for each individual partition. This generates a single + // LLVM function per partition. These functions will be called in async + // tasks by the Hercules runtime. + assert_eq!(self.plan.num_partitions, top_nodes.len()); + let mut manifests = vec![]; + for part_idx in 0..self.plan.num_partitions { + match self.plan.partition_devices[part_idx] { + Device::CPU => manifests.push(self.codegen_cpu_partition(top_nodes[part_idx], w)?), + Device::GPU => todo!(), + } + } + + // Assemble the manifest for the whole function. + Ok(FunctionManifest { + name: self.function.name.clone(), + param_types: self.function.param_types.clone(), + typing: self.typing.clone(), + num_dynamic_constant_parameters: self.function.num_dynamic_constants, + partitions: manifests, + // TODO: populate dynamic constant rules. 
+ dynamic_constant_rules: vec![], + }) + } +} diff --git a/hercules_ir/Cargo.toml b/hercules_ir/Cargo.toml index 39fbebe5..b99c0877 100644 --- a/hercules_ir/Cargo.toml +++ b/hercules_ir/Cargo.toml @@ -7,4 +7,5 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@il rand = "*" nom = "*" ordered-float = "*" -bitvec = "*" \ No newline at end of file +bitvec = "*" +serde = { version = "*", features = ["derive"] } \ No newline at end of file diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 0688c137..075d10c1 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1,5 +1,6 @@ extern crate bitvec; extern crate ordered_float; +extern crate serde; use std::fmt::Write; use std::ops::Coroutine; @@ -7,6 +8,8 @@ use std::ops::CoroutineState; use std::pin::Pin; use self::bitvec::prelude::*; +use self::serde::Deserialize; +use self::serde::Serialize; use crate::*; @@ -52,7 +55,7 @@ pub struct Function { * parallelism. Summation types are an IR equivalent of Rust's enum types. * These are lowered into tagged unions during scheduling. */ -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum Type { Control(Box<[NodeID]>), Boolean, @@ -379,7 +382,7 @@ impl Module { let mut stack = (0..self.types.len()) .map(TypeID::new) .collect::<Vec<TypeID>>(); - let coroutine = move || { + let coroutine = #[coroutine] move || { // Since this is a coroutine, handle recursion manually. while let Some(id) = stack.pop() { if visited[id.idx()] { @@ -438,7 +441,7 @@ impl Module { let mut stack = (0..self.constants.len()) .map(ConstantID::new) .collect::<Vec<ConstantID>>(); - let coroutine = move || { + let coroutine = #[coroutine] move || { // Since this is a coroutine, handle recursion manually. 
while let Some(id) = stack.pop() { if visited[id.idx()] { @@ -593,7 +596,8 @@ impl<T: Clone> GraveUpdatable for Vec<T> { for (data, (idx, mapping)) in std::iter::zip(self.into_iter(), grave_mapping.iter().enumerate()) { - if idx != 0 && mapping.idx() == 0 { + if idx == 0 || mapping.idx() != 0 { + assert_eq!(new_self.len(), mapping.idx()); new_self.push(data.clone()); } } @@ -695,6 +699,22 @@ impl Constant { } } + pub fn try_array_type(&self, types: &[Type]) -> Option<TypeID> { + // Need types, since zero initializer may be for a collection type, or + not. + match self { + Constant::Array(ty, _) => Some(*ty), + Constant::Zero(ty) => { + if types[ty.idx()].is_primitive() { + None + } else { + Some(*ty) + } + } + _ => None, + } + } + /* * Useful for GVN. */ @@ -732,6 +752,16 @@ impl Constant { } } +impl DynamicConstant { + pub fn is_parameter(&self) -> bool { + if let DynamicConstant::Parameter(_) = self { + true + } else { + false + } + } +} + /* * Simple predicate functions on nodes take a lot of space, so use a macro. */ @@ -825,6 +855,9 @@ impl Node { data: _, } ); + define_pattern_predicate!(is_parameter, Node::Parameter { index: _ }); + define_pattern_predicate!(is_constant, Node::Constant { id: _ }); + define_pattern_predicate!(is_dynamic_constant, Node::DynamicConstant { id: _ }); define_pattern_predicate!( is_read, Node::Read { @@ -1148,7 +1181,18 @@ impl TernaryOperator { #[macro_export] macro_rules! 
define_id_type { ($x: ident) => { - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] + #[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + serde::Serialize, + serde::Deserialize, + )] pub struct $x(u32); impl $x { diff --git a/hercules_ir/src/lib.rs b/hercules_ir/src/lib.rs index 9da0276d..44252678 100644 --- a/hercules_ir/src/lib.rs +++ b/hercules_ir/src/lib.rs @@ -1,4 +1,4 @@ -#![feature(coroutines, coroutine_trait, let_chains)] +#![feature(coroutines, coroutine_trait, let_chains, stmt_expr_attributes)] pub mod antideps; pub mod build; diff --git a/hercules_ir/src/schedule.rs b/hercules_ir/src/schedule.rs index 9881c3be..f8c35276 100644 --- a/hercules_ir/src/schedule.rs +++ b/hercules_ir/src/schedule.rs @@ -189,7 +189,7 @@ pub fn default_plan( schedules: vec![vec![]; function.nodes.len()], partitions: vec![PartitionID::new(0); function.nodes.len()], partition_devices: vec![Device::CPU; 1], - num_partitions: 0, + num_partitions: 1, }; // Infer schedules. @@ -198,7 +198,8 @@ pub fn default_plan( // Infer a partitioning. partition_out_forks(function, reverse_postorder, fork_join_map, bbs, &mut plan); - place_fork_partitions_on_gpu(function, &mut plan); + // TODO: uncomment once GPU backend is implemented. + // place_fork_partitions_on_gpu(function, &mut plan); plan } @@ -349,13 +350,15 @@ pub fn partition_out_forks( reverse_postorder, |inputs: &[&NodeID], node_id: NodeID| match function.nodes[node_id.idx()] { Node::Start => NodeID::new(0), - Node::Fork { - control: _, - factor: _, - } => { + Node::Fork { control, factor: _ } => { // Start a partition if the preceding partition isn't a fork - // partition. Otherwise, be part of the parent fork partition. - if *inputs[0] != NodeID::top() && function.nodes[inputs[0].idx()].is_fork() { + // partition and the predecessor isn't the join for the + // predecessor fork partition. Otherwise, be part of the parent + // fork partition. 
+ if *inputs[0] != NodeID::top() + && function.nodes[inputs[0].idx()].is_fork() + && fork_join_map.get(&inputs[0]) != Some(&control) + { inputs[0].clone() } else { node_id diff --git a/hercules_ir/src/subgraph.rs b/hercules_ir/src/subgraph.rs index c0b7aa4b..6d76f6fc 100644 --- a/hercules_ir/src/subgraph.rs +++ b/hercules_ir/src/subgraph.rs @@ -37,6 +37,12 @@ impl<'a> Iterator for SubgraphIterator<'a> { } } +impl<'a> ExactSizeIterator for SubgraphIterator<'a> { + fn len(&self) -> usize { + self.edges.len() + } +} + impl IntoIterator for Subgraph { type Item = NodeID; type IntoIter = std::vec::IntoIter<Self::Item>; diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml index 5cf76b09..6da77f44 100644 --- a/hercules_opt/Cargo.toml +++ b/hercules_opt/Cargo.toml @@ -7,4 +7,7 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@il ordered-float = "*" bitvec = "*" take_mut = "*" +postcard = { version = "*", features = ["alloc"] } +serde = { version = "*", features = ["derive"] } hercules_ir = { path = "../hercules_ir" } +hercules_cg = { path = "../hercules_cg" } diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 394de3c0..ef79b36e 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -1,28 +1,26 @@ +extern crate hercules_cg; extern crate hercules_ir; +extern crate postcard; +extern crate serde; extern crate take_mut; use std::collections::HashMap; +use std::fs::File; +use std::io::prelude::*; use std::iter::zip; +use std::process::*; -use self::hercules_ir::antideps::*; -use self::hercules_ir::dataflow::*; -use self::hercules_ir::def_use::*; -use self::hercules_ir::dom::*; -use self::hercules_ir::dot::*; -use self::hercules_ir::gcm::*; -use self::hercules_ir::ir::*; -use self::hercules_ir::loops::*; -use self::hercules_ir::schedule::*; -use self::hercules_ir::subgraph::*; -use self::hercules_ir::typecheck::*; -use self::hercules_ir::verify::*; +use self::serde::Deserialize; + +use 
self::hercules_cg::*; +use self::hercules_ir::*; use crate::*; /* * Passes that can be run on a module. */ -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Deserialize)] pub enum Pass { DCE, CCP, @@ -31,7 +29,11 @@ pub enum Pass { PhiElim, Predication, Verify, + // Parameterized over whether analyses that aid visualization are necessary. + // Useful to set to false if displaying a potentially broken module. Xdot(bool), + // Parameterized by output file name. + Codegen(String), } /* @@ -221,7 +223,7 @@ impl PassManager { } pub fn make_bbs(&mut self) { - if self.antideps.is_none() { + if self.bbs.is_none() { self.make_def_uses(); self.make_reverse_postorders(); self.make_doms(); @@ -390,6 +392,61 @@ impl PassManager { // Xdot doesn't require clearing analysis results. continue; } + Pass::Codegen(output_file_name) => { + self.make_def_uses(); + self.make_reverse_postorders(); + self.make_typing(); + self.make_control_subgraphs(); + self.make_fork_join_maps(); + self.make_fork_join_nests(); + self.make_antideps(); + self.make_bbs(); + self.make_plans(); + + let mut llvm_ir = String::new(); + let manifest = codegen( + &self.module, + self.def_uses.as_ref().unwrap(), + self.reverse_postorders.as_ref().unwrap(), + self.typing.as_ref().unwrap(), + self.control_subgraphs.as_ref().unwrap(), + self.fork_join_maps.as_ref().unwrap(), + self.fork_join_nests.as_ref().unwrap(), + self.antideps.as_ref().unwrap(), + self.bbs.as_ref().unwrap(), + self.plans.as_ref().unwrap(), + &mut llvm_ir, + ) + .unwrap(); + + // Compile LLVM IR into ELF object. + let llc_process = Command::new("llc") + .arg("-filetype=obj") + .arg("-O3") + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + llc_process + .stdin + .as_ref() + .unwrap() + .write(llvm_ir.as_bytes()) + .unwrap(); + let elf_object = llc_process.wait_with_output().unwrap().stdout; + + // Package manifest and ELF object into the same file. 
+ let hbin_module = (manifest, elf_object); + let hbin_contents: Vec<u8> = postcard::to_allocvec(&hbin_module).unwrap(); + + let mut file = + File::create(output_file_name).expect("PANIC: Unable to open output file."); + file.write_all(&hbin_contents) + .expect("PANIC: Unable to write output file contents."); + + // Codegen doesn't require clearing analysis results. + continue; + } } // Cleanup the module after passes. Delete gravestone nodes. Repair diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml index 500265c4..3df92d6a 100644 --- a/hercules_rt/Cargo.toml +++ b/hercules_rt/Cargo.toml @@ -5,3 +5,6 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>"] [dependencies] libc = "*" +postcard = { version = "*", features = ["alloc"] } +serde = { version = "*", features = ["derive"] } +hercules_ir = { path = "../hercules_ir" } diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index f04c6c6a..2d19454d 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -1,27 +1,34 @@ +extern crate postcard; + use std::fs::File; use std::io::prelude::*; use std::path::Path; pub(crate) mod elf; +pub mod manifest; pub(crate) use crate::elf::*; +pub(crate) use crate::manifest::*; #[derive(Debug)] pub struct Module { + manifest: ModuleManifest, elf: Elf, } impl Module { - pub fn get_function_ptr(&self, name: &str) -> *mut u8 { - unsafe { - self.elf.program_section.offset( - self.elf.function_pointers[self - .elf - .function_names - .iter() - .position(|s| s == name) - .unwrap()], - ) - } + /* + * Get the function pointer corresponding to a function name. Panic if not + * found. 
+ */ + pub unsafe fn get_function_ptr(&self, name: &str) -> *mut u8 { + self.elf.program_section.offset( + self.elf.function_pointers[self + .elf + .function_names + .iter() + .position(|s| s == name) + .unwrap()], + ) } } @@ -29,8 +36,12 @@ pub fn load_binary(path: &Path) -> Module { let mut f = File::open(path).unwrap(); let mut buffer = vec![]; f.read_to_end(&mut buffer).unwrap(); - let elf = unsafe { parse_elf(buffer.as_slice()) }; - Module { elf } + let manifest_and_elf_bytes: (ModuleManifest, Vec<u8>) = postcard::from_bytes(&buffer).unwrap(); + let elf = unsafe { parse_elf(&manifest_and_elf_bytes.1) }; + Module { + manifest: manifest_and_elf_bytes.0, + elf, + } } /* diff --git a/hercules_rt/src/manifest.rs b/hercules_rt/src/manifest.rs new file mode 100644 index 00000000..c184e8b1 --- /dev/null +++ b/hercules_rt/src/manifest.rs @@ -0,0 +1,97 @@ +extern crate hercules_ir; +extern crate serde; + +use self::serde::Deserialize; +use self::serde::Serialize; + +use self::hercules_ir::ir::*; + +/* + * Every .hbin file contains a manifest which describes the Hercules functions + * contained in the module. This information is used by the runtime to execute + * the functions properly, the chief concern being how to stitch together the + * execution of each partition. + */ +#[derive(Debug, Serialize, Deserialize)] +pub struct ModuleManifest { + // A module contains a manifest per individual function. + pub functions: Vec<FunctionManifest>, + // All of the types used in the module. + pub types: Vec<Type>, + // The only constants that aren't baked into the generated code are array + // constants. These are explicitly stored in and loaded from the manifest. + // Arrays are composed of the underlying array bytes. We don't need to store + // the dimensions of arrays at this point, since the runtime doesn't + // manipulate or otherwise need the dimensions of constant arrays. 
+ pub array_constants: Vec<Vec<u8>>, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct FunctionManifest { + pub name: String, + // Types of the function parameters. + pub param_types: Vec<TypeID>, + // Types of all of the nodes in this function. Used for figuring out the + // type of partition data inputs and outputs. + pub typing: Vec<TypeID>, + // Number of dynamic constant parameters that need to be provided. + pub num_dynamic_constant_parameters: u32, + // Manifests for constituent partitions. + pub partitions: Vec<PartitionManifest>, + // When using dynamic constants, certain constraints are generated. For + // example, using a dynamic constant in a fork means that it must be non- + // zero, since fork-join nests are guaranteed to execute at least one + // iteration. Also, if one uses division in dynamic constant math, the + // resulting dynamic constant must be an integer, so the numerator dynamic + // constant must be divisible by the denominator dynamic constant. These are + // stored per function, since different functions have different constraints + // on their dynamic constant parameters. + pub dynamic_constant_rules: Vec<DynamicConstantRule>, +} + +/* + * Rules for validity of provided dynamic constants. Integers refer to dynamic + * constant parameters of a function. + */ +#[derive(Debug, Serialize, Deserialize)] +pub enum DynamicConstantRule { + // Generated from forks. + NonZero(u32), + // Generated from subtraction. + LessThan(u32, u32), + // Generated from division. + Divides(u32, u32), +} + +#[derive(Debug, Serialize, Deserialize)] +pub enum PartitionInput { + // Data input from another partition within this function. Integer is the + // node ID used from the other partition. + DataInput(u32), + // An argument from the function parameters. Integer is the parameter index. + FunctionArgument(u32), + // An array constant used in this function. Integer is the array constant + // number. 
+ ArrayConstant(u32), + // A dynamic constant parameter of this function. Integer is the dynamic + // constant parameter number. + DynamicConstant(u32), +} + +#[derive(Debug, Serialize, Deserialize)] +pub enum PartitionOutput { + // Data output used by another partition within this function, or to be + // returned from this function. Integer is the node ID used in the other + // partition or by a return node. + DataOutput(u32), + // Value indicating control flow that the runtime should take. + ControlIndicator, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct PartitionManifest { + // Top node for this partition, as an integer. + pub top_node: u32, + pub inputs: Vec<PartitionInput>, + pub outputs: Vec<PartitionOutput>, +} diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index 2a28cd80..36177bea 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ -3,17 +3,19 @@ extern crate clap; use std::path::Path; fn main() { - let module = hercules_rt::load_binary(Path::new("test.o")); + let module = hercules_rt::load_binary(Path::new("matmul.hbin")); + + println!("{:?}", module); let matmul = hercules_rt::lookup_function!( module, - "matmul", + "matmul_part_1", *const f32, *const f32, + *mut f32, u64, u64, u64, - *mut f32, => *const f32 ); @@ -24,10 +26,10 @@ fn main() { matmul( std::mem::transmute(a.as_ptr()), std::mem::transmute(b.as_ptr()), + std::mem::transmute(c.as_mut_ptr()), 2, 2, 2, - std::mem::transmute(c.as_mut_ptr()), ) }; println!("{} {}\n{} {}", c[0][0], c[0][1], c[1][0], c[1][1]); diff --git a/hercules_samples/sum_sample.hir b/hercules_samples/sum_sample.hir index 8b8c0024..2ff76749 100644 --- a/hercules_samples/sum_sample.hir +++ b/hercules_samples/sum_sample.hir @@ -1,8 +1,8 @@ -fn sum(a: array(f32, 16)) -> f32 +fn sum<1>(a: array(f32, #0)) -> f32 zero_idx = constant(u64, 0) one_idx = constant(u64, 1) zero_inc = constant(f32, 0) - bound = constant(u64, 16) + bound = 
dynamic_constant(#0) loop = region(start, if_true) idx = phi(loop, zero_idx, idx_inc) idx_inc = add(idx, one_idx) @@ -39,4 +39,4 @@ fn alt_sum<1>(a: array(f32, #0)) -> f32 if = if(negate_bottom, in_bounds) if_false = read(if, control(0)) if_true = read(if, control(1)) - r = return(if_false, red_add) \ No newline at end of file + r = return(if_false, red_add) diff --git a/hercules_samples/task_parallel.hir b/hercules_samples/task_parallel.hir new file mode 100644 index 00000000..6386d5ec --- /dev/null +++ b/hercules_samples/task_parallel.hir @@ -0,0 +1,14 @@ +fn task_parallel<1>() -> u64 + f_ctrl1 = fork(start, #0) + j_ctrl1 = join(f_ctrl1) + zero = constant(u64, 0) + x1 = thread_id(f_ctrl1) + data1 = reduce(j_ctrl1, zero, sum1) + sum1 = add(data1, x1) + f_ctrl2 = fork(j_ctrl1, #0) + j_ctrl2 = join(f_ctrl2) + x2 = thread_id(f_ctrl2) + data2 = reduce(j_ctrl2, zero, sum2) + sum2 = add(data2, x2) + final = add(data1, data2) + r = return(j_ctrl2, final) diff --git a/hercules_tools/hercules_cpu/Cargo.toml b/hercules_tools/hercules_cpu_beta/Cargo.toml similarity index 91% rename from hercules_tools/hercules_cpu/Cargo.toml rename to hercules_tools/hercules_cpu_beta/Cargo.toml index 58519ce5..38b30e82 100644 --- a/hercules_tools/hercules_cpu/Cargo.toml +++ b/hercules_tools/hercules_cpu_beta/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "hercules_cpu" +name = "hercules_cpu_beta" version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] diff --git a/hercules_tools/hercules_cpu/src/main.rs b/hercules_tools/hercules_cpu_beta/src/main.rs similarity index 94% rename from hercules_tools/hercules_cpu/src/main.rs rename to hercules_tools/hercules_cpu_beta/src/main.rs index c1b66ede..2edd426a 100644 --- a/hercules_tools/hercules_cpu/src/main.rs +++ b/hercules_tools/hercules_cpu_beta/src/main.rs @@ -17,7 +17,7 @@ struct Args { fn main() { let args = Args::parse(); if !args.hir_file.ends_with(".hir") { - eprintln!("WARNING: Running hercules_cpu on a file without a .hir 
extension - interpreting as a textual Hercules IR file."); + eprintln!("WARNING: Running hercules_cpu_beta on a file without a .hir extension - interpreting as a textual Hercules IR file."); } let mut file = File::open(args.hir_file).expect("PANIC: Unable to open input file."); diff --git a/hercules_tools/hercules_driver/Cargo.toml b/hercules_tools/hercules_driver/Cargo.toml new file mode 100644 index 00000000..aa6d4f5e --- /dev/null +++ b/hercules_tools/hercules_driver/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "hercules_driver" +version = "0.1.0" +authors = ["Russel Arbore <rarbore2@illinois.edu>"] + +[dependencies] +clap = { version = "*", features = ["derive"] } +ron = "*" +hercules_ir = { path = "../../hercules_ir" } +hercules_opt = { path = "../../hercules_opt" } diff --git a/hercules_tools/hercules_driver/src/main.rs b/hercules_tools/hercules_driver/src/main.rs new file mode 100644 index 00000000..17be3596 --- /dev/null +++ b/hercules_tools/hercules_driver/src/main.rs @@ -0,0 +1,44 @@ +extern crate clap; + +use std::fs::File; +use std::io::prelude::*; + +use clap::Parser; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + hir_file: String, + passes: String, +} + +fn main() { + let args = Args::parse(); + if !args.hir_file.ends_with(".hir") { + eprintln!("WARNING: Running hercules_driver on a file without a .hir extension - interpreting as a textual Hercules IR file."); + } + + let mut file = File::open(args.hir_file).expect("PANIC: Unable to open input file."); + let mut contents = String::new(); + file.read_to_string(&mut contents) + .expect("PANIC: Unable to read input file contents."); + let module = + hercules_ir::parse::parse(&contents).expect("PANIC: Failed to parse Hercules IR file."); + + let mut pm = hercules_opt::pass::PassManager::new(module); + let passes: Vec<hercules_opt::pass::Pass> = args + .passes + .split(char::is_whitespace) + .map(|pass_str| { + assert_ne!( + pass_str, "", + "PANIC: 
Can't interpret empty pass name. Try giving a list of pass names." + ); + ron::from_str(pass_str).expect("PANIC: Couldn't parse list of passes.") + }) + .collect(); + for pass in passes { + pm.add_pass(pass); + } + pm.run_passes(); +} -- GitLab