diff --git a/Cargo.lock b/Cargo.lock index 61cde7f161b7c4177cb781addeb2f484af3b7477..acc9783042434d0569ed2fe7f1f577a43fc597d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,12 +23,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - [[package]] name = "anes" version = "0.1.6" @@ -684,27 +678,6 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" -[[package]] -name = "egg" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb749745461743bb477fba3ef87c663d5965876155c676c9489cfe0963de5ab" -dependencies = [ - "env_logger", - "hashbrown", - "indexmap", - "log", - "num-bigint", - "num-traits", - "quanta", - "rustc-hash", - "saturating", - "smallvec", - "symbol_table", - "symbolic_expressions", - "thiserror", -] - [[package]] name = "either" version = "1.13.0" @@ -723,15 +696,6 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" -[[package]] -name = "env_logger" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" -dependencies = [ - "log", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -845,12 +809,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foldhash" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" - [[package]] name = "funty" version = "2.0.0" @@ -975,11 +933,6 @@ name = "hashbrown" version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash", -] [[package]] name = "heapless" @@ -1047,7 +1000,6 @@ version = "0.1.0" dependencies = [ "bimap", "bitvec", - "egg", "either", "hercules_cg", "hercules_ir", @@ -2117,21 +2069,6 @@ dependencies = [ "bytemuck", ] -[[package]] -name = "quanta" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" -dependencies = [ - "crossbeam-utils", - "libc", - "once_cell", - "raw-cpuid", - "wasi 0.11.0+wasi-snapshot-preview1", - "web-sys", - "winapi", -] - [[package]] name = "quick-error" version = "2.0.1" @@ -2266,15 +2203,6 @@ dependencies = [ "rgb", ] -[[package]] -name = "raw-cpuid" -version = "11.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "529468c1335c1c03919960dfefdb1b3648858c20d7ec2d0663e728e4a717efbc" -dependencies = [ - "bitflags 2.8.0", -] - [[package]] name = "rayon" version = "1.10.0" @@ -2339,12 +2267,6 @@ version = "0.8.50" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - [[package]] name = "rustc_version" version = "0.4.1" @@ -2388,12 +2310,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "saturating" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71" - [[package]] name = "scopeguard" version = "1.2.0" @@ -2537,23 +2453,6 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "symbol_table" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f19bffd69fb182e684d14e3c71d04c0ef33d1641ac0b9e81c712c734e83703bc" -dependencies = [ - "crossbeam-utils", - "foldhash", - "hashbrown", -] - -[[package]] -name = "symbolic_expressions" -version = "5.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c68d531d83ec6c531150584c42a4290911964d5f0d79132b193b67252a23b71" - [[package]] name = "syn" version = "1.0.109" @@ -2938,22 +2837,6 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - [[package]] name = "winapi-util" version = "0.1.9" @@ -2963,12 +2846,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows" version = "0.59.0" diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index 37bf814d85569a5aa8dd827e2fce17894da5a397..c5ffe76034775fccab6b3353a9a6a83625a4ab4b 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -146,7 +146,7 @@ impl<'a> CPUContext<'a> { self.function.name, )?; } - write!(w, ") {{\n")?; + write!(w, ") nounwind nosync willreturn norecurse {{\n")?; let mut blocks: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) @@ -346,7 +346,7 @@ impl<'a> CPUContext<'a> { let offset = offsets[&id].0; write!( body, - " {} = getelementptr i8, ptr %backing, i64 %dc{}\n", + " {} = getelementptr inbounds i8, ptr %backing, i64 %dc{}\n", self.get_value(id, false), offset.idx() )?; @@ -473,11 +473,14 @@ impl<'a> CPUContext<'a> { let opcode = match (op, op_ty) { (BinaryOperator::Add, OpTy::Float) => "fadd", - (BinaryOperator::Add, _) => "add", + (BinaryOperator::Add, OpTy::Unsigned) => "add nuw", + (BinaryOperator::Add, OpTy::Signed) => "add nsw", (BinaryOperator::Sub, OpTy::Float) => "fsub", - (BinaryOperator::Sub, _) => "sub", + (BinaryOperator::Sub, OpTy::Unsigned) => "sub nuw", + (BinaryOperator::Sub, OpTy::Signed) => "sub nsw", (BinaryOperator::Mul, OpTy::Float) => "fmul", - (BinaryOperator::Mul, _) => "mul", + (BinaryOperator::Mul, OpTy::Unsigned) => "mul nuw", + (BinaryOperator::Mul, OpTy::Signed) => "mul nsw", (BinaryOperator::Div, OpTy::Float) => "fdiv", (BinaryOperator::Div, OpTy::Unsigned) => "udiv", (BinaryOperator::Div, OpTy::Signed) => "sdiv", @@ -1005,7 +1008,7 @@ impl<'a> CPUContext<'a> { let name = format!("%gep.{}", Self::gen_filler_id()); write!( body, - " {} = getelementptr i8, ptr {}, i64 {}\n", + " {} = getelementptr inbounds i8, ptr {}, i64 {}\n", name, ptr, size )?; Ok(name) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index dd87acbe18de297557fc8b97696928946e90c2e2..4069cb02bcf1ed0a93f5c40df432253e842ead9a 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -354,7 +354,6 @@ impl GPUContext<'_> { write!( w, " -#define _CG_ABI_EXPERIMENTAL #include <assert.h> #include <stdio.h> #include <stddef.h> @@ -362,8 +361,23 @@ impl GPUContext<'_> { #include <cuda_runtime.h> #include <math_constants.h> #include <mma.h> + +#if (CUDA_VERSION >= 12000) +#else +#define _CG_ABI_EXPERIMENTAL +#endif + #include <cooperative_groups.h> #include <cooperative_groups/reduce.h> + +#if (CUDA_VERSION >= 12000) +namespace cg = cooperative_groups; +namespace cge = cooperative_groups; +#else +namespace cg = cooperative_groups; +namespace cge = cooperative_groups::experimental; +#endif + #include <cuda_bf16.h> namespace cg = cooperative_groups; @@ -564,12 +578,12 @@ namespace cg = cooperative_groups; fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!( w, - "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n" + "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n" )?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; write!( w, - "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n" + "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n" )?; Ok(()) } @@ -1715,20 +1729,20 @@ namespace cg = cooperative_groups; }; write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml index 92e0533938eab0cc76fe4b69a205f4dceae2f244..892e4c1539f34a91087472b052f015f542a351a6 100644 --- a/hercules_opt/Cargo.toml +++ b/hercules_opt/Cargo.toml @@ -22,4 +22,4 @@ hercules_cg = { path = "../hercules_cg" } hercules_ir = { path = "../hercules_ir" } nestify = "*" bimap = "*" -egg = "*" +#egg = "*" diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs index 4a6365c8c68bd7714b529744be1cc2bd0259071a..4cee5e83a227d3c868bc7607146cdebfaeb0b4d5 100644 --- a/hercules_opt/src/gcm.rs +++ b/hercules_opt/src/gcm.rs @@ -221,6 +221,31 @@ fn preliminary_fixups( } } + // Add region nodes between join nodes and loop headers to aid in block + // placement. + for (_, join) in fork_join_map { + let control_user = editor + .get_users(*join) + .filter(|id| nodes[id.idx()].is_control()) + .next() + .unwrap(); + if nodes[control_user.idx()].is_fork() + || nodes[control_user.idx()] + .try_region() + .map(|preds| preds.len() > 1) + .unwrap_or(false) + { + let success = editor.edit(|mut edit| { + let region = edit.add_node(Node::Region { + preds: Box::new([*join]), + }); + edit.replace_all_uses_where(*join, region, |id| *id == control_user) + }); + assert!(success); + return true; + } + } + false } diff --git a/hercules_opt/src/rewrite_math_expressions.rs b/hercules_opt/src/rewrite_math_expressions.rs index 6f52dc58ee32751b96190b42a0add01d42ce3b1e..55e695ba75704395c5fde371449fad940310ad43 100644 --- a/hercules_opt/src/rewrite_math_expressions.rs +++ b/hercules_opt/src/rewrite_math_expressions.rs @@ -3,10 +3,22 @@ use std::fmt::{Error, Write}; use hercules_ir::*; -use egg::*; +//use egg::*; use crate::*; +pub fn rewrite_math_expressions( + editor: &mut FunctionEditor, + device: Device, + typing: &Vec<TypeID>, + fork_join_map: &HashMap<NodeID, NodeID>, + nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>, + reduce_einsums: &(MathEnv, HashMap<NodeID, MathID>), +) { + panic!("PANIC: The rewrite math expressions pass is currently disabled, as including egg increases compile times and we're not using it currently."); +} + +/* define_language! { enum MathLanguage { "zero" = Zero, @@ -164,3 +176,4 @@ fn egg_print_math_expr<W: Write>(id: MathID, env: &MathEnv, w: &mut W) -> Result _ => Err(Error::default()), } } +*/ diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index a5954ca03c32441709a2ce5452e9b2584f9c6155..9265808be132b0137293332fea7ba4a378bd1bd9 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -5,6 +5,7 @@ use std::future::Future; use std::marker::PhantomData; use std::ptr::{copy_nonoverlapping, write_bytes, NonNull}; use std::slice::{from_raw_parts, from_raw_parts_mut}; +use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::OnceLock; /* @@ -928,3 +929,30 @@ unsafe impl GlobalAlloc for AlignedAlloc { #[global_allocator] static A: AlignedAlloc = AlignedAlloc; + +pub struct SpinBarrier { + num: usize, + waiting: AtomicUsize, + gen: AtomicUsize, +} + +impl SpinBarrier { + pub const fn new(num: usize) -> Self { + SpinBarrier { + num, + waiting: AtomicUsize::new(0), + gen: AtomicUsize::new(0), + } + } + + pub fn wait(&self) { + let old_gen = self.gen.load(Ordering::Acquire); + let old_waiting = self.waiting.fetch_add(1, Ordering::Relaxed); + if old_waiting + 1 == self.num { + self.waiting.store(0, Ordering::Relaxed); + self.gen.fetch_add(1, Ordering::Release); + } else { + while old_gen == self.gen.load(Ordering::Acquire) {} + } + } +} diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch index 306997f58eb217f9ce301dc18e418c412e6df621..6867576e4a3a81f327d23eca6dccbcd6a6183eb4 100644 --- a/juno_samples/matmul/src/matmul.sch +++ b/juno_samples/matmul/src/matmul.sch @@ -51,7 +51,9 @@ if feature("cuda") { fork-coalesce(*); infer-schedules(*); dce(*); - rewrite(*); + //rewrite(*); + let out = outline(matmul@outer); + gpu(out); fixpoint { simplify-cfg(*); dce(*); diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index d6ec25f26b997205d4dcdf116bce462fa228fc88..75190300e4d8be463176c4c346d89715cece0ced 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -24,7 +24,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] @cost_init for i in 0..n { cost[i] = -1; } - cost[source as u64] = 0; + @cost_init cost[source as u64] = 0; // Nodes that were updated in the current iteration let updated: bool[n]; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 550dce319d3c5fe2c14168af825ad6417d6f8a6f..63b2e4fed6db89a091488117cdb6c17a14582c30 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -47,16 +47,15 @@ if !feature("seq") { let (outer, inner) = fork-reshape[[1], [0]](collect); collect = outline(inner); - // Following code seems to generate breaking RT code - //fork-tile[32, 0, false, true](init); - //let (outer, inner) = fork-reshape[[1], [0]](init); - //init = outline(inner); - //inline(bfs@cost_init); + fork-tile[32, 0, false, true](init); + let (outer, inner) = fork-reshape[[1], [0]](init); + let init_body = outline(inner); - inline(bfs@loop1, bfs@loop2); + inline(bfs@cost_init, bfs@loop1, bfs@loop2); } delete-uncalled(*); const-inline(*); +simpl!(*); unforkify(init, traverse, collect); simpl!(*); diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 4e5c1f74e34cedacd48ddf6c33b631a08079b995..0253a0210f6cc2451d38399601ad39ff3ab9465a 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -38,8 +38,12 @@ fixpoint { } simpl!(collect); +fork-tile[32, 0, false, true](init); +let (outer, inner) = fork-reshape[[1], [0]](init); +let init_body = outline(inner); + fork-tile[1024, 0, false, true](traverse, collect); fork-split(traverse, collect); -unforkify(init); +unforkify(init_body); gcm(*);