diff --git a/Cargo.lock b/Cargo.lock
index 80ef4516f66677992a07755b3bdb788245e63c3a..3c89534c0cdc3fabb3b57598c29d9d0e94f5c3fd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -367,6 +367,7 @@ name = "call"
 version = "0.1.0"
 dependencies = [
  "async-std",
+ "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -388,6 +389,7 @@ name = "ccp"
 version = "0.1.0"
 dependencies = [
  "async-std",
+ "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -639,6 +641,7 @@ version = "0.1.0"
 dependencies = [
  "async-std",
  "clap",
+ "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -1013,6 +1016,7 @@ name = "juno_casts_and_intrinsics"
 version = "0.1.0"
 dependencies = [
  "async-std",
+ "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 85139b4c573c2287169715599336bd5b2146e8be..06dfd18d173eb85a355f806f5c2111bba2ca5268 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -665,7 +665,7 @@ impl<'a> CPUContext<'a> {
                         get_type_alignment(&self.types, fields[*idx]),
                         body,
                     )?;
-                    acc_ptr = Self::gep(collect_name, &acc_offset, body)?;
+                    acc_ptr = Self::gep(&acc_ptr, &acc_offset, body)?;
                 }
                 Index::Variant(_) => {
                     // The tag of a summation is at the end of the summation, so
@@ -690,7 +690,7 @@ impl<'a> CPUContext<'a> {
 
                     // Convert offset in # elements -> # bytes.
                     acc_offset = Self::multiply(&acc_offset, &elem_size, body)?;
-                    acc_ptr = Self::gep(collect_name, &acc_offset, body)?;
+                    acc_ptr = Self::gep(&acc_ptr, &acc_offset, body)?;
                 }
             }
         }
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 6e56ebc48d381108f4483411b4e497bed6aec8ed..0449c92b41ef8bc17eff1a1134b24b49c70655a2 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -53,7 +53,7 @@ impl<'a> RTContext<'a> {
         // Dump the function signature.
         write!(
             w,
-            "#[allow(unused_variables,unused_mut,unused_parens)]\nasync fn {}<'a>(",
+            "#[allow(unused_variables,unused_mut,unused_parens,unused_unsafe)]\nasync fn {}<'a>(",
            func.name
        )?;
         let mut first_param = true;
@@ -149,7 +149,7 @@ impl<'a> RTContext<'a> {
         // blocks to drive execution.
         write!(
             w,
-            "    let mut control_token: i8 = 0;\n    loop {{\n        match control_token {{\n",
+            "    let mut control_token: i8 = 0;\n    let return_value = loop {{\n        match control_token {{\n",
         )?;
 
         let mut blocks: BTreeMap<_, _> = (0..func.nodes.len())
@@ -182,8 +182,41 @@ impl<'a> RTContext<'a> {
             )?;
         }
 
-        // Close the match, loop, and function.
-        write!(w, "            _ => panic!()\n        }}\n    }}\n}}\n")?;
+        // Close the match and loop.
+        write!(w, "            _ => panic!()\n        }}\n    }};\n")?;
+
+        // Emit the epilogue of the function.
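+        // The epilogue forgets every collection object the function no
+        // longer owns, so their Drop impls don't free memory that the
+        // returned clone still points to. For a function with one collection
+        // parameter and a collection result in node 4, the emitted tail looks
+        // roughly like this (node numbers are illustrative):
+        //
+        //     unsafe {
+        //         p0.__forget();
+        //         node_4.__forget();
+        //     }
+        //     return_value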
+ write!(w, " unsafe {{\n")?; + for idx in 0..func.param_types.len() { + if !self.module.types[func.param_types[idx].idx()].is_primitive() { + write!(w, " p{}.__forget();\n", idx)?; + } + } + if !self.module.types[func.return_type.idx()].is_primitive() { + for object in self.collection_objects[&self.func_id].iter_objects() { + if let CollectionObjectOrigin::Constant(_) = + self.collection_objects[&self.func_id].origin(object) + { + write!( + w, + " if obj{}.__cmp_ids(&return_value) {{\n", + object.idx() + )?; + write!(w, " obj{}.__forget();\n", object.idx())?; + write!(w, " }}\n")?; + } + } + } + for idx in 0..func.nodes.len() { + if !func.nodes[idx].is_control() + && !self.module.types[self.typing[idx].idx()].is_primitive() + { + write!(w, " node_{}.__forget();\n", idx)?; + } + } + write!(w, " }}\n")?; + write!(w, " return_value\n")?; + write!(w, "}}\n")?; Ok(()) } @@ -230,7 +263,15 @@ impl<'a> RTContext<'a> { } Node::Return { control: _, data } => { let block = &mut blocks.get_mut(&id).unwrap(); - write!(block, " return {};\n", self.get_value(data))? + if self.module.types[self.typing[data.idx()].idx()].is_primitive() { + write!(block, " break {};\n", self.get_value(data))? + } else { + write!( + block, + " break unsafe {{ {}.__clone() }};\n", + self.get_value(data) + )? + } } _ => panic!("PANIC: Can't lower {:?}.", func.nodes[id.idx()]), } @@ -259,7 +300,7 @@ impl<'a> RTContext<'a> { } else { write!( block, - " {} = unsafe {{ p{}.__take() }};\n", + " {} = unsafe {{ p{}.__clone() }};\n", self.get_value(id), index )? @@ -284,7 +325,7 @@ impl<'a> RTContext<'a> { let objects = self.collection_objects[&self.func_id].objects(id); assert_eq!(objects.len(), 1); let object = objects[0]; - write!(block, "unsafe {{ obj{}.__take() }}", object.idx())? + write!(block, "unsafe {{ obj{}.__clone() }}", object.idx())? } } write!(block, ";\n")? 
@@ -374,7 +415,7 @@ impl<'a> RTContext<'a> {
                 )?;
                 write!(
                     block,
-                    "                {} = unsafe {{ {}.__take() }};\n",
+                    "                {} = unsafe {{ {}.__clone() }};\n",
                     self.get_value(id),
                     self.get_value(*arg)
                 )?;
@@ -407,13 +448,94 @@ impl<'a> RTContext<'a> {
                             if self.module.types[self.typing[arg.idx()].idx()].is_primitive() {
                                 write!(block, "{}, ", self.get_value(*arg))?;
                             } else {
-                                write!(block, "unsafe {{ {}.__take() }}, ", self.get_value(*arg))?;
+                                write!(block, "unsafe {{ {}.__clone() }}, ", self.get_value(*arg))?;
                             }
                         }
                         write!(block, ").await;\n")?;
                    }
                }
            }
+            Node::Read {
+                collect,
+                ref indices,
+            } => {
+                let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
+                let collect_ty = self.typing[collect.idx()];
+                let out_size = self.codegen_type_size(self.typing[id.idx()]);
+                let offset = self.codegen_index_math(collect_ty, indices)?;
+                write!(
+                    block,
+                    "                let mut read_offset_obj = unsafe {{ {}.__clone() }};\n",
+                    self.get_value(collect)
+                )?;
+                write!(
+                    block,
+                    "                unsafe {{ read_offset_obj.__offset({}, {}) }};\n",
+                    offset, out_size,
+                )?;
+                if self.module.types[self.typing[id.idx()].idx()].is_primitive() {
+                    write!(
+                        block,
+                        "                {} = unsafe {{ *(read_offset_obj.__cpu_ptr() as *const _) }};\n",
+                        self.get_value(id)
+                    )?;
+                    write!(
+                        block,
+                        "                unsafe {{ read_offset_obj.__forget() }};\n",
+                    )?;
+                } else {
+                    write!(
+                        block,
+                        "                {} = read_offset_obj;\n",
+                        self.get_value(id)
+                    )?;
+                }
+            }
+            Node::Write {
+                collect,
+                data,
+                ref indices,
+            } => {
+                let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
+                let collect_ty = self.typing[collect.idx()];
+                let data_size = self.codegen_type_size(self.typing[data.idx()]);
+                let offset = self.codegen_index_math(collect_ty, indices)?;
+                write!(
+                    block,
+                    "                let mut write_offset_obj = unsafe {{ {}.__clone() }};\n",
+                    self.get_value(collect)
+                )?;
+                write!(
+                    block,
+                    "                unsafe {{ write_offset_obj.__offset({}, {}) }};\n",
+                    offset, data_size,
+                )?;
+                write!(block, "                let write_offset_ptr = unsafe {{ write_offset_obj.__cpu_ptr_mut() }};\n")?;
+                if self.module.types[self.typing[data.idx()].idx()].is_primitive() {
+                    write!(
+                        block,
+                        "                unsafe {{ *(write_offset_ptr as *mut _) = {} }};\n",
+                        self.get_value(data)
+                    )?;
+                } else {
+                    write!(
+                        block,
+                        "                unsafe {{ ::core::ptr::copy_nonoverlapping({}.__cpu_ptr(), write_offset_ptr as *mut _, {} as usize) }};\n",
+                        self.get_value(data),
+                        data_size,
+                    )?;
+                }
+                write!(
+                    block,
+                    "                unsafe {{ write_offset_obj.__forget() }};\n",
+                )?;
+                write!(
+                    block,
+                    "                {} = unsafe {{ {}.__clone() }};\n",
+                    self.get_value(id),
+                    self.get_value(collect)
+                )?;
+            }
             _ => panic!(
                 "PANIC: Can't lower {:?} in {}.",
                 func.nodes[id.idx()],
@@ -487,6 +609,68 @@ impl<'a> RTContext<'a> {
         Ok(())
     }
 
+    /*
+     * Emit logic to index into a collection.
+     */
+    fn codegen_index_math(&self, collect_ty: TypeID, indices: &[Index]) -> Result<String, Error> {
+        let mut acc_offset = "0".to_string();
+        for index in indices {
+            match index {
+                Index::Field(idx) => {
+                    let Type::Product(ref fields) = self.module.types[collect_ty.idx()] else {
+                        panic!()
+                    };
+
+                    // Get the offset of the field at index `idx` by calculating
+                    // the product's size up to field `idx`, then offsetting the
+                    // base pointer by that amount.
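+                    // For example, for a product of (u32, u8, u64) and an
+                    // `idx` of 2, the emitted expression evaluates as
+                    // ((0 + 3) & !3) + 4 = 4 past the u32, then
+                    // ((4 + 0) & !0) + 1 = 5 past the u8, then aligns up to
+                    // (5 + 7) & !7 = 8, the byte offset of the u64 field.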
+                    for field in &fields[..*idx] {
+                        let field_align = get_type_alignment(&self.module.types, *field);
+                        let field = self.codegen_type_size(*field);
+                        acc_offset = format!(
+                            "((({} + {}) & !{}) + {})",
+                            acc_offset,
+                            field_align - 1,
+                            field_align - 1,
+                            field
+                        );
+                    }
+                    let last_align = get_type_alignment(&self.module.types, fields[*idx]);
+                    acc_offset = format!(
+                        "(({} + {}) & !{})",
+                        acc_offset,
+                        last_align - 1,
+                        last_align - 1
+                    );
+                }
+                Index::Variant(_) => {
+                    // The tag of a summation is at the end of the summation, so
+                    // the variant pointer is just the base pointer. Do nothing.
+                }
+                Index::Position(ref pos) => {
+                    let Type::Array(elem, ref dims) = self.module.types[collect_ty.idx()] else {
+                        panic!()
+                    };
+
+                    // The offset of the position into an array is:
+                    //
+                    //     ((0 * s1 + p1) * s2 + p2) * s3 + p3 ...
+                    let elem_size = self.codegen_type_size(elem);
+                    for (p, s) in zip(pos, dims) {
+                        let p = self.get_value(*p);
+                        acc_offset = format!("{} * ", acc_offset);
+                        self.codegen_dynamic_constant(*s, &mut acc_offset)?;
+                        acc_offset = format!("({} + {})", acc_offset, p);
+                    }
+
+                    // Convert offset in # elements -> # bytes.
+                    acc_offset = format!("({} * {})", acc_offset, elem_size);
+                }
+            }
+        }
+        Ok(acc_offset)
+    }
+
     /*
      * Lower the size of a type into a Rust expression.
      */
diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs
index 15b9f6396d2c90eee66dfecb5a255cef2890726b..d9c689b42b161dcb5db29f8fca91cad4fbea11bb 100644
--- a/hercules_rt/build.rs
+++ b/hercules_rt/build.rs
@@ -18,6 +18,7 @@ fn main() {
 
     println!("cargo::rustc-link-search=native={}", out_dir);
     println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
+    println!("cargo::rustc-link-search=native=/opt/cuda/lib/");
     println!("cargo::rustc-link-lib=static=rtdefs");
     println!("cargo::rustc-link-lib=cudart");
     println!("cargo::rerun-if-changed=src/rtdefs.cu");
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 2a96970e88e2b432c8c9c2ed896dc1c623781cbd..eeb09eb6859fef91c10f2026bbb8553274195cae 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -1,8 +1,8 @@
 use std::alloc::{alloc, alloc_zeroed, dealloc, Layout};
 use std::marker::PhantomData;
-use std::mem::swap;
 use std::ptr::{copy_nonoverlapping, NonNull};
 use std::slice::from_raw_parts;
+use std::sync::atomic::{AtomicUsize, Ordering};
 
 #[cfg(feature = "cuda")]
 extern "C" {
@@ -14,48 +14,115 @@ extern "C" {
     fn copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
 }
 
+/*
+ * Each object needs to get assigned a unique ID.
+ */
+static NUM_OBJECTS: AtomicUsize = AtomicUsize::new(1);
+
 /*
  * An in-memory collection object that can be used by functions compiled by the
- * Hercules compiler.
+ * Hercules compiler. Memory objects can be in these states:
+ *
+ * 1. Shared CPU - the object has a shared reference to some CPU memory,
+ *    usually from the programmer using the Hercules RT API.
+ * 2. Exclusive CPU - the object has an exclusive reference to some CPU memory,
+ *    usually from the programmer using the Hercules RT API.
+ * 3. Owned CPU - the object owns some allocated CPU memory.
+ * 4. Owned GPU - the object owns some allocated GPU memory.
+ *
+ * A single object can be in some combination of these states at the same time.
+ * Only some combinations are valid, because only some combinations are
+ * reachable. Under this assumption, we can model an object's placement as a
+ * state machine, where states are combinations of the aforementioned states,
+ * and actions are requests on the CPU or GPU, immutably or mutably.
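+ * As an example of reading the table just below: a box created from a shared
+ * slice starts in CS; a mutable CPU request copies the data into owned CPU
+ * memory, moving it to CO; a later immutable GPU request adds an owned GPU
+ * copy, landing in CO,GO.
+ *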
+ * Here's the state transition table:
+ *
+ *   Shared CPU    = CS
+ *   Exclusive CPU = CE
+ *   Owned CPU     = CO
+ *   Owned GPU     = GO
+ *
+ *          CPU      Mut CPU  GPU      Mut GPU
+ *        *---------------------------------------
+ * CS     | CS       CO       CS,GO    GO
+ * CE     | CE       CE       CE,GO    GO
+ * CO     | CO       CO       CO,GO    GO
+ * GO     | CO       CO       GO       GO
+ * CS,GO  | CS,GO    CO       CS,GO    GO
+ * CE,GO  | CE,GO    CE       CE,GO    GO
+ * CO,GO  | CO,GO    CO       CO,GO    GO
+ *        |
+ *
+ * A HerculesBox cannot be cloned, because it may be a mutable reference to
+ * some CPU memory.
 */
+#[derive(Debug)]
 pub struct HerculesBox<'a> {
-    cpu_shared: Option<NonNull<u8>>,
-    cpu_exclusive: Option<NonNull<u8>>,
-    cpu_owned: Option<NonNull<u8>>,
+    cpu_shared: Option<NonOwned<'a>>,
+    cpu_exclusive: Option<NonOwned<'a>>,
+    cpu_owned: Option<Owned>,
 
     #[cfg(feature = "cuda")]
-    cuda_owned: Option<NonNull<u8>>,
+    cuda_owned: Option<Owned>,
 
     size: usize,
+    id: usize,
+}
+
+#[derive(Clone, Debug)]
+struct NonOwned<'a> {
+    ptr: NonNull<u8>,
+    offset: usize,
     _phantom: PhantomData<&'a u8>,
 }
 
+#[derive(Clone, Debug)]
+struct Owned {
+    ptr: NonNull<u8>,
+    alloc_size: usize,
+    offset: usize,
+}
+
 impl<'b, 'a: 'b> HerculesBox<'a> {
     pub fn from_slice<T>(slice: &'a [T]) -> Self {
+        let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
+        let size = slice.len() * size_of::<T>();
+        let id = NUM_OBJECTS.fetch_add(1, Ordering::Relaxed);
         HerculesBox {
-            cpu_shared: Some(unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) }),
+            cpu_shared: Some(NonOwned {
+                ptr,
+                offset: 0,
+                _phantom: PhantomData,
+            }),
             cpu_exclusive: None,
             cpu_owned: None,
 
             #[cfg(feature = "cuda")]
             cuda_owned: None,
 
-            size: slice.len() * size_of::<T>(),
-            _phantom: PhantomData,
+            size,
+            id,
         }
     }
 
     pub fn from_slice_mut<T>(slice: &'a mut [T]) -> Self {
+        let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
+        let size = slice.len() * size_of::<T>();
+        let id = NUM_OBJECTS.fetch_add(1, Ordering::Relaxed);
         HerculesBox {
             cpu_shared: None,
-            cpu_exclusive: Some(unsafe { NonNull::new_unchecked(slice.as_mut_ptr() as *mut u8) }),
+            cpu_exclusive: Some(NonOwned {
+                ptr,
+                offset: 0,
+                _phantom: PhantomData,
+            }),
             cpu_owned: None,
 
             #[cfg(feature = "cuda")]
             cuda_owned: None,
 
-            size: slice.len() * size_of::<T>(),
-            _phantom: PhantomData,
+            size,
+            id,
         }
     }
 
@@ -65,69 +132,91 @@ impl<'b, 'a: 'b> HerculesBox<'a> {
     }
 
     unsafe fn get_cpu_ptr(&self) -> Option<NonNull<u8>> {
-        self.cpu_owned.or(self.cpu_exclusive).or(self.cpu_shared)
+        self.cpu_owned
+            .as_ref()
+            .map(|obj| obj.ptr.byte_add(obj.offset))
+            .or(self
+                .cpu_exclusive
+                .as_ref()
+                .map(|obj| obj.ptr.byte_add(obj.offset)))
+            .or(self
+                .cpu_shared
+                .as_ref()
+                .map(|obj| obj.ptr.byte_add(obj.offset)))
     }
 
     #[cfg(feature = "cuda")]
     unsafe fn get_cuda_ptr(&self) -> Option<NonNull<u8>> {
         self.cuda_owned
+            .as_ref()
+            .map(|obj| obj.ptr.byte_add(obj.offset))
     }
 
     unsafe fn allocate_cpu(&mut self) -> NonNull<u8> {
-        if let Some(ptr) = self.cpu_owned {
-            ptr
+        if let Some(obj) = self.cpu_owned.as_ref() {
+            obj.ptr
         } else {
             let ptr =
                 NonNull::new(alloc(Layout::from_size_align_unchecked(self.size, 16))).unwrap();
-            self.cpu_owned = Some(ptr);
+            self.cpu_owned = Some(Owned {
+                ptr,
+                alloc_size: self.size,
+                offset: 0,
+            });
             ptr
         }
     }
 
     #[cfg(feature = "cuda")]
     unsafe fn allocate_cuda(&mut self) -> NonNull<u8> {
-        if let Some(ptr) = self.cuda_owned {
-            ptr
+        if let Some(obj) = self.cuda_owned.as_ref() {
+            obj.ptr
         } else {
-            let ptr = cuda_alloc(self.size);
-            self.cuda_owned = Some(NonNull::new(ptr).unwrap());
-            self.cuda_owned.unwrap()
+            let ptr =
+                NonNull::new(cuda_alloc(self.size)).unwrap();
+            self.cuda_owned = Some(Owned {
+                ptr,
+                alloc_size: self.size,
+                offset: 0,
+            });
+            ptr
         }
     }
 
     unsafe fn deallocate_cpu(&mut self) {
-        if let Some(ptr) = self.cpu_owned {
+        if let Some(obj) = self.cpu_owned.take() {
             dealloc(
-                ptr.as_ptr(),
-                Layout::from_size_align_unchecked(self.size, 16),
+                obj.ptr.as_ptr(),
+                Layout::from_size_align_unchecked(obj.alloc_size, 16),
             );
-            self.cpu_owned = None;
         }
     }
 
     #[cfg(feature = "cuda")]
     unsafe fn deallocate_cuda(&mut self) {
-        if let Some(ptr) = self.cuda_owned {
-            cuda_dealloc(ptr.as_ptr());
-            self.cuda_owned = None;
+        if let Some(obj) = self.cuda_owned.take() {
+            cuda_dealloc(obj.ptr.as_ptr());
         }
     }
 
     pub unsafe fn __zeros(size: u64) -> Self {
         assert_ne!(size, 0);
         let size = size as usize;
+        let id = NUM_OBJECTS.fetch_add(1, Ordering::Relaxed);
         HerculesBox {
             cpu_shared: None,
             cpu_exclusive: None,
-            cpu_owned: Some(
-                NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16))).unwrap(),
-            ),
+            cpu_owned: Some(Owned {
+                ptr: NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16)))
+                    .unwrap(),
+                alloc_size: size,
+                offset: 0,
+            }),
 
             #[cfg(feature = "cuda")]
             cuda_owned: None,
 
-            size: size,
-            _phantom: PhantomData,
+            size,
+            id,
         }
     }
 
@@ -141,16 +230,10 @@ impl<'b, 'a: 'b> HerculesBox<'a> {
             cuda_owned: None,
 
             size: 0,
-            _phantom: PhantomData,
+            id: 0,
         }
     }
 
-    pub unsafe fn __take(&mut self) -> Self {
-        let mut ret = Self::__null();
-        swap(&mut ret, self);
-        ret
-    }
-
     pub unsafe fn __cpu_ptr(&mut self) -> *mut u8 {
         if let Some(ptr) = self.get_cpu_ptr() {
             return ptr.as_ptr();
@@ -167,12 +250,15 @@ impl<'b, 'a: 'b> HerculesBox<'a> {
 
     pub unsafe fn __cpu_ptr_mut(&mut self) -> *mut u8 {
         let cpu_ptr = self.__cpu_ptr();
-        if Some(cpu_ptr) == self.cpu_shared.map(|nn| nn.as_ptr()) {
+        if Some(cpu_ptr) == self.cpu_shared.as_ref().map(|obj| obj.ptr.as_ptr()) {
             self.allocate_cpu();
-            copy_nonoverlapping(cpu_ptr, self.cpu_owned.unwrap().as_ptr(), self.size);
+            copy_nonoverlapping(
+                cpu_ptr,
+                self.cpu_owned.as_ref().unwrap().ptr.as_ptr(),
+                self.size,
+            );
         }
         self.cpu_shared = None;
-        self.cpu_exclusive = None;
         #[cfg(feature = "cuda")]
         self.deallocate_cuda();
         cpu_ptr
@@ -198,6 +284,47 @@ impl<'b, 'a: 'b> HerculesBox<'a> {
         self.deallocate_cpu();
         cuda_ptr
     }
+
+    pub unsafe fn __clone(&self) -> Self {
+        Self {
+            cpu_shared: self.cpu_shared.clone(),
+            cpu_exclusive: self.cpu_exclusive.clone(),
+            cpu_owned: self.cpu_owned.clone(),
+            #[cfg(feature = "cuda")]
+            cuda_owned: self.cuda_owned.clone(),
+            size: self.size,
+            id: self.id,
+        }
+    }
+
+    pub unsafe fn __forget(&mut self) {
+        self.cpu_owned = None;
+        #[cfg(feature = "cuda")]
+        {
+            self.cuda_owned = None;
+        }
+    }
+
+    pub unsafe fn __offset(&mut self, offset: u64, size: u64) {
+        if let Some(obj) = self.cpu_shared.as_mut() {
+            obj.offset += offset as usize;
+        }
+        if let Some(obj) = self.cpu_exclusive.as_mut() {
+            obj.offset += offset as usize;
+        }
+        if let Some(obj) = self.cpu_owned.as_mut() {
+            obj.offset += offset as usize;
+        }
+        #[cfg(feature = "cuda")]
+        if let Some(obj) = self.cuda_owned.as_mut() {
+            obj.offset += offset as usize;
+        }
+        self.size = size as usize;
+    }
+
+    pub unsafe fn __cmp_ids(&self, other: &HerculesBox<'_>) -> bool {
+        self.id == other.id
+    }
 }
 
 impl<'a> Drop for HerculesBox<'a> {
diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml
index 4a2fbb862039ad0d268b85fc6e85463dc87841d7..c8b570affa63657422f3857f96f6bbc06a52ef1e 100644
--- a/hercules_samples/call/Cargo.toml
+++ b/hercules_samples/call/Cargo.toml
@@ -9,6 +9,7 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml
index 3547aa52df766cb00445e088176c076fb5996b80..97d4b2ef8efc96af059284df345282213dddab68 100644
--- a/hercules_samples/ccp/Cargo.toml
+++ b/hercules_samples/ccp/Cargo.toml
@@ -9,6 +9,7 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/fac/Cargo.toml b/hercules_samples/fac/Cargo.toml
index d4b9c5fe2ae7ac6907d3c1181dabb3cb247cff2e..9082a4fc4194ac3fa9c694a5a5973f605772a384 100644
--- a/hercules_samples/fac/Cargo.toml
+++ b/hercules_samples/fac/Cargo.toml
@@ -10,6 +10,7 @@ juno_build = { path = "../../juno_build" }
 [dependencies]
 clap = { version = "*", features = ["derive"] }
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml
index af74c07acc3950b22b9b2c95e0d07090f99d7490..f49797969012f5195a0338b7b14fa04414dddb03 100644
--- a/juno_samples/casts_and_intrinsics/Cargo.toml
+++ b/juno_samples/casts_and_intrinsics/Cargo.toml
@@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
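
A minimal host-side sketch of driving the revised HerculesBox API from this
patch. The generated entry point `simple_add` and the f32 element type are
hypothetical; only HerculesBox methods shown in the diff above are used.

    fn main() {
        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
        // Wrapping a shared slice puts the box in the Shared CPU (CS) state.
        let a_box = hercules_rt::HerculesBox::from_slice(&a);
        async_std::task::block_on(async {
            // Assumed signature of the generated entry point (hypothetical):
            //   async fn simple_add(a: HerculesBox<'_>) -> HerculesBox<'_>;
            let mut out_box = simple_add(a_box).await;
            // Requesting a CPU pointer materializes the result on the CPU.
            let ptr = unsafe { out_box.__cpu_ptr() } as *const f32;
            let out = unsafe { std::slice::from_raw_parts(ptr, a.len()) };
            println!("{:?}", out);
        });
    }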