Skip to content
Snippets Groups Projects
Commit a84d8c89 authored by Russel Arbore's avatar Russel Arbore
Browse files

Use AlignedAlloc

parent 04e41c17
No related branches found
No related tags found
1 merge request!190Set up cava benchmark
Pipeline #201767 failed
......@@ -670,7 +670,6 @@ dependencies = [
name = "dot"
version = "0.1.0"
dependencies = [
"aligned-vec",
"async-std",
"clap",
"hercules_rt",
......
#![feature(once_cell_try, pointer_is_aligned_to)]
use std::alloc::{alloc, dealloc, Layout};
use std::alloc::{alloc, dealloc, GlobalAlloc, Layout, System};
use std::marker::PhantomData;
use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
use std::slice::{from_raw_parts, from_raw_parts_mut};
......@@ -189,7 +189,7 @@ pub struct CUDABox {
impl<'a> HerculesCPURef<'a> {
pub fn from_slice<T>(slice: &'a [T]) -> Self {
assert!(slice.as_ptr().is_aligned_to(32));
assert!(slice.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
let size = slice.len() * size_of::<T>();
Self {
......@@ -214,7 +214,7 @@ impl<'a> HerculesCPURef<'a> {
}
pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
assert!(ptr.is_aligned_to(32));
assert!(ptr.is_aligned_to(LARGEST_ALIGNMENT));
Self {
ptr: NonNull::new(ptr).unwrap(),
size,
......@@ -225,7 +225,7 @@ impl<'a> HerculesCPURef<'a> {
impl<'a> HerculesCPURefMut<'a> {
pub fn from_slice<T>(slice: &'a mut [T]) -> Self {
assert!(slice.as_ptr().is_aligned_to(32));
assert!(slice.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
let size = slice.len() * size_of::<T>();
Self {
......@@ -259,7 +259,7 @@ impl<'a> HerculesCPURefMut<'a> {
}
pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
assert!(ptr.is_aligned_to(32));
assert!(ptr.is_aligned_to(LARGEST_ALIGNMENT));
Self {
ptr: NonNull::new(ptr).unwrap(),
size,
......@@ -271,7 +271,7 @@ impl<'a> HerculesCPURefMut<'a> {
#[cfg(feature = "cuda")]
impl<'a> HerculesCUDARef<'a> {
pub fn to_cpu_ref<'b, T>(self, dst: &'b mut [T]) -> HerculesCPURefMut<'b> {
assert!(dst.as_ptr().is_aligned_to(32));
assert!(dst.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
unsafe {
let size = self.size;
assert_eq!(size, dst.len() * size_of::<T>());
......@@ -313,7 +313,7 @@ impl<'a> HerculesCUDARefMut<'a> {
}
pub fn to_cpu_ref<'b, T>(self, dst: &mut [T]) -> HerculesCPURefMut<'b> {
assert!(dst.as_ptr().is_aligned_to(32));
assert!(dst.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
unsafe {
let size = self.size;
let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap();
......@@ -872,3 +872,24 @@ impl<'a, T> HerculesRefInto<'a> for Box<[T]> {
HerculesCPURef::from_slice(self)
}
}
/*
 * We need all allocations to be aligned to LARGEST_ALIGNMENT bytes for
 * vectorization. This is the easiest way to do that: every request made
 * through the global allocator has its alignment raised to at least
 * LARGEST_ALIGNMENT before being forwarded to the system allocator.
 */
pub struct AlignedAlloc;

// SAFETY: Both methods forward to `System` using the same adjusted layout, so
// a pointer handed out by `alloc` is always released with the layout it was
// actually allocated with. Neither method can panic: unwinding out of a global
// allocator is undefined behavior, so failures are reported as a null pointer.
unsafe impl GlobalAlloc for AlignedAlloc {
    /// Allocates memory for `layout` with its alignment raised to at least
    /// `LARGEST_ALIGNMENT`.
    ///
    /// Returns a null pointer if the adjusted layout is not representable
    /// (its padded size would overflow) or if the system allocator fails.
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        match layout.align_to(LARGEST_ALIGNMENT) {
            // SAFETY: the caller upholds the `GlobalAlloc::alloc` contract for
            // `layout`; raising the alignment does not change the size.
            Ok(aligned) => unsafe { System.alloc(aligned) },
            // Signal allocation failure instead of panicking inside the
            // allocator.
            Err(_) => std::ptr::null_mut(),
        }
    }

    /// Releases memory previously returned by `alloc` for the same `layout`.
    ///
    /// The alignment adjustment made in `alloc` is recomputed here so that
    /// the system allocator sees the layout it originally allocated with.
    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        // `ptr` came from a successful `alloc` with this same `layout`, so the
        // adjustment succeeded then and must succeed now. The `Err` arm is
        // therefore unreachable; it is ignored rather than unwrapped so that
        // the allocator has no panic path.
        if let Ok(aligned) = layout.align_to(LARGEST_ALIGNMENT) {
            // SAFETY: `ptr` was allocated by `System.alloc(aligned)` in
            // `alloc` above, per the caller's `GlobalAlloc::dealloc` contract.
            unsafe { System.dealloc(ptr, aligned) }
        }
    }
}

#[global_allocator]
static A: AlignedAlloc = AlignedAlloc;
......@@ -17,4 +17,3 @@ hercules_rt = { path = "../../hercules_rt" }
rand = "*"
async-std = "*"
with_builtin_macros = "0.1.0"
aligned-vec = "*"
......@@ -4,14 +4,12 @@
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
use aligned_vec::ABox;
juno_build::juno!("dot");
fn main() {
async_std::task::block_on(async {
let a: ABox<[f32; 8]> = ABox::new(32, [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]);
let b: ABox<[f32; 8]> = ABox::new(32, [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]);
let a: Box<[f32; 8]> = Box::new([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]);
let b: Box<[f32; 8]> = Box::new([0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]);
let a = HerculesImmBox::from(a.as_ref() as &[f32]);
let b = HerculesImmBox::from(b.as_ref() as &[f32]);
let mut r = runner!(dot);
......
......@@ -22,8 +22,8 @@ fn main() {
}
}
}
let a = HerculesImmBox::from(&a as &[f32]);
let b = HerculesImmBox::from(&b as &[f32]);
let a = HerculesImmBox::from(a.as_ref());
let b = HerculesImmBox::from(b.as_ref());
let mut r = runner!(matmul);
let mut c = HerculesMutBox::from(r.run(I as u64, J as u64, K as u64, a.to(), b.to()).await);
for (calc, correct) in zip(c.as_slice().into_iter().map(|x: &mut f32| *x), correct_c) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment