diff --git a/Cargo.lock b/Cargo.lock
index 0cad8e19b64d4f6279c1d25594e5c61ef1ceb1a6..1973fbbeb88622cb01fe3ddfc191aae796a00f54 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -670,7 +670,6 @@ dependencies = [
 name = "dot"
 version = "0.1.0"
 dependencies = [
- "aligned-vec",
  "async-std",
  "clap",
  "hercules_rt",
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 3b79dc4879153b69706b795dc21ad91d47bb0488..a245a2647985ec7ab393021b75233067895ffdcd 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -1,6 +1,6 @@
 #![feature(once_cell_try, pointer_is_aligned_to)]
 
-use std::alloc::{alloc, dealloc, Layout};
+use std::alloc::{alloc, dealloc, GlobalAlloc, Layout, System};
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
@@ -189,7 +189,7 @@ pub struct CUDABox {
 
 impl<'a> HerculesCPURef<'a> {
     pub fn from_slice<T>(slice: &'a [T]) -> Self {
-        assert!(slice.as_ptr().is_aligned_to(32));
+        assert!(slice.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
         let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
         let size = slice.len() * size_of::<T>();
         Self {
@@ -214,7 +214,7 @@ impl<'a> HerculesCPURef<'a> {
     }
 
     pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
-        assert!(ptr.is_aligned_to(32));
+        assert!(ptr.is_aligned_to(LARGEST_ALIGNMENT));
         Self {
             ptr: NonNull::new(ptr).unwrap(),
             size,
@@ -225,7 +225,7 @@ impl<'a> HerculesCPURef<'a> {
 
 impl<'a> HerculesCPURefMut<'a> {
     pub fn from_slice<T>(slice: &'a mut [T]) -> Self {
-        assert!(slice.as_ptr().is_aligned_to(32));
+        assert!(slice.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
         let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
         let size = slice.len() * size_of::<T>();
         Self {
@@ -259,7 +259,7 @@ impl<'a> HerculesCPURefMut<'a> {
     }
 
     pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
-        assert!(ptr.is_aligned_to(32));
+        assert!(ptr.is_aligned_to(LARGEST_ALIGNMENT));
         Self {
             ptr: NonNull::new(ptr).unwrap(),
             size,
@@ -271,7 +271,7 @@ impl<'a> HerculesCPURefMut<'a> {
 #[cfg(feature = "cuda")]
 impl<'a> HerculesCUDARef<'a> {
     pub fn to_cpu_ref<'b, T>(self, dst: &'b mut [T]) -> HerculesCPURefMut<'b> {
-        assert!(dst.as_ptr().is_aligned_to(32));
+        assert!(dst.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
         unsafe {
             let size = self.size;
             assert_eq!(size, dst.len() * size_of::<T>());
@@ -313,7 +313,7 @@ impl<'a> HerculesCUDARefMut<'a> {
     }
 
     pub fn to_cpu_ref<'b, T>(self, dst: &mut [T]) -> HerculesCPURefMut<'b> {
-        assert!(dst.as_ptr().is_aligned_to(32));
+        assert!(dst.as_ptr().is_aligned_to(LARGEST_ALIGNMENT));
         unsafe {
             let size = self.size;
             let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap();
@@ -872,3 +872,42 @@ impl<'a, T> HerculesRefInto<'a> for Box<[T]> {
         HerculesCPURef::from_slice(self)
     }
 }
+
+/*
+ * We need all allocations to be aligned to LARGEST_ALIGNMENT bytes for
+ * vectorization. The easiest way to do that is to install a global allocator
+ * that raises the alignment of every request before forwarding it to the
+ * system allocator.
+ */
+pub struct AlignedAlloc;
+
+unsafe impl GlobalAlloc for AlignedAlloc {
+    /*
+     * Allocate `layout.size()` bytes aligned to at least LARGEST_ALIGNMENT.
+     * Returns null when the over-aligned layout can't be represented. A global
+     * allocator must never unwind, so failure is reported as a null pointer
+     * rather than a panic.
+     */
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        match layout.align_to(LARGEST_ALIGNMENT) {
+            Ok(layout) => System.alloc(layout),
+            Err(_) => std::ptr::null_mut(),
+        }
+    }
+
+    /*
+     * Free a block previously returned by `alloc`. The layout is widened the
+     * same way as in `alloc`, so the system allocator sees the layout the block
+     * was actually allocated with.
+     */
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        // `alloc` only hands out a pointer when widening this layout succeeded,
+        // so the `Err` case can't occur for a valid `ptr`; never panic here.
+        if let Ok(layout) = layout.align_to(LARGEST_ALIGNMENT) {
+            System.dealloc(ptr, layout)
+        }
+    }
+}
+
+#[global_allocator]
+static A: AlignedAlloc = AlignedAlloc;
diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml
index ab35cbaf02237b49aca0a344ec611e77fb55bb11..9b11ddc10b5d185e020d639e677f5b56a2c3b8d0 100644
--- a/hercules_samples/dot/Cargo.toml
+++ b/hercules_samples/dot/Cargo.toml
@@ -17,4 +17,3 @@ hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
-aligned-vec = "*"
diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs
index 7bcaaebaf6694fb29a8fba4fe883500f90034c26..1f28cee28241827277f8836e963a0d80edeb5abc 100644
--- a/hercules_samples/dot/src/main.rs
+++ b/hercules_samples/dot/src/main.rs
@@ -4,14 +4,12 @@
 use hercules_rt::CUDABox;
 use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
 
-use aligned_vec::ABox;
-
 juno_build::juno!("dot");
 
 fn main() {
     async_std::task::block_on(async {
-        let a: ABox<[f32; 8]> = ABox::new(32, [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]);
-        let b: ABox<[f32; 8]> = ABox::new(32, [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]);
+        let a: Box<[f32; 8]> = Box::new([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]);
+        let b: Box<[f32; 8]> = Box::new([0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]);
         let a = HerculesImmBox::from(a.as_ref() as &[f32]);
         let b = HerculesImmBox::from(b.as_ref() as &[f32]);
         let mut r = runner!(dot);
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index cb078c74c5915d1e3f821acf6c94d4243a030a21..29415b511992946b08a1496f3eb92d957615d8aa 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -22,8 +22,8 @@ fn main() {
                 }
             }
         }
-        let a = HerculesImmBox::from(&a as &[f32]);
-        let b = HerculesImmBox::from(&b as &[f32]);
+        let a = HerculesImmBox::from(a.as_ref());
+        let b = HerculesImmBox::from(b.as_ref());
         let mut r = runner!(matmul);
         let mut c = HerculesMutBox::from(r.run(I as u64, J as u64, K as u64, a.to(), b.to()).await);
         for (calc, correct) in zip(c.as_slice().into_iter().map(|x: &mut f32| *x), correct_c) {