diff --git a/.gitignore b/.gitignore
index f0f409c246fa6e8fa3e4c862959e184eb3556108..291a3dd611addcc307a74a444814a32d914982e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
 /target
 *.dot
 *.bc
+*.out
+*.ll
+*.c
+*.o
diff --git a/Cargo.lock b/Cargo.lock
index 2dc1fecb35270afe9fc1a62f0c90eece5b283b08..bfcc04d022442a77417a1dbf3fd1075f51c8d9e1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -191,6 +191,15 @@ dependencies = [
  "ordered-float",
 ]
 
+[[package]]
+name = "hercules_matmul"
+version = "0.1.0"
+dependencies = [
+ "clap",
+ "hercules_rt",
+ "rand",
+]
+
 [[package]]
 name = "hercules_opt"
 version = "0.1.0"
@@ -198,6 +207,13 @@ dependencies = [
  "hercules_ir",
 ]
 
+[[package]]
+name = "hercules_rt"
+version = "0.1.0"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.153"
diff --git a/Cargo.toml b/Cargo.toml
index d6a8629e19126222a539896445cab51e9792716c..8320ec1520c225375638586a3b7c12e5ab4501ce 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,10 @@ members = [
 	"hercules_cg",
 	"hercules_ir",
 	"hercules_opt",
+	"hercules_rt",
+
 	"hercules_tools/hercules_dot",
-	"hercules_tools/hercules_cpu"
+	"hercules_tools/hercules_cpu",
+
+	"hercules_samples/matmul"
 ]
diff --git a/hercules_cg/src/cpu_beta.rs b/hercules_cg/src/cpu_beta.rs
index bb73e1d348b31ccbe1b3e71fd1852364da3d08ce..e6c858fd6b5b9574c5e3ea36d63684bf21754567 100644
--- a/hercules_cg/src/cpu_beta.rs
+++ b/hercules_cg/src/cpu_beta.rs
@@ -165,7 +165,11 @@ pub fn cpu_beta_codegen<W: Write>(
         }
     }
 
-    // Step 4: do codegen for each function.
+    // Step 4: generate dummy zero-initialized global - this is needed so that
+    // there'll be a non-empty .bss section in the ELF object file.
+    write!(w, "@dummy = dso_local global i32 0, align 4\n")?;
+
+    // Step 5: do codegen for each function.
     for function_idx in 0..functions.len() {
         let function = &functions[function_idx];
         let typing = &typing[function_idx];
@@ -176,7 +180,7 @@ pub fn cpu_beta_codegen<W: Write>(
         let fork_join_map = &fork_join_maps[function_idx];
         let fork_join_nest = &fork_join_nests[function_idx];
 
-        // Step 4.1: emit function signature.
+        // Step 5.1: emit function signature.
         let llvm_ret_type = &llvm_types[function.return_type.idx()];
         let mut llvm_params = function
             .param_types
@@ -198,7 +202,7 @@ pub fn cpu_beta_codegen<W: Write>(
         }
         write!(w, ") {{\n")?;
 
-        // Step 4.2: emit basic blocks. A node represents a basic block if its
+        // Step 5.2: emit basic blocks. A node represents a basic block if its
         // entry in the basic blocks vector points to itself. Each basic block
         // is created as four strings: the block header, the block's phis, the
         // block's data computations, and the block's terminator instruction.
@@ -217,7 +221,7 @@ pub fn cpu_beta_codegen<W: Write>(
             }
         }
 
-        // Step 4.3: emit nodes. Nodes are emitted into basic blocks separately
+        // Step 5.3: emit nodes. Nodes are emitted into basic blocks separately
         // as nodes are not necessarily emitted in order. Assemble worklist of
         // nodes, starting as reverse post order of nodes. For non-phi and non-
         // reduce nodes, only emit once all data uses are emitted. In addition,
@@ -261,7 +265,7 @@ pub fn cpu_beta_codegen<W: Write>(
             }
         }
 
-        // Step 4.4: put basic blocks in order.
+        // Step 5.4: put basic blocks in order.
         for node in reverse_postorder {
             if bb[node.idx()] == *node {
                 write!(
@@ -275,7 +279,7 @@ pub fn cpu_beta_codegen<W: Write>(
             }
         }
 
-        // Step 4.5: close function.
+        // Step 5.5: close function.
         write!(w, "}}\n")?;
     }
 
diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..500265c40d3821a0077b16ed65d2dcff9a5075d4
--- /dev/null
+++ b/hercules_rt/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "hercules_rt"
+version = "0.1.0"
+authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+
+[dependencies]
+libc = "0.2" # Same 0.2.x series as the version locked in Cargo.lock.
diff --git a/hercules_rt/src/elf.rs b/hercules_rt/src/elf.rs
new file mode 100644
index 0000000000000000000000000000000000000000..9fc9dc3b60ba34b5646f28785d439d9f1a0d5ee7
--- /dev/null
+++ b/hercules_rt/src/elf.rs
@@ -0,0 +1,246 @@
+extern crate libc;
+
+use std::ffi::CStr;
+use std::mem::size_of;
+use std::ptr::copy_nonoverlapping;
+use std::ptr::null_mut;
+use std::ptr::read_unaligned;
+
+use self::libc::*;
+
+/*
+ * The libc crate doesn't have everything from elf.h, so these things need to be
+ * manually defined.
+ */
+
+/*
+ * A relocation entry with an explicit addend, laid out like Elf64_Rela in
+ * elf.h. The upper 32 bits of r_info hold the symbol table index, and the
+ * lower 32 bits hold the relocation type.
+ */
+#[repr(C)]
+#[derive(Debug)]
+struct Elf64_Rela {
+    r_offset: Elf64_Addr,
+    r_info: Elf64_Xword,
+    r_addend: Elf64_Sxword,
+}
+
+// Relocation types (lower 32 bits of r_info) that parse_elf can apply.
+const R_X86_64_PC32: u64 = 2;
+const R_X86_64_PLT32: u64 = 4;
+// Symbol type (lower 4 bits of st_info) that marks function symbols.
+const STT_FUNC: u8 = 2;
+
+/*
+ * Holds a mmaped copy of .text + .bss for direct execution, plus metadata for
+ * each function. The .bss section holds a table storing addresses to internal
+ * runtime functions, since this is literally easier than patching the object
+ * code to directly jump to those runtime functions.
+ *
+ * function_names[i] and function_pointers[i] describe the same function. A
+ * function "pointer" is the value of the function's symbol, which is used as an
+ * offset from program_section. program_size is the size in bytes of the whole
+ * mmaped region.
+ */
+#[derive(Debug)]
+pub(crate) struct Elf {
+    pub(crate) function_names: Vec<String>,
+    pub(crate) function_pointers: Vec<isize>,
+    pub(crate) program_section: *mut u8,
+    pub(crate) program_size: usize,
+}
+
+impl Drop for Elf {
+    // Unmap the region that parse_elf mmaped for the program.
+    fn drop(&mut self) {
+        unsafe { munmap(self.program_section as *mut _, self.program_size) };
+    }
+}
+
+/*
+ * Function for parsing our internal memory representation of an ELF file from
+ * the raw bytes of an ELF file. This includes creating an executable section of
+ * code, and relocating function calls and global variables. This whole thing is
+ * very unsafe, and is predicated on the elf parameter referencing properly
+ * formatted bytes.
+ *
+ * Only basic sanity checks are made on the input. This function panics if the
+ * bytes don't start with an ELF header, if a required section is missing, if a
+ * table has an unexpected entry size, if a relocation is unsupported or lies
+ * outside of .text, or if the memory for the program can't be set up.
+ */
+pub(crate) unsafe fn parse_elf(elf: &[u8]) -> Elf {
+    fn page_align(n: usize) -> usize {
+        (n + (4096 - 1)) & !(4096 - 1)
+    }
+
+    // Reject inputs that are too short to hold an ELF header, or that don't
+    // start with the ELF magic number, before reading a header out of them.
+    assert!(
+        elf.len() >= size_of::<Elf64_Ehdr>() && elf.starts_with(b"\x7fELF"),
+        "ERROR: Input is not an ELF file."
+    );
+
+    // read_unaligned corresponds to memcpys in C - we need to memcpy structs
+    // out of the file's bytes, since they may be stored without proper
+    // alignment.
+    let header: Elf64_Ehdr = read_unaligned(elf.as_ptr() as *const _);
+    assert!(header.e_shentsize as usize == size_of::<Elf64_Shdr>());
+    let section_header_table: Box<[_]> = (0..header.e_shnum)
+        .map(|idx| {
+            read_unaligned(
+                (elf.as_ptr().offset(header.e_shoff as isize) as *const Elf64_Shdr)
+                    .offset(idx as isize),
+            )
+        })
+        .collect();
+
+    // Look for the .symtab, .strtab, .text, .bss, and .rela.text sections. Only
+    // the .rela.text section is not necessary. Section names are NUL-terminated
+    // and are compared exactly, so that a section like .text.foo is never
+    // mistaken for .text.
+    let mut symtab_ndx = None;
+    let mut strtab_ndx = None;
+    let mut text_ndx = None;
+    let mut bss_ndx = None;
+    let mut rela_text_ndx = None;
+    let shstrtab = &elf[section_header_table[header.e_shstrndx as usize].sh_offset as usize..];
+    for (i, section) in section_header_table.iter().enumerate() {
+        let section_name = shstrtab[section.sh_name as usize..]
+            .split(|&byte| byte == 0)
+            .next()
+            .unwrap_or(&[]);
+        match section_name {
+            b".symtab" => symtab_ndx = Some(i),
+            b".strtab" => strtab_ndx = Some(i),
+            b".text" => text_ndx = Some(i),
+            b".bss" => bss_ndx = Some(i),
+            b".rela.text" => rela_text_ndx = Some(i),
+            _ => {}
+        }
+    }
+    let symtab_ndx = symtab_ndx.expect("ERROR: ELF file has no .symtab section.");
+    let strtab_ndx = strtab_ndx.expect("ERROR: ELF file has no .strtab section.");
+    let text_ndx = text_ndx.expect("ERROR: ELF file has no .text section.");
+    let bss_ndx = bss_ndx.expect("ERROR: ELF file has no .bss section.");
+
+    // Get the headers for the required sections.
+    let symtab_hdr = section_header_table[symtab_ndx];
+    let strtab_hdr = section_header_table[strtab_ndx];
+    let text_hdr = section_header_table[text_ndx];
+    let bss_hdr = section_header_table[bss_ndx];
+
+    // Collect the symbols in the symbol table.
+    assert!(symtab_hdr.sh_entsize as usize == size_of::<Elf64_Sym>());
+    let num_symbols = symtab_hdr.sh_size as usize / size_of::<Elf64_Sym>();
+    let symbol_table: Box<[_]> = (0..num_symbols)
+        .map(|idx| {
+            read_unaligned(
+                (elf.as_ptr().offset(symtab_hdr.sh_offset as isize) as *const Elf64_Sym)
+                    .offset(idx as isize),
+            )
+        })
+        .collect();
+
+    // The mmaped region includes both the .text and .bss sections. Each section
+    // is padded to a multiple of 4096 bytes, so .text can be protected on its own.
+    let text_size = text_hdr.sh_size as usize;
+    let program_size = page_align(text_size) + page_align(bss_hdr.sh_size as usize);
+    let program_base = mmap(
+        null_mut(),
+        program_size,
+        PROT_READ | PROT_WRITE,
+        MAP_PRIVATE | MAP_ANONYMOUS,
+        -1,
+        0,
+    );
+    assert!(program_base != MAP_FAILED, "ERROR: Couldn't mmap memory for the program.");
+    let text_base = program_base as *mut u8;
+    let bss_base = text_base.offset(page_align(text_size) as isize);
+
+    // Copy the object code into the mmaped region. Slicing the file's bytes
+    // makes sure that the .text section actually lies within the file.
+    let text_offset = text_hdr.sh_offset as usize;
+    let text_bytes = &elf[text_offset..text_offset + text_size];
+    copy_nonoverlapping(text_bytes.as_ptr(), text_base, text_bytes.len());
+
+    // If there are relocations, we process them here.
+    if let Some(rela_text_ndx) = rela_text_ndx {
+        let rela_text_hdr = section_header_table[rela_text_ndx];
+        assert!(rela_text_hdr.sh_entsize as usize == size_of::<Elf64_Rela>());
+        let num_relocations = rela_text_hdr.sh_size as usize / size_of::<Elf64_Rela>();
+
+        // We only iterate the relocations in order, so no need to collect.
+        let relocations = (0..num_relocations).map(|idx| {
+            read_unaligned(
+                (elf.as_ptr().offset(rela_text_hdr.sh_offset as isize) as *const Elf64_Rela)
+                    .offset(idx as isize),
+            )
+        });
+        for relocation in relocations {
+            let symbol_idx = relocation.r_info >> 32;
+            let ty = relocation.r_info & 0xFFFFFFFF;
+
+            // We support PLT32 relocations only in the .text section, and PC32
+            // relocations only in the .bss section.
+            let section_base = match ty {
+                R_X86_64_PLT32 => text_base,
+                R_X86_64_PC32 => bss_base,
+                _ => panic!("ERROR: Unrecognized relocation type: {}.", ty),
+            };
+
+            // Every supported relocation patches 4 bytes of .text.
+            let patch_offset = relocation.r_offset as usize;
+            assert!(
+                patch_offset <= text_size && text_size - patch_offset >= 4,
+                "ERROR: Relocation is outside of the .text section."
+            );
+            let patch_address = text_base.offset(patch_offset as isize);
+
+            // The patched value is (symbol + addend - patched address),
+            // truncated to 32 bits. It's computed with wrapping integer
+            // arithmetic, since intermediate addresses may lie outside of the
+            // mmaped region (e.g. a negative addend applied to a symbol at the
+            // start of a section), which pointer offsetting doesn't allow.
+            let symbol_value = symbol_table[symbol_idx as usize].st_value as i64;
+            let patch = (section_base as i64)
+                .wrapping_add(symbol_value)
+                .wrapping_add(relocation.r_addend)
+                .wrapping_sub(patch_address as i64);
+            (patch_address as *mut u32).write_unaligned(patch as u32);
+        }
+    }
+
+    // Make the .text section readable and executable. The .bss section should
+    // still be readable and writable.
+    let protect_result = mprotect(
+        text_base as *mut c_void,
+        page_align(text_size),
+        PROT_READ | PROT_EXEC,
+    );
+    assert!(protect_result == 0, "ERROR: Couldn't make the .text section executable.");
+
+    // Construct the final in-memory ELF representation. Look up the names of
+    // function symbols in the string table.
+    let strtab = &elf[strtab_hdr.sh_offset as usize..];
+    let mut elf = Elf {
+        function_names: vec![],
+        function_pointers: vec![],
+        program_section: program_base as *mut u8,
+        program_size,
+    };
+    for symbol in symbol_table.iter() {
+        if symbol.st_info & 0xF == STT_FUNC {
+            let function_name_base = &strtab[symbol.st_name as usize..];
+            let function_name = CStr::from_ptr(function_name_base.as_ptr() as *const _)
+                .to_str()
+                .unwrap()
+                .to_owned();
+            elf.function_names.push(function_name);
+            elf.function_pointers.push(symbol.st_value as isize);
+        }
+    }
+
+    elf
+}
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
new file mode 100644
index 0000000000000000000000000000000000000000..f04c6c6aed58926330e1dc87614b90a51c962883
--- /dev/null
+++ b/hercules_rt/src/lib.rs
@@ -0,0 +1,71 @@
+use std::fs::File;
+use std::io::prelude::*;
+use std::path::Path;
+
+pub(crate) mod elf;
+pub(crate) use crate::elf::*;
+
+/*
+ * A loaded Hercules module: wraps the in-memory representation of the object
+ * file that parse_elf produces.
+ */
+#[derive(Debug)]
+pub struct Module {
+    elf: Elf,
+}
+
+impl Module {
+    /*
+     * Get the address of the function called name inside the loaded code.
+     * Panics if the module doesn't contain a function with that name.
+     */
+    pub fn get_function_ptr(&self, name: &str) -> *mut u8 {
+        let function_idx = self
+            .elf
+            .function_names
+            .iter()
+            .position(|s| s == name)
+            .unwrap_or_else(|| panic!("ERROR: No function named {} in module.", name));
+        unsafe {
+            self.elf
+                .program_section
+                .offset(self.elf.function_pointers[function_idx])
+        }
+    }
+}
+
+/*
+ * Load the ELF object file at path into memory, so that the functions inside
+ * can be looked up. Panics if the file can't be opened or read. The contents
+ * of the file are handed to parse_elf, which trusts them to be well formed.
+ */
+pub fn load_binary(path: &Path) -> Module {
+    let mut f = File::open(path)
+        .unwrap_or_else(|err| panic!("ERROR: Couldn't open {}: {}.", path.display(), err));
+    let mut buffer = vec![];
+    f.read_to_end(&mut buffer)
+        .unwrap_or_else(|err| panic!("ERROR: Couldn't read {}: {}.", path.display(), err));
+    let elf = unsafe { parse_elf(buffer.as_slice()) };
+    Module { elf }
+}
+
+/*
+ * An ugly, unchecked macro for looking up Hercules functions in a module. Curse
+ * Rust for not supporting variadic generics and type pattern matching :shrug:.
+ * The parameter types are followed by a comma, then by => and the return type.
+ * The result is an extern "C" function pointer: the loaded code isn't produced
+ * by rustc, so it can't be assumed to follow Rust's unspecified default ABI.
+ * Nothing checks that the given types match the function that gets loaded.
+ * TODO: Generate per-lookup struct type for checking that the provided types
+ * are correct.
+ */
+#[macro_export]
+macro_rules! lookup_function {
+    ($module:expr, $function:expr, $($param_ty:ty),*, => $ret_ty:ty) => {
+        {
+            let fn_ptr: extern "C" fn($($param_ty),*) -> $ret_ty =
+                unsafe { ::std::mem::transmute($module.get_function_ptr($function)) };
+            fn_ptr
+        }
+    };
+}
diff --git a/samples/ccp_example.hir b/hercules_samples/ccp_example.hir
similarity index 100%
rename from samples/ccp_example.hir
rename to hercules_samples/ccp_example.hir
diff --git a/samples/fork_join.hir b/hercules_samples/fork_join.hir
similarity index 100%
rename from samples/fork_join.hir
rename to hercules_samples/fork_join.hir
diff --git a/samples/gvn_example.hir b/hercules_samples/gvn_example.hir
similarity index 100%
rename from samples/gvn_example.hir
rename to hercules_samples/gvn_example.hir
diff --git a/samples/invalid/bad_phi.hir b/hercules_samples/invalid/bad_phi.hir
similarity index 100%
rename from samples/invalid/bad_phi.hir
rename to hercules_samples/invalid/bad_phi.hir
diff --git a/samples/invalid/bad_phi2.hir b/hercules_samples/invalid/bad_phi2.hir
similarity index 100%
rename from samples/invalid/bad_phi2.hir
rename to hercules_samples/invalid/bad_phi2.hir
diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..945032691547814c74da7ac63e7d4a9f61a54196
--- /dev/null
+++ b/hercules_samples/matmul/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "hercules_matmul"
+version = "0.1.0"
+authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+
+[dependencies]
+clap = { version = "*", features = ["derive"] }
+hercules_rt = { path = "../../hercules_rt" }
+rand = "*"
diff --git a/samples/matmul.hir b/hercules_samples/matmul/matmul.hir
similarity index 100%
rename from samples/matmul.hir
rename to hercules_samples/matmul/matmul.hir
diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs
new file mode 100644
index 0000000000000000000000000000000000000000..02aeb3a4dc97d28bc45e9221f244e2b13f158bee
--- /dev/null
+++ b/hercules_samples/matmul/src/main.rs
@@ -0,0 +1,40 @@
+extern crate clap;
+
+use std::path::Path;
+
+/*
+ * Load the matmul function from test.o, call it on two 2x2 input matrices and
+ * one 2x2 output matrix, and print the output matrix.
+ */
+fn main() {
+    let module = hercules_rt::load_binary(Path::new("test.o"));
+
+    let matmul = hercules_rt::lookup_function!(
+        module,
+        "matmul",
+        *const f32,
+        *const f32,
+        *mut f32,
+        u64,
+        u64,
+        u64,
+        => *const f32
+    );
+
+    let a = [[1.0f32, 2.0f32], [3.0f32, 4.0f32]];
+    let b = [[5.0f32, 6.0f32], [7.0f32, 8.0f32]];
+    let mut c = [[0.0f32, 0.0f32], [0.0f32, 0.0f32]];
+
+    // Nested arrays are stored contiguously, so a pointer to the first row can
+    // be cast to a pointer to the first element. Note that this call runs the
+    // loaded object code, and nothing checks the types given to the lookup.
+    matmul(
+        a.as_ptr() as *const f32,
+        b.as_ptr() as *const f32,
+        c.as_mut_ptr() as *mut f32,
+        2,
+        2,
+        2,
+    );
+    println!("{} {}\n{} {}", c[0][0], c[0][1], c[1][0], c[1][1]);
+}
diff --git a/samples/simple1.hir b/hercules_samples/simple1.hir
similarity index 100%
rename from samples/simple1.hir
rename to hercules_samples/simple1.hir
diff --git a/samples/strset.hir b/hercules_samples/strset.hir
similarity index 100%
rename from samples/strset.hir
rename to hercules_samples/strset.hir