extern crate bitvec;
extern crate hercules_ir;
extern crate hercules_rt;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::iter::zip;
use std::fmt::Write;
use self::bitvec::prelude::*;
use self::hercules_ir::*;
use self::hercules_rt::manifest::*;
use crate::*;
/*
* When assembling LLVM basic blocks, we traverse the nodes in a partition in an
* ad-hoc order. Thus, we cannot assume block terminators will be visited after
* data nodes, for example. However, textual LLVM IR requires that the
* terminator instruction is last. So, we emit nodes into separate strings of
* LLVM IR that will get stitched together when the block is complete.
*/
#[derive(Debug)]
struct LLVMBlock {
header: String,
phis: String,
data: String,
terminator: String,
}
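// As an illustrative sketch (block and register numbers hypothetical), a
// completed block is printed as header ++ phis ++ data ++ terminator:
//
//   bb_4:
//   %virt.7 = phi i32 [ %virt.5, %bb_2 ], [ %virt.6, %bb_3 ]
//   %virt.9 = add i32 %virt.7, %virt.8
//   br label %bb_5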
impl<'a> FunctionContext<'a> {
/*
* Top level function to generate code for a partition, targeting the CPU.
*/
pub(crate) fn codegen_cpu_partition<W: Write>(
&self,
top_node: NodeID,
w: &mut W,
) -> Result<PartitionManifest, std::fmt::Error> {
// Step 1: do some analysis to get a bunch of per-partition information.
let partition_id = self.plan.partitions[top_node.idx()];
let partition_context = PartitionContext::new(self, partition_id, top_node);
// Step 2: emit the function signature. The partition function
// parameters are the function parameters, the partition data inputs,
// the array constant pointers, and the dynamic constants.
let mut partition_function_parameters = partition_context
// The data inputs to this partition. These are the data values
// calculated in a different partition in the same function.
.partition_input_types
.iter()
.enumerate()
.map(|(idx, ty_id)| {
(
self.llvm_types[ty_id.idx()].clone(),
format!("%part_arg.{}", idx),
)
})
// The input types of the overall function.
.chain(
self.function
.param_types
.iter()
.enumerate()
.map(|(idx, ty_id)| {
(
self.llvm_types[ty_id.idx()].clone(),
format!("%func_arg.{}", idx),
)
}),
)
// Array constants are passed in, pre-initialized.
.chain(
self.partition_array_constant_inputs()
.into_iter()
.map(|(id, ty_id)| {
(
self.llvm_types[ty_id.idx()].clone(),
format!("%cons.{}", id.idx()),
)
}),
)
// Dynamic constants are passed in, since they are only known right
// before runtime.
.chain(
self.partition_dynamic_constant_inputs()
.into_iter()
.map(|id| ("i64".to_string(), format!("%dyn_cons.{}", id.idx()))),
);
write!(
w,
"define {} @{}_part_{}(",
generate_type_string(&partition_context.return_type, &self.llvm_types),
self.function.name,
partition_id.idx(),
)?;
let (first_ty, first_param) = partition_function_parameters.next().unwrap();
write!(w, "{} {}", first_ty, first_param)?;
for (ty, param) in partition_function_parameters {
write!(w, ", {} {}", ty, param)?;
}
write!(w, ") {{\n")?;
// Step 3: set up basic blocks. A node represents a basic block if its
// entry in the basic blocks vector points to itself.
let mut llvm_bbs = HashMap::new();
for id in &self.partitions_inverted_map[partition_id.idx()] {
if self.bbs[id.idx()] == *id {
llvm_bbs.insert(
id,
LLVMBlock {
header: format!("bb_{}:\n", id.idx()),
phis: "".to_string(),
data: "".to_string(),
terminator: "".to_string(),
},
);
}
}
// Step 4: emit nodes. Nodes are emitted into basic blocks separately,
// since nodes are not necessarily emitted in order. Assemble a worklist
// of nodes, seeded with the reverse post-order of the nodes. For non-phi
// and non-reduce nodes, only emit once all data uses have been emitted.
// In addition, consider additional anti-dependence edges from read to
// write nodes.
let mut visited = bitvec![u8, Lsb0; 0; self.function.nodes.len()];
let mut worklist = VecDeque::from(partition_context.reverse_postorder.clone());
while let Some(id) = worklist.pop_front() {
if !(self.function.nodes[id.idx()].is_phi()
|| self.function.nodes[id.idx()].is_reduce())
&& !get_uses(&self.function.nodes[id.idx()])
.as_ref()
.into_iter()
// If this node isn't a phi or reduce, we need to check that
// all uses, as well as all reads we anti-depend with, have
// been emitted.
.chain(self.antideps.iter().filter_map(|(read, write)| {
if id == *write {
Some(read)
} else {
None
}
}))
// Only data dependencies inside this partition need to have
// already been visited.
.all(|id| {
self.plan.partitions[id.idx()] != partition_id
|| self.function.nodes[id.idx()].is_control()
|| visited[id.idx()]
})
{
// Skip emitting this node for now if it isn't a phi or reduce
// node and not all of its data uses have been emitted yet.
worklist.push_back(id);
} else {
// Once all of the data dependencies for this node are emitted,
// this node can be emitted. For reduce nodes specifically, we
// want to emit the phi in the fork's basic block, not the
// join's, so we handle that ugly case here. This is because
// there is a fundamental mismatch between Hercules' notion of
// reductions and LLVM's phi nodes. This is ok, since we can
// translate between the two. It's just a pain.
let bb = if let Node::Reduce {
control,
init: _,
reduct: _,
} = self.function.nodes[id.idx()]
{
// Figure out the fork corresponding to the associated join.
let fork_id = if let Node::Join { control } = self.function.nodes[control.idx()]
{
if let Type::Control(factors) =
&self.types[self.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!("PANIC: Type of join node associated with reduce node is not a control type.")
}
} else {
panic!("PANIC: Node associated with reduce node isn't a join node.")
};
// Emit in the basic block of the fork.
llvm_bbs.get_mut(&self.bbs[fork_id.idx()]).unwrap()
} else {
// In the normal case, emit in the basic block the node has
// been actually assigned to.
llvm_bbs.get_mut(&self.bbs[id.idx()]).unwrap()
};
partition_context.codegen_cpu_node(id, bb)?;
visited.set(id.idx(), true);
}
}
// Step 5: emit the now completed basic blocks, in order. Emit a dummy
// header block to unconditionally jump to the "top" basic block.
write!(w, "bb_header:\n br label %bb_{}\n", top_node.idx())?;
for id in partition_context.reverse_postorder {
if self.bbs[id.idx()] == id {
write!(
w,
"{}{}{}{}",
llvm_bbs[&id].header,
llvm_bbs[&id].phis,
llvm_bbs[&id].data,
llvm_bbs[&id].terminator
)?;
}
}
// Step 6: close the partition function - we're done. The partition
// manifest is created by the partition context.
write!(w, "}}\n\n")?;
Ok(partition_context.manifest)
}
}
impl<'a> PartitionContext<'a> {
/*
* Emit LLVM IR implementing a single node.
*/
fn codegen_cpu_node(&self, id: NodeID, bb: &mut LLVMBlock) -> std::fmt::Result {
// Helper to emit code to index a collection. All collections are
// pointers to some memory at the LLVM IR level. This memory is passed
// in as a parameter for anything involving arrays, and is alloca-ed for
// product and summation types.
let mut generate_index_code = |collect: NodeID, indices: &[Index]| -> std::fmt::Result {
// Step 1: calculate the list of collection types corresponding to
// each index.
let mut collection_ty_ids = vec![];
let mut curr_ty_id = self.function.typing[collect.idx()];
for index in indices {
match (index, &self.function.types[curr_ty_id.idx()]) {
(Index::Field(idx), Type::Product(ty_ids))
| (Index::Variant(idx), Type::Summation(ty_ids)) => {
collection_ty_ids.push(curr_ty_id);
curr_ty_id = ty_ids[*idx];
}
(Index::Position(_), Type::Array(elem_ty_id, _)) => {
collection_ty_ids.push(curr_ty_id);
curr_ty_id = *elem_ty_id;
}
_ => {
panic!("PANIC: Found unsupported combination of index and collection type.")
}
}
}
assert!(
self.function.types[curr_ty_id.idx()].is_primitive(),
"PANIC: Cannot generate partial indexing code."
);
// Step 2: calculate, as LLVM IR values, the stride and offset
// needed at each level of the collection. For products, the stride
// is calculated using a getelementptr hack (and is the size of the
// struct), and the offset corresponds to the field index (which is
// translated to an offset using another getelementptr hack). For
// arrays, the stride is the dynamic constant extent multiplied by
// the stride of the element type, and the offset is the position
// index multiplied by the stride of the element type. Additionally,
// emit code to add up all of the offsets to get a total offset into
// the collection. TODO: to support summations, and arrays in
// arbitrary places, we need to not use the hacky getelementptr
// technique, since LLVM IR can't represent arrays (in the Hercules
// sense) or summations as primitive types. Instead, we need to do
// collection memory layout entirely ourselves.
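// As a sketch of what this emits (type and register names hypothetical),
// the size-of hack for a product type { i64, float } expands to:
//
//   %index3.1.stride.ptrhack = getelementptr { i64, float }, ptr null, i64 1
//   %index3.1.stride = ptrtoint ptr %index3.1.stride.ptrhack to i64
//
// and the offset of field 1 uses the same trick with a field index:
//
//   %index3.1.offset.ptrhack = getelementptr { i64, float }, ptr null, i64 0, i64 1
//   %index3.1.offset = ptrtoint ptr %index3.1.offset.ptrhack to i64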
let elem_llvm_ty = &self.function.llvm_types[curr_ty_id.idx()];
write!(bb.data, " %index{}.{}.total_offset = add i64 0, 0\n %index{}.{}.stride.ptrhack = getelementptr {}, ptr null, i64 1\n %index{}.{}.stride = ptrtoint ptr %index{}.{}.stride.ptrhack to i64\n",
id.idx(), indices.len(), id.idx(), indices.len(), elem_llvm_ty, id.idx(), indices.len(), id.idx(), indices.len()
)?;
for (idx, index) in indices.into_iter().enumerate().rev() {
match index {
Index::Field(field) => {
let product_llvm_ty =
&self.function.llvm_types[collection_ty_ids[idx].idx()];
write!(
bb.data,
" %index{}.{}.stride.ptrhack = getelementptr {}, ptr null, i64 1\n %index{}.{}.stride = ptrtoint ptr %index{}.{}.stride.ptrhack to i64\n %index{}.{}.offset.ptrhack = getelementptr {}, ptr null, i64 0, i64 {}\n %index{}.{}.offset = ptrtoint ptr %index{}.{}.offset.ptrhack to i64\n",
id.idx(), idx,
product_llvm_ty,
id.idx(), idx,
id.idx(), idx,
id.idx(), idx,
product_llvm_ty,
field,
id.idx(), idx,
id.idx(), idx,
)?;
}
Index::Variant(_) => todo!(),
Index::Position(position) => {
let array_extents = self.function.types[collection_ty_ids[idx].idx()]
.try_extents()
.unwrap();
// TODO: calculate stride for arrays, needed for arrays
// nested in other collections.
write!(bb.data, " %index{}.{}.offset.add.0 = add ", id.idx(), idx)?;
self.cpu_emit_use_of_node(position[0], Some(id), true, &mut bb.data)?;
write!(bb.data, ", {}\n", 0)?;
for (dim_idx, (extent_dc_id, position_id)) in
zip(array_extents, position.into_iter()).enumerate().skip(1)
{
write!(
bb.data,
" %index{}.{}.offset.mul.{} = mul i64 {}, %index{}.{}.offset.add.{}\n",
id.idx(), idx,
dim_idx,
self.function.llvm_dynamic_constants[extent_dc_id.idx()],
id.idx(), idx,
dim_idx - 1
)?;
write!(
bb.data,
" %index{}.{}.offset.add.{} = add ",
id.idx(),
idx,
dim_idx
)?;
self.cpu_emit_use_of_node(*position_id, Some(id), true, &mut bb.data)?;
write!(
bb.data,
", %index{}.{}.offset.mul.{}\n",
id.idx(),
idx,
dim_idx
)?;
}
write!(bb.data, " %index{}.{}.offset = mul i64 %index{}.{}.stride, %index{}.{}.offset.add.{}\n", id.idx(), idx, id.idx(), idx + 1, id.idx(), idx, position.len() - 1)?;
}
Index::Control(_) => panic!(
"PANIC: Found control index when generating collection indexing code."
),
}
write!(
bb.data,
" %index{}.{}.total_offset = add i64 %index{}.{}.total_offset, %index{}.{}.offset\n",
id.idx(), idx,
id.idx(), idx + 1,
id.idx(), idx
)?;
}
// Step 3: emit the getelementptr using the total collection offset.
write!(bb.data, " %index{} = getelementptr i8, ", id.idx(),)?;
self.cpu_emit_use_of_node(collect, Some(id), true, &mut bb.data)?;
write!(bb.data, ", i64 %index{}.0.total_offset\n", id.idx())?;
Ok(())
};
// Helper to find the basic block corresponding to a particular control
// predecessor, for phi nodes. This is needed for when a predecessor
// basic block is in a different partition. In this case, the phi's
// control predecessor is set to the top block of the partition.
let get_phi_predecessor = |pred_id: NodeID| {
if self.function.plan.partitions[pred_id.idx()] == self.partition_id {
format!("{}", self.function.bbs[pred_id.idx()].idx())
} else {
format!("header")
}
};
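// For example (numbers hypothetical): a predecessor inside this partition
// yields "7", so the phi edge below reads "%bb_7"; a predecessor in a
// different partition yields "header", so the edge reads "%bb_header",
// the dummy entry block emitted by codegen_cpu_partition.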
// Emit the primary IR for each node.
match self.function.function.nodes[id.idx()] {
Node::Start | Node::Region { preds: _ } => {
// Basic blocks containing a start or region node branch
// unconditionally to their single successor.
let successor = self
.function
.def_use
.get_users(id)
.iter()
.filter(|id| self.function.function.nodes[id.idx()].is_strictly_control())
.next()
.unwrap();
bb.terminator = format!(" br label %bb_{}\n", successor.idx());
}
Node::If { control: _, cond } => {
let successors = self.function.def_use.get_users(id);
// Determine the order of the successors (true/false or false/
// true) in the successors slice.
let rev = if let Node::Read {
collect: _,
indices,
} = &self.function.function.nodes[successors[0].idx()]
{
indices[0] != Index::Control(0)
} else {
panic!("PANIC: Successor of if node isn't a read node.")
};
bb.terminator = " br ".to_string();
self.cpu_emit_use_of_node(cond, Some(id), true, &mut bb.terminator)?;
write!(
bb.terminator,
", label %bb_{}, label %bb_{}\n",
successors[(!rev) as usize].idx(),
successors[rev as usize].idx()
)?;
}
Node::Fork { control, factor: _ } => {
// Calculate the join and successor.
let join = self.function.fork_join_map[&id];
let successor = self
.function
.def_use
.get_users(id)
.iter()
.filter(|id| self.function.function.nodes[id.idx()].is_strictly_control())
.next()
.unwrap();
// Create the phi node for the loop index. This is used directly
// by any thread ID user nodes. The control predecessor basic
// blocks are the control node preceding the fork and the
// corresponding join.
write!(bb.phis, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.phis)?;
write!(
bb.phis,
" = phi i64 [ 0, %bb_{} ], [ %fork_inc{}, %bb_{} ]\n",
get_phi_predecessor(self.function.bbs[control.idx()]),
id.idx(),
get_phi_predecessor(self.function.bbs[join.idx()]),
)?;
// Increment the loop index by one each iteration.
write!(bb.data, " %fork_inc{} = add i64 1, ", id.idx())?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, "\n")?;
// Branch to the successor basic block.
write!(
bb.terminator,
" br label %bb_{}\n",
self.function.bbs[successor.idx()].idx()
)?;
}
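// As an illustrative sketch (node and block numbers hypothetical), a fork
// with ID 10, predecessor block bb_2, join block bb_15, and successor
// block bb_11 lowers to:
//
//   %virt.10 = phi i64 [ 0, %bb_2 ], [ %fork_inc10, %bb_15 ]
//   %fork_inc10 = add i64 1, %virt.10
//   br label %bb_11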
Node::Join { control } => {
// Get the fork, its factor, and the successor to this join.
let fork_id = if let Type::Control(factors) =
&self.function.types[self.function.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!("PANIC: The type of a join node is incorrect.")
};
let factor = if let Node::Fork { control: _, factor } =
&self.function.function.nodes[fork_id.idx()]
{
*factor
} else {
panic!("PANIC: The node referenced by the control type of a join node is not a fork.")
};
let successor = self
.function
.def_use
.get_users(id)
.iter()
.filter(|id| self.function.function.nodes[id.idx()].is_strictly_control())
.next()
.unwrap();
// Form the bottom of the loop. Check if the loop is finished,
// and branch between the successor and the fork. The structure
// of this loop implies that fork-joins have to iterate at least
// once. Change the loop termination branch target if this is a
// control return (see comment below for more details).
let is_control_return = self.control_returns.contains(&id);
write!(
bb.terminator,
" %join_cond{} = icmp ult i64 %fork_inc{}, {}\n",
id.idx(),
fork_id.idx(),
self.function.llvm_dynamic_constants[factor.idx()]
)?;
write!(
bb.terminator,
" br i1 %join_cond{}, label %bb_{}, label %bb_{}\n",
id.idx(),
self.function.bbs[fork_id.idx()].idx(),
if is_control_return {
format!("{}_join_cr", id.idx())
} else {
format!("{}", self.function.bbs[successor.idx()].idx())
}
)?;
// Join nodes are the only node that can be a control return
// from a partition and generate a conditional branch. This
// means we have to do this really ugly hack where we insert
// another basic block to be the control return that we
// conditionally branch to. Other control nodes that may be
// control returns don't have this problem, because they always
// unconditionally branch to their destination. We add this LLVM
// IR text of a new basic block in the terminator of the current
// basic block, since we don't have mutable access here to the
// set of all LLVM basic blocks.
if is_control_return {
write!(bb.terminator, "bb_{}_join_cr:\n", id.idx())?;
}
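// Illustrative emitted text for a control-return join (IDs and trip
// count hypothetical):
//
//   %join_cond12 = icmp ult i64 %fork_inc10, 8
//   br i1 %join_cond12, label %bb_10, label %bb_12_join_cr
//   bb_12_join_cr:
//
// The return sequence appended at the bottom of codegen_cpu_node then
// acts as the terminator of this extra block.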
}
Node::Phi {
control: _,
ref data,
} => {
// For each predecessor of the associated region, we determine
// if that predecessor is in this partition or not. If so, then
// the predecessor control is just the basic block of the
// predecessor control node. If not, the predecessor control is
// the first basic block of the partition. The corresponding
// datum also needs to be provided by argument to the partition,
// and this is handled by cpu_emit_use_of_node.
let pred_ids =
get_uses(&self.function.function.nodes[self.function.bbs[id.idx()].idx()]);
let mut control_datum_pairs = zip(data.into_iter(), pred_ids.as_ref().iter())
.map(|(datum, pred_id)| (*datum, get_phi_predecessor(*pred_id)));
// TODO: this code burns my eyes to look at, it might be worth
// making this not carcinogenic.
write!(bb.phis, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.phis)?;
write!(
bb.phis,
" = phi {} [ ",
self.function.llvm_types[self.function.typing[id.idx()].idx()]
)?;
let (first_data, first_control) = control_datum_pairs.next().unwrap();
self.cpu_emit_use_of_node(first_data, Some(id), false, &mut bb.phis)?;
write!(bb.phis, ", %bb_{} ]", first_control)?;
for (data, control) in control_datum_pairs {
write!(bb.phis, ", [ ")?;
self.cpu_emit_use_of_node(data, Some(id), false, &mut bb.phis)?;
write!(bb.phis, ", %bb_{} ]", control)?;
}
write!(bb.phis, "\n")?;
}
Node::ThreadID { control } => {
// Just bitcast the loop index from the fork. The bitcast is a
// no-op, but we add it to copy the value from the virtual
// register the fork generates to the virtual register
// corresponding to this thread ID node.
assert!(self.function.function.nodes[control.idx()].is_fork());
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, " = bitcast i64 ",)?;
self.cpu_emit_use_of_node(control, Some(id), false, &mut bb.data)?;
write!(bb.data, " to i64\n",)?;
}
Node::Reduce {
control,
init,
reduct,
} => {
// Figure out the fork corresponding to the associated join.
let fork_id = if let Node::Join { control } =
self.function.function.nodes[control.idx()]
{
if let Type::Control(factors) =
&self.function.types[self.function.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!("PANIC: Type of join node associated with reduce node is not a control type.")
}
} else {
panic!("PANIC: Node associated with reduce node isn't a join node.")
};
// Figure out the fork's predecessor.
let pred = if let Node::Fork { control, factor: _ } =
self.function.function.nodes[fork_id.idx()]
{
control
} else {
panic!("PANIC: Node referenced in type of join node associated with a reduce node is not a fork node.")
};
// Reduce nodes just lower to phi nodes. We already did the ugly
// hack so that "bb" refers to the basic block of the fork,
// rather than the join. So, now we just need to emit the phi.
write!(bb.phis, " ")?;
self.cpu_emit_use_of_node(id, Some(id), false, &mut bb.phis)?;
write!(
bb.phis,
" = phi {} [ ",
self.function.llvm_types[self.function.typing[id.idx()].idx()]
)?;
self.cpu_emit_use_of_node(init, Some(id), false, &mut bb.phis)?;
write!(
bb.phis,
", %bb_{} ], [ ",
get_phi_predecessor(self.function.bbs[pred.idx()])
)?;
self.cpu_emit_use_of_node(reduct, Some(id), false, &mut bb.phis)?;
write!(
bb.phis,
", %bb_{} ]\n",
get_phi_predecessor(self.function.bbs[control.idx()])
)?;
}
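// Sketch of the emitted phi (register and block numbers hypothetical),
// placed in the fork's block rather than the join's:
//
//   %virt.21 = phi i64 [ %virt.20, %bb_3 ], [ %virt.24, %bb_18 ]
//
// where %virt.20 is the init value (incoming from the fork's predecessor)
// and %virt.24 is the reduct value (incoming from the join's block).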
// These nodes are handled by other mechanisms in the code lowering
// process.
Node::Return {
control: _,
data: _,
}
| Node::Parameter { index: _ }
| Node::Constant { id: _ }
| Node::DynamicConstant { id: _ } => {}
Node::Binary { left, right, op } => {
let op = match op {
BinaryOperator::Add => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fadd"
} else {
"add"
}
}
BinaryOperator::Sub => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fsub"
} else {
"sub"
}
}
BinaryOperator::Mul => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fmul"
} else {
"mul"
}
}
BinaryOperator::Div => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fdiv"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"udiv"
} else {
"sdiv"
}
}
BinaryOperator::Rem => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"frem"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"urem"
} else {
"srem"
}
}
BinaryOperator::LT => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp olt"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp ult"
} else {
"icmp slt"
}
}
BinaryOperator::LTE => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp ole"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp ule"
} else {
"icmp sle"
}
}
BinaryOperator::GT => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp ogt"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp ugt"
} else {
"icmp sgt"
}
}
BinaryOperator::GTE => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp oge"
} else if self.function.types[self.function.typing[left.idx()].idx()]
.is_unsigned()
{
"icmp uge"
} else {
"icmp sge"
}
}
BinaryOperator::EQ => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp oeq"
} else {
"icmp eq"
}
}
BinaryOperator::NE => {
if self.function.types[self.function.typing[left.idx()].idx()].is_float() {
"fcmp one"
} else {
"icmp ne"
}
}
BinaryOperator::Or => "or",
BinaryOperator::And => "and",
BinaryOperator::Xor => "xor",
BinaryOperator::LSh => "shl",
BinaryOperator::RSh => {
if self.function.types[self.function.typing[left.idx()].idx()].is_unsigned()
{
"lshr"
} else {
"ashr"
}
}
};
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, " = {} ", op)?;
self.cpu_emit_use_of_node(left, Some(id), true, &mut bb.data)?;
write!(bb.data, ", ")?;
self.cpu_emit_use_of_node(right, Some(id), false, &mut bb.data)?;
write!(bb.data, "\n")?;
}
Node::Read {
collect,
ref indices,
} => {
if self.function.function.nodes[collect.idx()].is_strictly_control() {
// Read nodes may be projection successors of if or match
// nodes.
let successor = self.function.def_use.get_users(id)[0];
write!(
bb.terminator,
" br label %bb_{}\n",
self.function.bbs[successor.idx()].idx()
)?;
} else {
generate_index_code(collect, indices)?;
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, Some(id), false, &mut bb.data)?;
write!(
bb.data,
" = load {}, ptr %index{}\n",
self.function.llvm_types[self.function.typing[id.idx()].idx()],
id.idx(),
)?;
}
}
Node::Write {
collect,
data,
ref indices,
} => {
generate_index_code(collect, indices)?;
write!(
bb.data,
" store {} ",
self.function.llvm_types[self.function.typing[data.idx()].idx()]
)?;
self.cpu_emit_use_of_node(data, Some(id), false, &mut bb.data)?;
write!(bb.data, ", ptr %index{}\n", id.idx())?;
// We can't just "copy" in LLVM IR, but we want to forward the
// pointer, unchanged, as the "output" of this write node. The
// easiest way to do this is to insert a useless bitcast.
write!(bb.data, " ")?;
self.cpu_emit_use_of_node(id, None, false, &mut bb.data)?;
write!(bb.data, " = bitcast ptr ")?;
self.cpu_emit_use_of_node(collect, Some(id), false, &mut bb.data)?;
write!(bb.data, " to ptr\n")?;
}
_ => {
eprintln!("TO LOWER: {:?}", self.function.function.nodes[id.idx()]);
}
}
// If this node is a control return, we emit a return from this
// partition function.
if self.control_returns.contains(&id) {
// Get rid of the old terminator and replace it with a return.
// Don't do this if this node is a join node, since in that case
// we generate special control return logic. See the join node
// codegen above for more details.
if !self.function.function.nodes[id.idx()].is_join() {
bb.terminator.clear();
}
// Making structs from the aggregated values in LLVM IR is a pain.
// We need to, one-by-one, insertvalue each element into the struct.
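// Sketch of the chain for two data outputs (IDs and types hypothetical):
//
//   %ret_agg14.0 = insertvalue { i32, i64 } undef, i32 %virt.9, 0
//   %ret_agg14.1 = insertvalue { i32, i64 } %ret_agg14.0, i64 %virt.12, 1
//   ret { i32, i64 } %ret_agg14.1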
let ret_ty_str = generate_type_string(&self.return_type, &self.function.llvm_types);
for (idx, data_output_id) in self.data_outputs.iter().enumerate() {
write!(
bb.terminator,
" %ret_agg{}.{} = insertvalue {} {}, ",
id.idx(),
idx,
ret_ty_str,
if idx == 0 {
"undef".to_string()
} else {
format!("%ret_agg{}.{}", id.idx(), idx - 1)
}
)?;
let mut data_output_id = *data_output_id;
// Handle reduce specially here. Technically, the "user" here is
// the join node, so cpu_emit_use_of_node would normally emit
// the reduce node's virtual register directly. However, if a
// data output is the result of a reduce node, that use is
// definitely outside of the corresponding fork-join. Thus, we
// actually need to use the reduction use of the reduce node.
// This all only applies if the reduce node is in the current
// partition. If not, then use the reduce node as the argument
// to cpu_emit_use_of_node as normal, so that the partition
// function argument is properly used.
while let Node::Reduce {
control: _,
init: _,
reduct,
} = self.function.function.nodes[data_output_id.idx()]
&& self.partition_id == self.function.plan.partitions[data_output_id.idx()]
{
data_output_id = reduct;
}
self.cpu_emit_use_of_node(data_output_id, None, true, &mut bb.terminator)?;
write!(bb.terminator, ", {}\n", idx)?;
}
// Now, we can return the aggregate value we calculated.
if self.data_outputs.is_empty() && self.control_returns.len() == 1 {
// If there are no data outputs, just return the empty struct.
write!(bb.terminator, " ret {} zeroinitializer\n", ret_ty_str)?;
} else if self.data_outputs.is_empty() {
// If there are multiple control returns, we need to return the
// node ID of the control return, so that the runtime can do
// control flow between partitions. In this case, there aren't
// any data outputs that also need to be returned.
write!(bb.terminator, " %ret_agg{}.ctrl_pos = insertvalue {} undef, i64 {}, 0\n ret {} %ret_agg{}.ctrl_pos\n",
id.idx(),
ret_ty_str,
id.idx(),
ret_ty_str,
id.idx()
)?;
} else if self.control_returns.len() == 1 {
// In the normal case, we return the struct containing just the
// data outputs.
write!(
bb.terminator,
" ret {} %ret_agg{}.{}\n",
ret_ty_str,
id.idx(),
self.data_outputs.len() - 1,
)?;
} else {
// If there are multiple control returns from this partition and
// there are data outputs, we add the control return node ID to
// the return aggregate.
write!(
bb.terminator,
" %ret_agg{}.ctrl_pos = insertvalue {} %ret_agg{}.{}, i64 {}, {}\n ret {} %ret_agg{}.ctrl_pos\n",
id.idx(),
ret_ty_str,
id.idx(),
self.data_outputs.len() - 1,
id.idx(),
self.data_outputs.len(),
ret_ty_str,
id.idx(),
)?;
}
}
Ok(())
}
/*
* Emit the LLVM value corresponding to a node. Optionally prefix with the
* LLVM type, which is required by textual LLVM IR in a few places.
* Optionally provide the node that will be using this emission. The user
* argument is ignored for all emitted node values except reduce nodes,
* which require it. We chose this interface because at the callsite of
* cpu_emit_use_of_node, it is always known whether the thing being emitted
* could (or should) possibly be a reduce node. If not, then passing None
* gives a nice early panic when it does turn out to be a reduce node,
* either because the developer misjudged or because there is a bug.
*/
fn cpu_emit_use_of_node<W: Write>(
&self,
id: NodeID,
user: Option<NodeID>,
emit_type: bool,
w: &mut W,
) -> std::fmt::Result {
// First, emit the type before the value (if applicable).
if emit_type {
write!(
w,
"{} ",
self.function.llvm_types[self.function.typing[id.idx()].idx()]
)?;
}
// Emitting the value can be surprisingly complicated, depending on what
// the node is. For example, partition arguments are emitted specially.
if let Some(input_idx) = self.data_inputs.iter().position(|inp_id| *inp_id == id) {
// If a use is in another partition, it needs to get passed to this
// partition's function as a parameter.
write!(w, "%part_arg.{}", input_idx)?;
} else {
match self.function.function.nodes[id.idx()] {
// Parameter nodes in this partition also represent parameters
// to this partition function.
Node::Parameter { index } => write!(w, "%func_arg.{}", index)?,
// Constants are pre-defined.
Node::Constant { id } => write!(w, "{}", self.function.llvm_constants[id.idx()])?,
Node::DynamicConstant { id } => {
write!(w, "{}", self.function.llvm_dynamic_constants[id.idx()])?
}
// Reduce nodes, as usual, are not nice to handle. We need to
// emit different LLVM depending on whether the user is inside
// or outside the reduce's corresponding fork-join nest. Inside,
// we emit as usual, since the user needs to use the phi node
// inside the reduction loop. Outside, we need to use the reduct
// use of the reduce node, so that we don't grab the reduction
// variable one loop iteration too early.
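// Sketch (register numbers hypothetical): given a reduce lowered to
// "%virt.21 = phi ...", a user inside the fork-join nest sees "%virt.21",
// while a user outside the nest sees the reduct input instead, e.g.
// "%virt.24", which holds the value produced by the final iteration.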
Node::Reduce {
control,
init: _,
reduct,
} => {
// Figure out the fork corresponding to the associated join.
let fork_id = if let Node::Join { control } =
self.function.function.nodes[control.idx()]
{
if let Type::Control(factors) =
&self.function.types[self.function.typing[control.idx()].idx()]
{
*factors.last().unwrap()
} else {
panic!()
}
} else {
panic!()
};
// Check if the basic block containing the user node is in
// the fork-join nest for this reduce node. We make the user
// node an optional argument as a debugging tool - if we
// exercise this code branch when generating the code for a
// node that absolutely should not be using the result of a
// reduce node, we would like to know!
if self.function.fork_join_nest[&self.function.bbs[user.expect("PANIC: cpu_emit_use_of_node was called on a reduce node, but no user node ID was given.").idx()]]
.contains(&fork_id)
{
// If the user is inside the fork-join nest, then emit
// the reduce node directly.
assert_eq!(self.partition_id, self.function.plan.partitions[id.idx()]);
write!(w, "%virt.{}", id.idx())?;
} else {
// If the user is outside the fork-join nest, then
// recursively emit on the reduction input to the reduce
// node. This is needed when there is a reduce chain.
assert_eq!(
self.partition_id,
self.function.plan.partitions[reduct.idx()]
);
self.cpu_emit_use_of_node(reduct, user, emit_type, w)?;
}
}
// Uses that are in this partition are just virtual registers.
// Clang is really annoying about numbering virtual registers,
// so to avoid that silliness we prepend all our virtual
// registers with a prefix indicating what kind of thing it is.
// For normal values, we use "virt" for virtual register.
_ => {
assert_eq!(self.partition_id, self.function.plan.partitions[id.idx()]);
write!(w, "%virt.{}", id.idx())?;
}
}
}
Ok(())
}
}