diff --git a/hpvm/test/epoch_dnn/host_code/_ref_impl.py b/hpvm/test/epoch_dnn/host_code/_ref_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..00f789af54c318998b9e2f9dbd442f84fdac489e
--- /dev/null
+++ b/hpvm/test/epoch_dnn/host_code/_ref_impl.py
@@ -0,0 +1,35 @@
+import lightnet as ln
+import torch
+from sys import argv
+from PIL import Image, ImageDraw
+
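+# Reference implementation of the post-processing in lightnet; the C++ port
+# lives in yolo.cpp. TinyYoloV2 takes the number of classes (12 here).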
+model = ln.models.TinyYoloV2(12)
+transform = ln.data.transform.Compose([
+    # GetBoxes transformation generates bounding boxes from network output
+    ln.data.transform.GetDarknetBoxes(
+        conf_thresh=0.5,
+        network_stride=model.stride,
+        anchors=model.anchors
+    ),
+    # Filter transformation to filter the output boxes
+    ln.data.transform.NMS(
+        nms_thresh=0.5
+    ),
+])
+
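+# Each image yields 85x19x19 floats: 5 anchors x (5 box features + 12
+# classes) channels on a 19x19 grid.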
+image_output_shape = [85, 19, 19]
+with open(argv[1]) as f:
+    data = [float(s) for s in f.read().split()]
+data = torch.tensor(data).reshape(-1, *image_output_shape)
+boxes = transform(data)
+
+image = Image.open(argv[2]).resize((640, 640))
+imagedraw = ImageDraw.Draw(image)
+for box in boxes:
+    _, x0, y0, x1, y1, score, label = box
+    imagedraw.rectangle((x0, y0, x1, y1), outline="red")
+image.save(argv[3])
diff --git a/hpvm/test/epoch_dnn/host_code/eigen_utils.h b/hpvm/test/epoch_dnn/host_code/eigen_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..2328640a5ef665d2695ee7b22320f0df858eb8d3
--- /dev/null
+++ b/hpvm/test/epoch_dnn/host_code/eigen_utils.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+#include <utility>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Index;
+using Eigen::Tensor;
+template <int N> using BTensor = Eigen::Tensor<bool, N>;
+template <int N> using FTensor = Eigen::Tensor<float, N>;
+template <int N> using ITensor = Eigen::Tensor<Index, N>;
+template <int N> using IArray = std::array<Index, N>;
+
+template <typename Nested, int ODim>
+using Reshape = Eigen::TensorReshapingOp<const IArray<ODim>, const Nested>;
+
+template <typename Nested, int ODim>
+using BcastReshape = Eigen::TensorBroadcastingOp<const IArray<ODim>,
+                                                 const Reshape<Nested, ODim>>;
+
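+// Inserts size-1 dimensions at the positions in `dims`, like torch.unsqueeze
+// applied once per entry.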
+template <int N, typename Scalar, int Dim>
+Reshape<Tensor<Scalar, Dim>, Dim + N>
+unsqueeze(const Tensor<Scalar, Dim> &input, const IArray<N> &dims) {
+  IArray<Dim + N> reshapeTo;
+  std::unordered_set<Index> dimSet(dims.begin(), dims.end());
+  size_t fromDim = 0;
+  for (size_t dim = 0; dim < Dim + N; dim++) {
+    if (dimSet.find(dim) == dimSet.end())
+      reshapeTo[dim] = input.dimension(fromDim++);
+    else
+      reshapeTo[dim] = 1;
+  }
+  return input.reshape(reshapeTo);
+}
+
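+// Numpy-style broadcast of `iTensor` (shape `fromShape`) to `toShape`:
+// dimensions are right-aligned, and size-1 dimensions are repeated.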
+template <size_t IDim, size_t ODim, typename TensorOp>
+BcastReshape<TensorOp, ODim> broadcast(const TensorOp &iTensor,
+                                       const IArray<IDim> &fromShape,
+                                       const IArray<ODim> &toShape) {
+  static_assert(ODim >= IDim, "cannot broadcast to fewer dimensions");
+  IArray<ODim> reshapeShape, broadcastN = toShape;
+  reshapeShape.fill(1);
+  size_t offset = ODim - IDim;
+  for (size_t i = offset; i < ODim; i++) {
+    Index size = fromShape[i - offset];
+    reshapeShape[i] = size;
+    if (size == 1)
+      broadcastN[i] = toShape[i];
+    else if (size == toShape[i])
+      broadcastN[i] = 1;
+    else
+      throw std::runtime_error("Shape mismatch at dimension " +
+                               std::to_string(i));
+  }
+  return iTensor.reshape(reshapeShape).broadcast(broadcastN);
+}
+
+template <size_t ODim, typename Scalar, int IDim>
+BcastReshape<Tensor<Scalar, IDim>, ODim>
+broadcast(const Tensor<Scalar, IDim> &iTensor, const IArray<ODim> &shape) {
+  return broadcast(iTensor, iTensor.dimensions(), shape);
+}
+
+template <size_t ODim, typename Scalar, int IDim>
+BcastReshape<Reshape<Tensor<Scalar, IDim>, IDim + 1>, ODim>
+broadcast(const Tensor<Scalar, IDim> &iTensor, const IArray<ODim> &shape,
+          Index unsqueezeDim) {
+  auto unsqueezed = unsqueeze<1>(iTensor, {unsqueezeDim});
+  return broadcast(unsqueezed, unsqueezed.dimensions(), shape);
+}
+
+FTensor<1> arange(Index from, Index to) {
+  // Pass to - 1 because LinSpaced generates the closed interval [low, high].
+  Eigen::ArrayXf array = Eigen::ArrayXf::LinSpaced(to - from, from, to - 1);
+  return Eigen::TensorMap<FTensor<1>>(array.data(), to - from);
+}
+
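+// Returns flattened h x w grids of the first- and second-dimension indices,
+// analogous to torch.meshgrid followed by a flatten.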
+std::tuple<FTensor<1>, FTensor<1>> meshgrid(Index h, Index w) {
+  IArray<1> reshapeTo({h * w});
+  FTensor<1> linx = broadcast<2>(arange(0, h), {h, w}, 1).reshape(reshapeTo);
+  FTensor<1> liny = broadcast<2>(arange(0, w), {h, w}, 0).reshape(reshapeTo);
+  return std::make_pair(linx, liny);
+}
+
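+// Softmax over dimension `Dim`, computed with the max-subtraction trick for
+// numerical stability.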
+template <int Dim> FTensor<4> softmax(const FTensor<4> &input) {
+  IArray<1> dimToReduce({Dim});
+  IArray<4> inputShape = input.dimensions();
+  FTensor<3> maxElems = input.maximum(dimToReduce);
+  FTensor<4> expInput = (input - broadcast(maxElems, inputShape, Dim)).exp();
+  FTensor<3> sumExp = expInput.sum(dimToReduce);
+  return expInput / broadcast(sumExp, inputShape, Dim);
+}
+
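+// Keeps only the entries of `input` along `AlongDim` where `mask` is true,
+// like indexing a PyTorch tensor with a boolean mask.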
+template <int AlongDim, typename Scalar, int IDim>
+Tensor<Scalar, IDim> maskSelect(const Tensor<Scalar, IDim> &input,
+                                const BTensor<1> &mask) {
+  size_t nSelected = 0;
+  for (Index i = 0; i < mask.dimension(0); i++)
+    if (mask[i])
+      nSelected += 1;
+  IArray<IDim> retShape = input.dimensions();
+  retShape[AlongDim] = nSelected;
+  Tensor<Scalar, IDim> ret(retShape);
+  for (Index i = 0, j = 0; i < mask.dimension(0); i++)
+    if (mask[i]) {
+      ret.chip(j, AlongDim) = input.chip(i, AlongDim);
+      ++j;
+    }
+  return ret;
+}
+
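+// Writes `values` (packed) into the positions of `tensor` along `AlongDim`
+// where `mask` is true; the scalar overload writes a constant instead.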
+template <int AlongDim, typename Scalar, int IDim>
+Tensor<Scalar, IDim> &maskAssign(Tensor<Scalar, IDim> &tensor,
+                                 const BTensor<1> &mask,
+                                 const Tensor<Scalar, IDim> &values) {
+  for (Index i = 0, j = 0; i < mask.dimension(0); i++)
+    if (mask[i]) {
+      tensor.chip(i, AlongDim) = values.chip(j, AlongDim);
+      ++j;
+    }
+  return tensor;
+}
+
+template <int AlongDim, typename Scalar, int IDim>
+Tensor<Scalar, IDim> &maskAssign(Tensor<Scalar, IDim> &tensor,
+                                 const BTensor<1> &mask, const Scalar &value) {
+  for (Index i = 0; i < mask.dimension(0); i++)
+    if (mask[i])
+      tensor.chip(i, AlongDim).setConstant(value);
+  return tensor;
+}
+
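+// Slices [from, to) along `dim`; negative indices count from the end, and
+// to == -1 selects through the last element (unlike Python slicing).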
+template <typename Scalar, int IDim>
+Tensor<Scalar, IDim> dimSelect(const Tensor<Scalar, IDim> &input, size_t dim,
+                               Index from, Index to) {
+  Index dimSize = input.dimension(dim);
+  if (from < 0)
+    from += dimSize;
+  if (to < 0)
+    to += dimSize + 1;
+  IArray<IDim> slice_starts, slice_extents = input.dimensions();
+  slice_starts.fill(0);
+  slice_starts[dim] = from;
+  slice_extents[dim] = to - from;
+  return input.slice(slice_starts, slice_extents);
+}
+
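+// Concatenates N tensors along AlongDim; all other dimensions must agree.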
+template <Index AlongDim, int IDim, int N>
+FTensor<IDim> concat(const std::array<FTensor<IDim>, N> &tensors) {
+  size_t outputSize = 0;
+  for (const auto &tensor : tensors)
+    outputSize += tensor.dimension(AlongDim);
+  auto dim = tensors[0].dimensions();
+  dim[AlongDim] = outputSize;
+
+  FTensor<IDim> ret(dim);
+  IArray<IDim> slice_starts, slice_extents = dim;
+  slice_starts.fill(0);
+  for (const auto &tensor : tensors) {
+    size_t size = tensor.dimension(AlongDim);
+    slice_extents[AlongDim] = size;
+    ret.slice(slice_starts, slice_extents) = tensor;
+    slice_starts[AlongDim] += size;
+  }
+  return ret;
+}
+
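+// Builds the nElem x nElem tensor func(xs_i, xs_j) over all pairs, by
+// broadcasting `xs` along both dimensions.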
+template <typename OutScalar, typename InScalar, typename FuncT>
+Tensor<OutScalar, 2> outerAction(const Tensor<InScalar, 1> &xs,
+                                 const FuncT &func) {
+  Index nElem = xs.size();
+  auto lhs = broadcast<2>(xs, {nElem, nElem}, 0),
+       rhs = broadcast<2>(xs, {nElem, nElem}, 1);
+  return func(lhs, rhs);
+}
+
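+// Inverse permutation write: ret[indices[i]] = values[i]. `indices` must be
+// a permutation of [0, size).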
+template <typename Scalar>
+Tensor<Scalar, 1> scatter1D(const Tensor<Index, 1> &indices,
+                            const Tensor<Scalar, 1> &values) {
+  assert(indices.size() == values.size());
+  Tensor<Scalar, 1> ret(indices.size());
+  for (Index i = 0; i < indices.size(); i++)
+    ret[indices[i]] = values[i];
+  return ret;
+}
diff --git a/hpvm/test/epoch_dnn/host_code/host_bin b/hpvm/test/epoch_dnn/host_code/host_bin
new file mode 100755
index 0000000000000000000000000000000000000000..3ac575fbf559a533a1ae4dc8f2a5e461e8d143c1
Binary files /dev/null and b/hpvm/test/epoch_dnn/host_code/host_bin differ
diff --git a/hpvm/test/epoch_dnn/host_code/yolo.cpp b/hpvm/test/epoch_dnn/host_code/yolo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6f11daa787505fd4c7d98a741dfe7c676290950
--- /dev/null
+++ b/hpvm/test/epoch_dnn/host_code/yolo.cpp
@@ -0,0 +1,278 @@
+#include "eigen_utils.h"
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <cstdlib>
+
+using namespace Eigen;
+using namespace std;
+
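+// Prints a tensor shape such as [2, 85, 19, 19].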
+template <typename T, int N>
+std::ostream &operator<<(std::ostream &os, const DSizes<T, N> &data) {
+  os << "[";
+  bool first = true;
+  for (const auto &x : data) {
+    if (!first)
+      os << ", ";
+    first = false;
+    os << x;
+  }
+  os << "]";
+  return os;
+}
+
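+// Decodes raw network output into (batch, x0, y0, x1, y1, score, class)
+// rows; a port of ln.data.transform.GetDarknetBoxes from _ref_impl.py.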
+class GetDarknetBoxes {
+public:
+  GetDarknetBoxes(float conf_thresh_, size_t network_stride_,
+                  const FTensor<2> &anchors_)
+      : conf_thresh(conf_thresh_), network_stride(network_stride_),
+        anchors(anchors_), num_anchors(anchors_.dimension(0)),
+        anchors_step(anchors_.dimension(1)) {}
+
+  FTensor<2> forward(const FTensor<4> &network_output);
+
+private:
+  float conf_thresh;
+  size_t network_stride;
+  FTensor<2> anchors;
+  Index num_anchors, anchors_step;
+};
+
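+// Per-image, class-aware non-maximum suppression, mirroring
+// ln.data.transform.NMS from _ref_impl.py.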
+class NMS {
+public:
+  NMS(float nms_thresh_) : nms_thresh(nms_thresh_) {}
+
+  FTensor<2> forward(const FTensor<2> &boxes, size_t nImages);
+
+private:
+  BTensor<1> nms(const FTensor<2> &boxes);
+
+  float nms_thresh;
+};
+
+FTensor<2> GetDarknetBoxes::forward(const FTensor<4> &network_output) {
+  const auto &dims = network_output.dimensions();
+
+  // Some shape-dependent constants
+  Index batch = dims[0], channels = dims[1], h = dims[2], w = dims[3];
+  assert(channels % this->num_anchors == 0);
+  Index num_features = channels / this->num_anchors,
+        num_classes = num_features - 5;
+  assert(num_features > 5);
+  Index num_boxes_per_im = this->num_anchors * h * w,
+        num_boxes = batch * num_boxes_per_im;
+
+  IArray<4> xs_shape{{batch, this->num_anchors, num_features, h * w}};
+  FTensor<4> xs = network_output.reshape(xs_shape);
+
+  // Decode network output into boxes and cls_scores
+  FTensor<1> lin_x, lin_y;
+  std::tie(lin_x, lin_y) = meshgrid(h, w);
+  FTensor<1> anchor_w = this->anchors.chip<1>(0);
+  FTensor<1> anchor_h = this->anchors.chip<1>(1);
+  IArray<3> shape3D{{batch, this->num_anchors, h * w}};
+  float st = this->network_stride;
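+  // Darknet decode: center = (sigmoid(t_xy) + grid offset) * stride,
+  //                  size = anchor * exp(t_wh) * stride.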
+  FTensor<3> xssig = xs.chip<2>(0).sigmoid(), yssig = xs.chip<2>(1).sigmoid(),
+             wexp = xs.chip<2>(2).exp(), hexp = xs.chip<2>(3).exp();
+  FTensor<3> x_center = (xssig + broadcast(lin_x, shape3D)) * st,
+             y_center = (yssig + broadcast(lin_y, shape3D)) * st,
+             width = wexp * broadcast(anchor_w, shape3D, 1) * st,
+             height = hexp * broadcast(anchor_h, shape3D, 1) * st,
+             score = xs.chip<2>(4).sigmoid();
+  FTensor<3> x0 = x_center - width / 2.0f, x1 = x_center + width / 2.0f,
+             y0 = y_center - height / 2.0f, y1 = y_center + height / 2.0f;
+
+  FTensor<4> boxes(IArray<4>({batch, this->num_anchors, h * w, 4}));
+  boxes.chip<3>(0) = x0;
+  boxes.chip<3>(1) = y0;
+  boxes.chip<3>(2) = x1;
+  boxes.chip<3>(3) = y1;
+
+  // Get classes and scores of boxes
+  FTensor<4> cls_scores = dimSelect(xs, 2, 5, -1);
+  FTensor<1> cls_max;
+  ITensor<1> cls_max_idx;
+  IArray<1> shape1D({num_boxes});
+  if (num_classes > 1) {
+    IArray<1> max_dim({2});
+    cls_scores = softmax<2>(cls_scores);
+    cls_max = (cls_scores.maximum(max_dim) * score).reshape(shape1D);
+    cls_max_idx = cls_scores.argmax(2).reshape(shape1D);
+  } else {
+    cls_max = score.reshape(shape1D);
+    cls_max_idx = ITensor<1>(shape1D).setZero();
+  }
+
+  // Flatten coords into boxes (2D).
+  FTensor<2> boxes2d = boxes.reshape(IArray<2>({num_boxes, 4}));
+  BTensor<1> score_thresh = cls_max > this->conf_thresh;
+  BTensor<0> has_thresh = score_thresh.any();
+  if (!has_thresh())
+    return FTensor<2>(0, 7).setZero();
+
+  // Mask select boxes > conf_thresh
+  boxes2d = maskSelect<0>(boxes2d, score_thresh);
+  FTensor<1> scores = maskSelect<0>(cls_max, score_thresh);
+  FTensor<1> idx = maskSelect<0>(cls_max_idx, score_thresh).cast<float>();
+
+  // Get batch numbers of the detections
+  FTensor<1> batch_num =
+      broadcast<2>(arange(1, batch + 1), {batch, num_boxes_per_im}, 1)
+          .reshape(IArray<1>({num_boxes}));
+  batch_num = maskSelect<0>(batch_num, score_thresh) - 1.0f;
+  return concat<1, 2, 4>({unsqueeze<1>(batch_num, {1}), boxes2d,
+                          unsqueeze<1>(scores, {1}), unsqueeze<1>(idx, {1})});
+}
+
+FTensor<2> NMS::forward(const FTensor<2> &boxes, size_t nImages) {
+  if (boxes.dimension(0) == 0)
+    return boxes;
+  auto batchIdx = boxes.chip<1>(0).cast<size_t>();
+  BTensor<1> keep(boxes.dimension(0));
+  for (size_t i = 0; i < nImages; i++) {
+    auto mask = batchIdx == i;
+    maskAssign<0>(keep, mask, this->nms(maskSelect<0>(boxes, mask)));
+  }
+  return maskSelect<0>(boxes, keep);
+}
+
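+// Sorts the rows of `input` by column `col`, writing the original row
+// indices of the sorted rows into `order`.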
+FTensor<2> sortByCol(const FTensor<2> &input, Index col, bool descending,
+                     ITensor<1> &order) {
+  using TupleTy = Eigen::Tuple<Index, float>;
+  Tensor<TupleTy, 1> sortCol = input.chip<1>(col).index_tuples();
+  TupleTy *sortValPtr = sortCol.data();
+  auto cmp = [descending](const TupleTy &lhs, const TupleTy &rhs) {
+    // Compare column values (.second); .first holds the original row index.
+    float lhs_ = lhs.second, rhs_ = rhs.second;
+    return descending ? lhs_ > rhs_ : lhs_ < rhs_;
+  };
+  std::sort(sortValPtr, sortValPtr + input.dimension(0), cmp);
+  FTensor<2> sorted(input.dimensions());
+  for (Index i = 0; i < input.dimension(0); i++) {
+    order[i] = sortCol[i].first;
+    sorted.chip<0>(i) = input.chip<0>(sortCol[i].first);
+  }
+  return sorted;
+}
+
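+// Strictly-upper-triangular part of a boolean matrix (diagonal zeroed),
+// like torch.triu(x, diagonal=1).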
+BTensor<2> triu1(BTensor<2> &xs) {
+  using MatTy = Matrix<bool, Dynamic, Dynamic>;
+  Index m = xs.dimension(0), n = xs.dimension(1);
+  auto inMat = Eigen::Map<MatTy>(xs.data(), m, n);
+  MatTy outMat = inMat.triangularView<Eigen::StrictlyUpper>();
+  return Eigen::TensorMap<BTensor<2>>(outMat.data(), {m, n});
+}
+
+BTensor<1> NMS::nms(const FTensor<2> &boxes) {
+  // Sort by descending score
+  Index nBoxes = boxes.dimension(0);
+  ITensor<1> order(nBoxes);
+  FTensor<2> sortedBoxes = sortByCol(boxes, 5, true, order);
+  FTensor<1> x1 = sortedBoxes.chip<1>(1), y1 = sortedBoxes.chip<1>(2),
+             x2 = sortedBoxes.chip<1>(3), y2 = sortedBoxes.chip<1>(4),
+             scores = sortedBoxes.chip<1>(5);
+  ITensor<1> classes = sortedBoxes.chip<1>(6).cast<Index>();
+
+  // Compute the overlap extents dx and dy between each pair of boxes
+  // (these matrices contain every pair twice).
+  using F2Ref = const FTensor<2> &;
+  auto cwiseMin = [](F2Ref lhs, F2Ref rhs) { return lhs.cwiseMin(rhs); };
+  auto cwiseMax = [](F2Ref lhs, F2Ref rhs) { return lhs.cwiseMax(rhs); };
+  FTensor<2> dx2 = outerAction<float>(x2, cwiseMin),
+             dx1 = outerAction<float>(x1, cwiseMax),
+             dy2 = outerAction<float>(y2, cwiseMin),
+             dy1 = outerAction<float>(y1, cwiseMax);
+  FTensor<2> dx = (dx2 - dx1).cwiseMax(0.0f), dy = (dy2 - dy1).cwiseMax(0.0f);
+
+  // Compute IoU
+  auto sum = [](F2Ref lhs, F2Ref rhs) { return lhs + rhs; };
+  FTensor<1> areas = (x2 - x1) * (y2 - y1);
+  FTensor<2> intersections = dx * dy,
+             unions = outerAction<float>(areas, sum) - intersections,
+             ious = intersections / unions;
+
+  // Filter on IoU and class: only boxes of the same class can suppress
+  // each other (the class-aware NMS path is always enabled here).
+  BTensor<2> conflicting = ious > this->nms_thresh;
+  using I2Ref = const ITensor<2> &;
+  auto equal = [](I2Ref lhs, I2Ref rhs) { return lhs == rhs; };
+  BTensor<2> same_class = outerAction<bool>(classes, equal);
+  conflicting = triu1(conflicting) && same_class;
+
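+  // Greedy suppression in descending score order; the keep flags are then
+  // scattered back to the caller's original row order.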
+  BTensor<1> keep(nBoxes), suppress(nBoxes);
+  keep.setConstant(false);
+  suppress.setConstant(false);
+  for (Index i = 0; i < nBoxes; i++) {
+    if (suppress[i])
+      continue;
+    keep[i] = true;
+    maskAssign<0>(suppress, conflicting.chip<0>(i), true);
+  }
+  return scatter1D(order, keep);
+}
+
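+// Per-image network output shape (C, H, W); must match image_output_shape
+// in _ref_impl.py.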
+const size_t ImageOutputShape[3] = {85, 19, 19};
+
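+// Reads whitespace-separated floats from `filename` into an NCHW tensor,
+// one or more images at a time.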
+FTensor<4> loadTensor(const std::string &filename) {
+  ifstream fin(filename);
+  if (!fin.good())
+    throw std::runtime_error("Cannot open file " + filename);
+  Index c = ImageOutputShape[0], h = ImageOutputShape[1],
+        w = ImageOutputShape[2], sizePerImage = c * h * w;
+  std::vector<float> data;
+  float x;
+  while (fin >> x)
+    data.push_back(x);
+  Index nElem = data.size();
+  FTensor<1> flatbuffer({nElem});
+  for (Index i = 0; i < nElem; i++)
+    flatbuffer[i] = data[i];
+
+  if (nElem % sizePerImage)
+    throw std::runtime_error("Number of elements in file (" + to_string(nElem) +
+                             ") is not a multiple of " +
+                             to_string(sizePerImage));
+  return flatbuffer.reshape(IArray<4>({nElem / sizePerImage, c, h, w}));
+}
+
+int main(int argc, char *argv[]) {
+  if (argc != 3) {
+    std::cerr << "Usage: " << argv[0] << " IMAGE_PATH SCALE\n";
+    return 1;
+  }
+  // "output.dimg" is produced by running the NVDLA model first, e.g.:
+  //   ./nvdla_runtime --image IMAGE_PATH --loadable hpvm-mod.nvdla --rawdump
+  FTensor<4> inputs = loadTensor("output.dimg") * std::stof(argv[2]);
+  std::cout << "Loaded input of shape " << inputs.dimensions() << "\n";
+  auto tinyYoloV2Anchors = FTensor<2>(5, 2);
+  // This should match that defined in torch_dnn/yolo/model.py
+  tinyYoloV2Anchors.setValues({{1.08f, 1.19f},
+                               {3.42f, 4.41f},
+                               {6.63f, 11.38f},
+                               {9.42f, 5.11f},
+                               {16.62f, 10.52f}});
+  GetDarknetBoxes getBoxes(0.5, 32, tinyYoloV2Anchors);
+  NMS nms(0.5);
+  FTensor<2> ret1 = getBoxes.forward(inputs);
+  FTensor<2> ret2 = nms.forward(ret1, inputs.dimension(0));
+  cout << ret2 << "\n";
+  return 0;
+}