diff --git a/hpvm/test/epoch_dnn/host_code/_ref_impl.py b/hpvm/test/epoch_dnn/host_code/_ref_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..00f789af54c318998b9e2f9dbd442f84fdac489e
--- /dev/null
+++ b/hpvm/test/epoch_dnn/host_code/_ref_impl.py
@@ -0,0 +1,34 @@
+# Reference implementation of the post-processing in yolo.cpp, using lightnet.
+# Usage: python _ref_impl.py RAW_OUTPUT_FILE IMAGE OUT_IMAGE
+import lightnet as ln
+import torch
+from sys import argv
+from PIL import Image, ImageDraw
+
+# 12-class TinyYoloV2: 5 anchors x (12 classes + 5) = 85 output channels.
+model = ln.models.TinyYoloV2(12)
+transform = ln.data.transform.Compose([
+    # GetBoxes transformation generates bounding boxes from network output
+    ln.data.transform.GetDarknetBoxes(
+        conf_thresh=0.5,
+        network_stride=model.stride,
+        anchors=model.anchors
+    ),
+    # Filter transformation to filter the output boxes
+    ln.data.transform.NMS(
+        nms_thresh=0.5
+    ),
+])
+
+image_output_shape = [85, 19, 19]
+with open(argv[1]) as f:
+    data = [float(s) for s in f.read().split()]
+data = torch.tensor(data).reshape(-1, *image_output_shape)
+boxes = transform(data)
+
+image = Image.open(argv[2]).resize((640, 640))
+imagedraw = ImageDraw.Draw(image)
+for box in boxes:
+    _, x0, y0, x1, y1, score, label = box
+    imagedraw.rectangle((x0, y0, x1, y1), outline="red")
+image.save(argv[3])
diff --git a/hpvm/test/epoch_dnn/host_code/eigen_utils.h b/hpvm/test/epoch_dnn/host_code/eigen_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..2328640a5ef665d2695ee7b22320f0df858eb8d3
--- /dev/null
+++ b/hpvm/test/epoch_dnn/host_code/eigen_utils.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <unordered_set>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Index;
+using Eigen::Tensor;
+template <int N> using BTensor = Eigen::Tensor<bool, N>;
+template <int N> using FTensor = Eigen::Tensor<float, N>;
+template <int N> using ITensor = Eigen::Tensor<Index, N>;
+template <int N> using IArray = std::array<Index, N>;
+
+template <typename Nested, int ODim>
+using Reshape = Eigen::TensorReshapingOp<const IArray<ODim>, const Nested>;
+
+template <typename Nested, int ODim>
+using BcastReshape = Eigen::TensorBroadcastingOp<const IArray<ODim>,
+                                                 const Reshape<Nested, ODim>>;
+
+// Inserts N size-1 dimensions into `input` at the positions listed in `dims`,
+// like numpy's expand_dims / torch's unsqueeze.
+template <int N, typename Scalar, int Dim>
+Reshape<Tensor<Scalar, Dim>, Dim + N>
+unsqueeze(const Tensor<Scalar, Dim> &input, const IArray<N> &dims) {
+  IArray<Dim + N> reshapeTo;
+  std::unordered_set<Index> dimSet(dims.begin(), dims.end());
+  size_t fromDim = 0;
+  for (size_t dim = 0; dim < Dim + N; dim++) {
+    if (dimSet.find(dim) == dimSet.end())
+      reshapeTo[dim] = input.dimension(fromDim++);
+    else
+      reshapeTo[dim] = 1;
+  }
+  return input.reshape(reshapeTo);
+}
+
+// NumPy-style broadcast: left-pads `fromShape` with size-1 dims up to ODim,
+// then broadcasts every size-1 dim to the extent given in `toShape`.
+template <size_t IDim, size_t ODim, typename TensorOp>
+BcastReshape<TensorOp, ODim> broadcast(const TensorOp &iTensor,
+                                       const IArray<IDim> &fromShape,
+                                       const IArray<ODim> &toShape) {
+  static_assert(ODim >= IDim, "Cannot broadcast to fewer dimensions");
+  IArray<ODim> reshapeShape, broadcastN = toShape;
+  reshapeShape.fill(1);
+  size_t offset = ODim - IDim;
+  for (size_t i = offset; i < ODim; i++) {
+    Index size = fromShape[i - offset];
+    reshapeShape[i] = size;
+    if (size == 1)
+      broadcastN[i] = toShape[i];
+    else if (size == toShape[i])
+      broadcastN[i] = 1;
+    else
+      throw std::runtime_error("Shape mismatch at dimension " +
+                               std::to_string(i));
+  }
+  return iTensor.reshape(reshapeShape).broadcast(broadcastN);
+}
+
+template <size_t ODim, typename Scalar, int IDim>
+BcastReshape<Tensor<Scalar, IDim>, ODim>
+broadcast(const Tensor<Scalar, IDim> &iTensor, const IArray<ODim> &shape) {
+  return broadcast(iTensor, iTensor.dimensions(), shape);
+}
+
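+// Overload that first unsqueezes `iTensor` at `unsqueezeDim` and then
+// broadcasts to `shape`, like torch's tensor.unsqueeze(d).expand(shape).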
+template <size_t ODim, typename Scalar, int IDim>
+BcastReshape<Reshape<Tensor<Scalar, IDim>, IDim + 1>, ODim>
+broadcast(const Tensor<Scalar, IDim> &iTensor, const IArray<ODim> &shape,
+          Index unsqueezeDim) {
+  auto unsqueezed = unsqueeze<1>(iTensor, {unsqueezeDim});
+  return broadcast(unsqueezed, unsqueezed.dimensions(), shape);
+}
+
+inline FTensor<1> arange(Index from, Index to) {
+  // to - 1 because LinSpaced spans the closed interval [low, high].
+  Eigen::ArrayXf array = Eigen::ArrayXf::LinSpaced(to - from, from, to - 1);
+  return Eigen::TensorMap<FTensor<1>>(array.data(), to - from);
+}
+
+// Flattened x/y coordinate grids for an h x w image (cf. torch.meshgrid).
+inline std::tuple<FTensor<1>, FTensor<1>> meshgrid(Index h, Index w) {
+  IArray<1> reshapeTo({h * w});
+  FTensor<1> linx = broadcast<2>(arange(0, h), {h, w}, 1).reshape(reshapeTo);
+  FTensor<1> liny = broadcast<2>(arange(0, w), {h, w}, 0).reshape(reshapeTo);
+  return std::make_tuple(linx, liny);
+}
+
+// Numerically stable softmax along dimension `Dim` of a rank-4 tensor.
+template <int Dim> FTensor<4> softmax(const FTensor<4> &input) {
+  IArray<1> dimToReduce({Dim});
+  IArray<4> inputShape = input.dimensions();
+  FTensor<3> maxElems = input.maximum(dimToReduce);
+  FTensor<4> expInput = (input - broadcast(maxElems, inputShape, Dim)).exp();
+  FTensor<3> sumExp = expInput.sum(dimToReduce);
+  return expInput / broadcast(sumExp, inputShape, Dim);
+}
+
+// Packs the chips of `input` along AlongDim where `mask` is true into a
+// smaller tensor (boolean-mask indexing).
+template <int AlongDim, typename Scalar, int IDim>
+Tensor<Scalar, IDim> maskSelect(const Tensor<Scalar, IDim> &input,
+                                const BTensor<1> &mask) {
+  size_t nSelected = 0;
+  for (Index i = 0; i < mask.dimension(0); i++)
+    if (mask[i])
+      nSelected += 1;
+  IArray<IDim> retShape = input.dimensions();
+  retShape[AlongDim] = nSelected;
+  Tensor<Scalar, IDim> ret(retShape);
+  for (Index i = 0, j = 0; i < mask.dimension(0); i++)
+    if (mask[i]) {
+      ret.chip(j, AlongDim) = input.chip(i, AlongDim);
+      ++j;
+    }
+  return ret;
+}
+
+// Inverse of maskSelect: writes the chips of `values` into the positions of
+// `tensor` where `mask` is true.
+template <int AlongDim, typename Scalar, int IDim>
+Tensor<Scalar, IDim> &maskAssign(Tensor<Scalar, IDim> &tensor,
+                                 const BTensor<1> &mask,
+                                 const Tensor<Scalar, IDim> &values) {
+  for (Index i = 0, j = 0; i < mask.dimension(0); i++)
+    if (mask[i]) {
+      tensor.chip(i, AlongDim) = values.chip(j, AlongDim);
+      ++j;
+    }
+  return tensor;
+}
+
+template <int AlongDim, typename Scalar, int IDim>
+Tensor<Scalar, IDim> &maskAssign(Tensor<Scalar, IDim> &tensor,
+                                 const BTensor<1> &mask, const Scalar &value) {
+  for (Index i = 0; i < mask.dimension(0); i++)
+    if (mask[i])
+      tensor.chip(i, AlongDim).setConstant(value);
+  return tensor;
+}
+
+// Python-style slice along `dim`; negative indices count from the end, with
+// to == -1 meaning "through the last element".
+template <typename Scalar, int IDim>
+Tensor<Scalar, IDim> dimSelect(const Tensor<Scalar, IDim> &input, size_t dim,
+                               Index from, Index to) {
+  Index dimSize = input.dimension(dim);
+  if (from < 0)
+    from += dimSize;
+  if (to < 0)
+    to += dimSize + 1;
+  IArray<IDim> slice_starts, slice_extents = input.dimensions();
+  slice_starts.fill(0);
+  slice_starts[dim] = from;
+  slice_extents[dim] = to - from;
+  return input.slice(slice_starts, slice_extents);
+}
+
+// Concatenates N tensors along AlongDim; all other dimensions must match.
+template <Index AlongDim, int IDim, int N>
+FTensor<IDim> concat(const std::array<FTensor<IDim>, N> &tensors) {
+  size_t outputSize = 0;
+  for (const auto &tensor : tensors)
+    outputSize += tensor.dimension(AlongDim);
+  auto dim = tensors[0].dimensions();
+  dim[AlongDim] = outputSize;
+
+  FTensor<IDim> ret(dim);
+  IArray<IDim> slice_starts, slice_extents = dim;
+  slice_starts.fill(0);
+  for (const auto &tensor : tensors) {
+    size_t size = tensor.dimension(AlongDim);
+    slice_extents[AlongDim] = size;
+    ret.slice(slice_starts, slice_extents) = tensor;
+    slice_starts[AlongDim] += size;
+  }
+  return ret;
+}
+
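+// Builds the nElem x nElem pairwise tensor func(xs[i], xs[j]) by broadcasting
+// xs along both dimensions; the NMS code uses this for pairwise IoU and
+// same-class tests.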
+template <typename OutScalar, typename InScalar, typename FuncT>
+Tensor<OutScalar, 2> outerAction(const Tensor<InScalar, 1> &xs,
+                                 const FuncT &func) {
+  Index nElem = xs.size();
+  auto lhs = broadcast<2>(xs, {nElem, nElem}, 0),
+       rhs = broadcast<2>(xs, {nElem, nElem}, 1);
+  return func(lhs, rhs);
+}
+
+// Permutation write: ret[indices[i]] = values[i]. As used here, `indices`
+// must be a permutation of [0, size).
+template <typename Scalar>
+Tensor<Scalar, 1> scatter1D(const Tensor<Index, 1> &indices,
+                            const Tensor<Scalar, 1> &values) {
+  assert(indices.size() == values.size());
+  Tensor<Scalar, 1> ret(indices.size());
+  for (Index i = 0; i < indices.size(); i++)
+    ret[indices[i]] = values[i];
+  return ret;
+}
diff --git a/hpvm/test/epoch_dnn/host_code/host_bin b/hpvm/test/epoch_dnn/host_code/host_bin
new file mode 100755
index 0000000000000000000000000000000000000000..3ac575fbf559a533a1ae4dc8f2a5e461e8d143c1
Binary files /dev/null and b/hpvm/test/epoch_dnn/host_code/host_bin differ
diff --git a/hpvm/test/epoch_dnn/host_code/yolo.cpp b/hpvm/test/epoch_dnn/host_code/yolo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6f11daa787505fd4c7d98a741dfe7c676290950
--- /dev/null
+++ b/hpvm/test/epoch_dnn/host_code/yolo.cpp
@@ -0,0 +1,283 @@
+#include "eigen_utils.h"
+#include <algorithm>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace Eigen;
+using namespace std;
+
+template <typename T, int N>
+std::ostream &operator<<(std::ostream &os, const DSizes<T, N> &data) {
+  os << "[";
+  bool printedAny = false;
+  for (const auto &x : data) {
+    if (printedAny)
+      os << ", ";
+    else
+      printedAny = true;
+    os << x;
+  }
+  os << "]";
+  return os;
+}
+
+// Decodes raw TinyYoloV2 output into detection boxes; mirrors lightnet's
+// GetDarknetBoxes transform (see _ref_impl.py).
+class GetDarknetBoxes {
+public:
+  GetDarknetBoxes(float conf_thresh_, size_t network_stride_,
+                  const FTensor<2> &anchors_)
+      : conf_thresh(conf_thresh_), network_stride(network_stride_),
+        anchors(anchors_), num_anchors(anchors_.dimension(0)),
+        anchors_step(anchors_.dimension(1)) {}
+
+  FTensor<2> forward(const FTensor<4> &network_output);
+
+private:
+  float conf_thresh;
+  size_t network_stride;
+  FTensor<2> anchors;
+  Index num_anchors, anchors_step;
+};
+
+// Per-image non-maximum suppression; mirrors lightnet's NMS transform.
+class NMS {
+public:
+  NMS(float nms_thresh_) : nms_thresh(nms_thresh_) {}
+
+  FTensor<2> forward(const FTensor<2> &boxes, size_t nImages);
+
+private:
+  BTensor<1> nms(const FTensor<2> &boxes);
+
+  float nms_thresh;
+};
+
+FTensor<2> GetDarknetBoxes::forward(const FTensor<4> &network_output) {
+  const auto &dims = network_output.dimensions();
+
+  // Some shape-dependent constants
+  Index batch = dims[0], channels = dims[1], h = dims[2], w = dims[3];
+  assert(channels % this->num_anchors == 0);
+  Index num_features = channels / this->num_anchors,
+        num_classes = num_features - 5;
+  assert(num_features > 5);
+  Index num_boxes_per_im = this->num_anchors * h * w,
+        num_boxes = batch * num_boxes_per_im;
+
+  IArray<4> xs_shape{{batch, this->num_anchors, num_features, h * w}};
+  FTensor<4> xs = network_output.reshape(xs_shape);
+
+  // Decode network output into boxes and cls_scores.
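+  // Following YOLOv2: x_center = (sigmoid(tx) + lin_x) * stride and
+  // width = exp(tw) * anchor_w * stride (likewise for y_center / height).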
+  FTensor<1> lin_x, lin_y;
+  std::tie(lin_x, lin_y) = meshgrid(h, w);
+  FTensor<1> anchor_w = this->anchors.chip<1>(0);
+  FTensor<1> anchor_h = this->anchors.chip<1>(1);
+  IArray<3> shape3D{{batch, this->num_anchors, h * w}};
+  float st = this->network_stride;
+  FTensor<3> xssig = xs.chip<2>(0).sigmoid(), yssig = xs.chip<2>(1).sigmoid(),
+             wexp = xs.chip<2>(2).exp(), hexp = xs.chip<2>(3).exp();
+  FTensor<3> x_center = (xssig + broadcast(lin_x, shape3D)) * st,
+             y_center = (yssig + broadcast(lin_y, shape3D)) * st,
+             width = wexp * broadcast(anchor_w, shape3D, 1) * st,
+             height = hexp * broadcast(anchor_h, shape3D, 1) * st,
+             score = xs.chip<2>(4).sigmoid();
+  FTensor<3> x0 = x_center - width / 2.0f, x1 = x_center + width / 2.0f,
+             y0 = y_center - height / 2.0f, y1 = y_center + height / 2.0f;
+
+  FTensor<4> boxes(IArray<4>({batch, this->num_anchors, h * w, 4}));
+  boxes.chip<3>(0) = x0;
+  boxes.chip<3>(1) = y0;
+  boxes.chip<3>(2) = x1;
+  boxes.chip<3>(3) = y1;
+
+  // Get classes and scores of boxes.
+  FTensor<4> cls_scores = dimSelect(xs, 2, 5, -1);
+  FTensor<1> cls_max;
+  ITensor<1> cls_max_idx;
+  IArray<1> shape1D({num_boxes});
+  if (num_classes > 1) {
+    IArray<1> max_dim({2});
+    cls_scores = softmax<2>(cls_scores);
+    cls_max = (cls_scores.maximum(max_dim) * score).reshape(shape1D);
+    cls_max_idx = cls_scores.argmax(2).reshape(shape1D);
+  } else {
+    cls_max = score.reshape(shape1D);
+    cls_max_idx = ITensor<1>(shape1D).setZero();
+  }
+
+  // Flatten coords into boxes (2D).
+  FTensor<2> boxes2d = boxes.reshape(IArray<2>({num_boxes, 4}));
+  BTensor<1> score_thresh = cls_max > this->conf_thresh;
+  BTensor<0> has_thresh = score_thresh.any();
+  if (!has_thresh(0))
+    return FTensor<2>(0, 7).setZero();
+
+  // Mask-select boxes above conf_thresh.
+  boxes2d = maskSelect<0>(boxes2d, score_thresh);
+  FTensor<1> scores = maskSelect<0>(cls_max, score_thresh);
+  FTensor<1> idx = maskSelect<0>(cls_max_idx, score_thresh).cast<float>();
+
+  // Get batch numbers of the detections.
+  FTensor<1> batch_num =
+      broadcast<2>(arange(1, batch + 1), {batch, num_boxes_per_im}, 1)
+          .reshape(IArray<1>({num_boxes}));
+  batch_num = maskSelect<0>(batch_num, score_thresh) - 1.0f;
+  // Each output row is [batch#, x0, y0, x1, y1, score, class].
+  return concat<1, 2, 4>({unsqueeze<1>(batch_num, {1}), boxes2d,
+                          unsqueeze<1>(scores, {1}), unsqueeze<1>(idx, {1})});
+}
+
+FTensor<2> NMS::forward(const FTensor<2> &boxes, size_t nImages) {
+  if (boxes.dimension(0) == 0)
+    return boxes;
+  auto batchIdx = boxes.chip<1>(0).cast<size_t>();
+  BTensor<1> keep(boxes.dimension(0));
+  // Run NMS independently on each image's boxes.
+  for (size_t i = 0; i < nImages; i++) {
+    auto mask = batchIdx == i;
+    maskAssign<0>(keep, mask, this->nms(maskSelect<0>(boxes, mask)));
+  }
+  return maskSelect<0>(boxes, keep);
+}
+
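+// Sorts the rows of `input` by column `col`, recording each sorted row's
+// original row index in `order` so results can be scattered back later.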
+FTensor<2> sortByCol(const FTensor<2> &input, Index col, bool descending,
+                     ITensor<1> &order) {
+  using TupleTy = Eigen::Tuple<Index, float>;
+  Tensor<TupleTy, 1> sortCol = input.chip<1>(col).index_tuples();
+  TupleTy *sortValPtr = sortCol.data();
+  // Compare the values (.second), not the pre-sort indices (.first).
+  auto cmp = [descending](const TupleTy &lhs, const TupleTy &rhs) {
+    float lhs_ = lhs.second, rhs_ = rhs.second;
+    return descending ? lhs_ > rhs_ : lhs_ < rhs_;
+  };
+  std::sort(sortValPtr, sortValPtr + input.dimension(0), cmp);
+  FTensor<2> sorted(input.dimensions());
+  for (Index i = 0; i < input.dimension(0); i++) {
+    order[i] = sortCol[i].first;
+    sorted.chip<0>(i) = input.chip<0>(sortCol[i].first);
+  }
+  return sorted;
+}
+
+// Strictly upper triangular part of a boolean matrix (diagonal zeroed).
+BTensor<2> triu1(BTensor<2> &xs) {
+  using MatTy = Matrix<bool, Dynamic, Dynamic>;
+  Index m = xs.dimension(0), n = xs.dimension(1);
+  auto inMat = Eigen::Map<MatTy>(xs.data(), m, n);
+  MatTy outMat = inMat.triangularView<Eigen::StrictlyUpper>();
+  return Eigen::TensorMap<BTensor<2>>(outMat.data(), {m, n});
+}
+
+BTensor<1> NMS::nms(const FTensor<2> &boxes) {
+  // Sort by descending score.
+  Index nBoxes = boxes.dimension(0);
+  ITensor<1> order(nBoxes);
+  FTensor<2> sortedBoxes = sortByCol(boxes, 5, true, order);
+  FTensor<1> x1 = sortedBoxes.chip<1>(1), y1 = sortedBoxes.chip<1>(2),
+             x2 = sortedBoxes.chip<1>(3), y2 = sortedBoxes.chip<1>(4);
+  ITensor<1> classes = sortedBoxes.chip<1>(6).cast<Index>();
+
+  // Compute the pairwise intersection extents dx and dy (these matrices
+  // contain every pair twice).
+  using F2Ref = const FTensor<2> &;
+  auto cwiseMin = [](F2Ref lhs, F2Ref rhs) { return lhs.cwiseMin(rhs); };
+  auto cwiseMax = [](F2Ref lhs, F2Ref rhs) { return lhs.cwiseMax(rhs); };
+  FTensor<2> dx2 = outerAction<float>(x2, cwiseMin),
+             dx1 = outerAction<float>(x1, cwiseMax),
+             dy2 = outerAction<float>(y2, cwiseMin),
+             dy1 = outerAction<float>(y1, cwiseMax);
+  FTensor<2> dx = (dx2 - dx1).cwiseMax(0.0f), dy = (dy2 - dy1).cwiseMax(0.0f);
+
+  // Compute IoU.
+  auto sum = [](F2Ref lhs, F2Ref rhs) { return lhs + rhs; };
+  FTensor<1> areas = (x2 - x1) * (y2 - y1);
+  FTensor<2> intersections = dx * dy,
+             unions = outerAction<float>(areas, sum) - intersections,
+             ious = intersections / unions;
+
+  // Filter on IoU, and only let boxes of the same class suppress each other
+  // (class-wise NMS).
+  BTensor<2> conflicting = ious > this->nms_thresh;
+  using I2Ref = const ITensor<2> &;
+  auto equal = [](I2Ref lhs, I2Ref rhs) { return lhs == rhs; };
+  BTensor<2> same_class = outerAction<bool>(classes, equal);
+  conflicting = triu1(conflicting) && same_class;
+
+  BTensor<1> keep(nBoxes), suppress(nBoxes);
+  keep.setConstant(false);
+  suppress.setConstant(false);
+  // Greedily keep the highest-scoring box and suppress everything that
+  // conflicts with a kept box.
+  for (Index i = 0; i < nBoxes; i++) {
+    if (suppress[i])
+      continue;
+    keep[i] = true;
+    maskAssign<0>(suppress, conflicting.chip<0>(i), true);
+  }
+  // Map keep decisions back to the caller's (unsorted) box order.
+  return scatter1D(order, keep);
+}
+
+// Per-image raw output shape: 85 channels (5 anchors x (12 classes + 5)) on a
+// 19x19 grid.
+const size_t ImageOutputShape[3] = {85, 19, 19};
+
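+// Parses a raw dump of whitespace-separated floats (e.g. the output.dimg
+// written by nvdla_runtime --rawdump) into an (N, C, H, W) tensor, inferring
+// the batch size N from the element count.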
std::string("./nvdla_runtime --image ") + argv[1] + + // " --loadable hpvm-mod.nvdla --rawdump"; + // std::cout << "Running NVDLA model as \"" << cmd << "\"\n"; + // std::system(cmd.c_str()); + FTensor<4> inputs = loadTensor("output.dimg") * std::stof(argv[2]); + std::cout << "Loaded input of shape " << inputs.dimensions() << "\n"; + auto tinyYoloV2Anchors = FTensor<2>(5, 2); + // This should match that defined in torch_dnn/yolo/model.py + tinyYoloV2Anchors.setValues({{1.08, 1.19}, + {3.42, 4.41}, + {6.63, 11.38}, + {9.42, 5.11}, + {16.62, 10.52}}); + GetDarknetBoxes getBoxes(0.5, 32, tinyYoloV2Anchors); + NMS nms(0.5); + FTensor<2> ret1 = getBoxes.forward(inputs); + FTensor<2> ret2 = nms.forward(ret1, inputs.dimension(0)); + cout << ret2 << "\n"; + return 0; +}