diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..2da9be9ac4ecfec68edac79f118cfe0be5e99862
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "examples/stand_count/pytorch-ssd"]
+	path = examples/stand_count/pytorch-ssd
+	url = https://github.com/qfgaohao/pytorch-ssd.git
diff --git a/examples/stand_count/metric.py b/examples/stand_count/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..101b0792dc29de78b66afe877b56ed91212b9362
--- /dev/null
+++ b/examples/stand_count/metric.py
@@ -0,0 +1,44 @@
+import numpy as np
+import torch
+from mean_average_precision import MeanAveragePrecision2d
+
+
+def calculate_mAP(gt_boxes, pred_boxes, pred_scores, style="pascal"):
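+    """Compute mean average precision (mAP) for single-class detection.
+
+    `gt_boxes`, `pred_boxes`, and `pred_scores` are per-image sequences of
+    tensors. "pascal" evaluates at IoU 0.5 with 11-point recall interpolation;
+    "coco" averages over IoU thresholds 0.50:0.95 with a soft matching policy.
+    """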
+    if style == "pascal":
+        settings = {
+            "iou_thresholds": 0.5,
+            "recall_thresholds": np.arange(0.0, 1.1, 0.1),
+        }
+    elif style == "coco":
+        settings = {
+            "iou_thresholds": np.arange(0.5, 1.0, 0.05),
+            "recall_thresholds": np.arange(0.0, 1.01, 0.01),
+            "mpolicy": "soft",
+        }
+    else:
+        raise ValueError(f"Unrecognized style {style}")
+
+    map_calc = MeanAveragePrecision2d(num_classes=1)
+    torch2np = lambda x: x.detach().cpu().numpy()
+    for gt_boxes_, pred_boxes_, pred_scores_ in zip(gt_boxes, pred_boxes, pred_scores):
+        # Both ground truth and predictions need to be ndarrays.
+        # Format for groundtruth: [xmin, ymin, xmax, ymax, class_id, difficult, crowd]
+        # Format for predicted: [xmin, ymin, xmax, ymax, class_id, confidence]
+        n_gt, n_pred = len(gt_boxes_), len(pred_boxes_)
+        gt_np = np.zeros((n_gt, 7))
+        gt_np[:, :4] = torch2np(gt_boxes_)
+        gt_np[:, 4] = 0  # Class ID
+        pred_np = np.zeros((n_pred, 6))
+        pred_np[:, :4] = torch2np(pred_boxes_)
+        pred_np[:, 4] = 0  # Class ID
+        pred_np[:, 5] = torch2np(pred_scores_)
+        map_calc.add(pred_np, gt_np)
+    # Calculate mAP
+    mAP = map_calc.value(**settings)["mAP"]
+    return torch.tensor(mAP)
diff --git a/examples/stand_count/pytorch-ssd b/examples/stand_count/pytorch-ssd
new file mode 160000
index 0000000000000000000000000000000000000000..f61ab424d09bf3d4bb3925693579ac0a92541b0d
--- /dev/null
+++ b/examples/stand_count/pytorch-ssd
@@ -0,0 +1 @@
+Subproject commit f61ab424d09bf3d4bb3925693579ac0a92541b0d
diff --git a/examples/stand_count/ssd.py b/examples/stand_count/ssd.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfca965ca1846fd7843ca83ad3fa749a75d0f1ab
--- /dev/null
+++ b/examples/stand_count/ssd.py
@@ -0,0 +1,233 @@
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch.nn.modules.module import Module
+from torch.utils.data.dataloader import default_collate
+
+from .vision.nn.multibox_loss import MultiboxLoss
+from .vision.ssd.ssd import SSD, MatchPrior
+from .vision.utils import box_utils
+
+# Common across all SSD configs:
+center_variance = 0.1
+size_variance = 0.2
+
+
+class SSDWrapper(Module):
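+    """Wraps a pytorch-ssd SSD: resizes/normalizes the input, runs the net,
+    and (outside training) decodes and post-processes the raw outputs."""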
+    def __init__(
+        self,
+        net: SSD,
+        size: int,
+        image_mean: np.ndarray,
+        image_std: float,
+        priors: torch.Tensor,
+    ) -> None:
+        super().__init__()
+        self.net = net.eval()
+        self.input_shape = (1, 3, size, size)
+        mean = torch.as_tensor(image_mean).unsqueeze(-1).unsqueeze(-1)
+        self.register_buffer("mean", mean)
+        self.transform = lambda img: (F.interpolate(img, size) - self.mean) / image_std
+        self.register_buffer("priors", priors)
+        self.pprocessor = BoxPostprocessor()
+        self._scripting = False
+
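+    # The forward pass is split into numbered steps: step0/step1 are pure
+    # tensor ops that survive TorchScript export, while step2 does per-image
+    # Python post-processing and only runs in eager evaluation.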
+    def step0(self, image: torch.Tensor):
+        scores, locations = self.net(self.transform(image))
+        return scores, locations
+
+    def step1(self, scores: torch.Tensor, locations: torch.Tensor):
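+        # Turn class logits into probabilities and decode the regression
+        # offsets (relative to the priors) into corner-form boxes.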
+        confidences = F.softmax(scores, dim=2)
+        locations = box_utils.convert_locations_to_boxes(
+            locations,
+            self.priors,
+            center_variance,
+            size_variance,
+        )
+        boxes = box_utils.center_form_to_corner_form(locations)
+        return confidences, boxes
+
+    def step2(self, confidences: torch.Tensor, boxes: torch.Tensor):
+        postprocessed = [
+            self.pprocessor(image_scores, image_boxes)
+            for image_scores, image_boxes in zip(confidences, boxes)
+        ]
+        boxes, labels, confidences = zip(*postprocessed)
+        return boxes, labels, confidences
+
+    def forward(self, image: torch.Tensor):
+        scores, locations = self.step0(image)
+        if self.training:
+            return scores, locations
+        confidences, boxes = self.step1(scores, locations)
+        if self._scripting:
+            return confidences, boxes
+        boxes, labels, confidences = self.step2(confidences, boxes)
+        return boxes, labels, confidences, (scores, locations)
+
+
+def get_mobilenetv2_lite(prefix: str) -> SSDWrapper:
+    from .vision.ssd.config import mobilenetv1_ssd_config as config
+    from .vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite
+
+    ssd_model = create_mobilenetv2_ssd_lite(
+        2, onnx_compatible=True, is_test=False
+    ).eval()
+    ssd_model.init_from_pretrained_ssd(Path(prefix) / "mb2-ssd-lite-mp-0_686.pth")
+    wrapper = SSDWrapper(
+        ssd_model, config.image_size, config.image_mean, config.image_std, config.priors
+    )
+    return wrapper
+
+
+def get_vgg16(prefix: str) -> SSDWrapper:
+    from .vision.ssd.config import vgg_ssd_config as config
+    from .vision.ssd.vgg_ssd import create_vgg_ssd
+
+    ssd_model = create_vgg_ssd(2, is_test=False).eval()
+    ssd_model.init_from_pretrained_ssd(Path(prefix) / "vgg16-ssd-mp-0_7726.pth")
+    wrapper = SSDWrapper(
+        ssd_model, config.image_size, config.image_mean, config.image_std, config.priors
+    )
+    return wrapper
+
+
+class LossAndMAP:
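+    """Computes the SSD multibox losses and collects boxes for mAP.
+
+    Expects `model_output` from SSDWrapper: (scores, locations) in training
+    mode, or (boxes, labels, confidences, (scores, locations)) otherwise.
+    """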
+    def __init__(self, priors: torch.Tensor, device: Union[str, torch.device]) -> None:
+        priors = priors.cpu()
+        self.loss_fn = MultiboxLoss(
+            priors,
+            iou_threshold=0.5,
+            neg_pos_ratio=3,
+            center_variance=center_variance,
+            size_variance=size_variance,
+            device=device,
+        )
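+        # MatchPrior assigns each ground-truth box to priors (IoU >= 0.5) and
+        # encodes the matches as the regression targets MultiboxLoss expects.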
+        self.dataset_target_tr = MatchPrior(priors, center_variance, size_variance, 0.5)
+        self.device = device
+
+    def __call__(self, model_output, targets):
+        if len(model_output) == 2:
+            # Training outputs.
+            scores, locations = model_output
+            boxes, confidences = [], []
+        else:
+            # Evaluation outputs.
+            boxes, _, confidences, (scores, locations) = model_output
+        matched_labels, matched_boxes, gt_boxes = self._process_targets(targets)
+        reg_loss, cls_loss = self.loss_fn(
+            scores, locations, matched_labels, matched_boxes
+        )
+        return reg_loss, cls_loss, gt_boxes, boxes, confidences
+
+    def avg_reg_loss(self, loss_and_map):
+        reg_losses = list(zip(*loss_and_map))[0]
+        return torch.stack(reg_losses, 0).mean()
+
+    def avg_cls_loss(self, loss_and_map):
+        cls_losses = list(zip(*loss_and_map))[1]
+        return torch.stack(cls_losses, 0).mean()
+
+    def all_losses(self, loss_and_map):
+        return self.avg_reg_loss(loss_and_map) + self.avg_cls_loss(loss_and_map)
+
+    def combine_map(self, loss_and_map):
+        from .metric import calculate_mAP
+
+        flatten = lambda xss: [x for xs in xss for x in xs]
+        _, _, gt_boxes, boxes, confidences = zip(*loss_and_map)
+        gt_boxes = flatten(gt_boxes)
+        boxes, confidences = flatten(boxes), flatten(confidences)
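+        # Negated so a framework that minimizes this value maximizes mAP.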
+        return -calculate_mAP(gt_boxes, boxes, confidences)
+
+    def _process_targets(self, targets):
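+        # Keep the raw boxes for mAP; matching to priors happens on the CPU,
+        # then the encoded targets are collated back onto the training device.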
+        gt_boxes, matched_boxes, matched_labels = [], [], []
+        for t in targets:
+            this_boxes, this_labels = t["boxes"], t["labels"]
+            gt_boxes.append(this_boxes)
+            this_boxes_, this_labels_ = self.dataset_target_tr(
+                this_boxes.cpu(), this_labels.cpu()
+            )
+            matched_boxes.append(this_boxes_)
+            matched_labels.append(this_labels_)
+        matched_boxes = default_collate(matched_boxes).to(self.device)
+        matched_labels = default_collate(matched_labels).to(self.device)
+        return matched_labels, matched_boxes, gt_boxes
+
+
+class BoxPostprocessor(Module):
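+    """Per-class confidence filtering followed by NMS.
+
+    Mirrors the predictor logic in pytorch-ssd, packaged as a Module so it
+    can run per image inside SSDWrapper.step2.
+    """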
+    def __init__(
+        self,
+        nms_method=None,
+        iou_threshold=0.45,
+        filter_threshold=0.50,
+        candidate_size=200,
+        sigma=0.5,
+    ):
+        super().__init__()
+        self.iou_threshold = iou_threshold
+        self.filter_threshold = filter_threshold
+        self.candidate_size = candidate_size
+        self.nms_method = nms_method
+        self.sigma = sigma
+
+    def forward(self, scores, boxes, top_k=-1, prob_threshold=None):
+        cpu_device = torch.device("cpu")
+        prob_threshold = prob_threshold or self.filter_threshold
+        # This version of NMS is slower on the GPU, so move the data to the CPU.
+        boxes = boxes.to(cpu_device)
+        scores = scores.to(cpu_device)
+        picked_box_probs = []
+        picked_labels = []
+        for class_index in range(1, scores.size(1)):
+            probs = scores[:, class_index]
+            mask = probs > prob_threshold
+            probs = probs[mask]
+            if probs.size(0) == 0:
+                continue
+            subset_boxes = boxes[mask, :]
+            box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1)
+            box_probs = box_utils.nms(
+                box_probs,
+                self.nms_method,
+                score_threshold=prob_threshold,
+                iou_threshold=self.iou_threshold,
+                sigma=self.sigma,
+                top_k=top_k,
+                candidate_size=self.candidate_size,
+            )
+            picked_box_probs.append(box_probs)
+            picked_labels.extend([class_index] * box_probs.size(0))
+        if not picked_box_probs:
+            return torch.zeros(0, 4), torch.tensor([]), torch.tensor([])
+        picked_box_probs = torch.cat(picked_box_probs)
+        return (
+            picked_box_probs[:, :4],
+            torch.tensor(picked_labels),
+            picked_box_probs[:, 4],
+        )  # boxes, labels, scores
diff --git a/examples/stand_count/vision b/examples/stand_count/vision
new file mode 120000
index 0000000000000000000000000000000000000000..c69be8baf28e36e160cc6935b5cb768b7754b6a7
--- /dev/null
+++ b/examples/stand_count/vision
@@ -0,0 +1 @@
+pytorch-ssd/vision
\ No newline at end of file