8.2.3. Example: Object Detection using SSD

COCO dataset を対象に SSD-ResNet34 の学習済みモデルを用いた推論を行うサンプルプログラム

注釈

ここで利用している学習済みモデル、及び一部ソースコードは intel / ai-reference-models にあるものを部分的に改変、またはそのまま利用しています。これらのライセンスはいずれも Apache License, Version 2.0 です。

また、COCO dataset は CC BY 4.0 のライセンスで提供されており、その中でも特に CC BY 2.0 で提供されている画像を使用します。

Execution Method

初回実行時には以下の内容のダウンロードが行われます (二回目以降は省略されます)。デフォルトでのダウンロード先は /tmp/mlsdk_ssd_inference/ 以下です。

$ cd /opt/pfn/pfcomp/codegen/MLSDK/examples/ssd_inference
$ ./run_ssd_inference.sh /tmp/mlsdk_ssd_inference/coco/val2017/000000363666.jpg --device mncore2:auto

Expected Output

検出結果は、デフォルトでは実行時のカレントディレクトリに out_mncore_ prefix のファイルとして出力されます。同時に out_torch_ prefix のファイルも出力されますが、こちらは結果比較用に PyTorch で実行した結果です。比較には structural similarity index measure (SSIM) アルゴリズムを用います。

Tip

結果ファイルの出力先ディレクトリを指定するには --out_img_dir オプションを使用できます。

  • 検出結果 (./out_mncore_000000363666.jpg)

Object detection result using SSD-ResNet34 on MN-Core 2

図 8.3 Object detection using SSD-ResNet34 on MN-Core 2

  • ログ出力 (SSIM score は 0.99 以上であれば PyTorch と同等の推論が出来ているとみなします。)

Drawing detection result => /opt/pfn/pfcomp/codegen/MLSDK/examples/ssd_inference/out_mncore_000000363666.jpg
Drawing detection result => /opt/pfn/pfcomp/codegen/MLSDK/examples/ssd_inference/out_torch_000000363666.jpg
SSIM score: 0.9959848160827895

Scripts

リスト 8.18 /opt/pfn/pfcomp/codegen/MLSDK/examples/ssd_inference/run_ssd_inference.sh
 1#! /bin/bash
 2
 3set -eux -o pipefail
 4
 5EXAMPLE_NAME="mlsdk_ssd_inference"
 6VENV_DIR=${VENV_DIR:-"/tmp/${EXAMPLE_NAME}/venv"}
 7EXTERNAL_DIR=${EXTERNAL_DIR:-"/tmp/${EXAMPLE_NAME}/external"}
 8COCO_DIR=${COCO_DIR:-"/tmp/${EXAMPLE_NAME}/coco"}
 9OUT_DIR=${OUT_DIR:-"/tmp/${EXAMPLE_NAME}/out"}
10
11CURRENT_DIR=$(realpath $(dirname $0))
12CODEGEN_DIR=$(realpath ${CURRENT_DIR}/../../../)
13BUILD_DIR="${CODEGEN_DIR}/build"
14
15### Prepare and source venv/
16
17if [[ ! -d ${VENV_DIR} ]]; then
18    python3 -m venv --system-site-packages ${VENV_DIR}
19    source ${VENV_DIR}/bin/activate
20    pip3 install -r ${CURRENT_DIR}/requirements.txt
21else
22    source ${VENV_DIR}/bin/activate
23fi
24
25### Prepare external/ items
26
27mkdir -p ${EXTERNAL_DIR}
28pushd ${EXTERNAL_DIR}
29
30if [[ ! -d ai-reference-models ]]; then
31    git clone 'https://github.com/intel/ai-reference-models.git' ai-reference-models --depth 1 --branch v3.3
32fi
33if [[ ! -f resnet34-ssd1200.pth ]]; then
34    # For downloading the trained model, please refer to this documentation.
35    # Ref: https://github.com/intel/ai-reference-models/blob/main/models_v2/pytorch/ssd-resnet34/inference/cpu/CONTAINER.md
36    wget --no-check-certificate \
37        'https://docs.google.com/uc?export=download&id=13kWgEItsoxbVKUlkQz4ntjl1IZGk6_5Z' \
38        -O resnet34-ssd1200.pth
39fi
40
41popd
42
43### Prepare coco/ items
44
45mkdir -p ${COCO_DIR}
46pushd ${COCO_DIR}
47
48# download the COCO datasets and annotations
49if [[ ! -d val2017 ]]; then
50    curl -O 'http://images.cocodataset.org/zips/val2017.zip'
51    unzip val2017.zip
52    rm -f val2017.zip
53fi
54if [[ ! -d annotations ]]; then
55    curl -O 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip'
56    unzip annotations_trainval2017.zip
57    rm -f annotations_trainval2017.zip
58fi
59# fetch images and annotations for inference
60if [[ ! -f annotations/fetched_annotations_eval.json ]]; then
61    python3 ${CURRENT_DIR}/coco_preparation.py \
62        -a annotations/instances_val2017.json \
63        -i val2017 \
64        -f annotations/fetched_annotations_eval.json
65fi
66
67popd
68
69### Run ssd_inference.py
70
71source "${BUILD_DIR}/codegen_pythonpath.sh"
72
73# Do not split a small h/w image among L1/L2 blocks.
74export CODEGEN_SPATIAL_SPLIT_THRESHOLD_L2B_L1B=32
75
76PYTHONPATH=${PYTHONPATH}:"${EXTERNAL_DIR}/ai-reference-models/models_v2/pytorch/ssd-resnet34/inference/cpu"
77python3 ${CURRENT_DIR}/ssd_inference.py \
78    --model_path "${EXTERNAL_DIR}/resnet34-ssd1200.pth" \
79    --coco_data_path "${COCO_DIR}/val2017" \
80    --coco_annotation_path "${COCO_DIR}/annotations/fetched_annotations_eval.json" \
81    --out_img_dir "${CURRENT_DIR}" \
82    --outdir "${OUT_DIR}" \
83    --option_json "${CODEGEN_DIR}/preset_options/O1.json" \
84    ${@}
リスト 8.19 /opt/pfn/pfcomp/codegen/MLSDK/examples/ssd_inference/ssd_inference.py
  1import argparse
  2import os
  3from pathlib import Path
  4from typing import Mapping, Optional
  5
  6import cv2
  7import matplotlib
  8import matplotlib.patches as patches
  9import matplotlib.pyplot as plt
 10import numpy as np
 11import torch
 12import torch.utils.data
 13from PIL import Image, ImageFile
 14from skimage.metrics import structural_similarity as ssim
 15
 16# isort: off
 17
 18# MLSDK modules
 19from mlsdk import (
 20    MNDevice,
 21    Context,
 22    storage,
 23    CacheOptions,
 24    TensorLike,
 25)
 26
 27# following modules are from:
 28# [ai-reference-models repository](https://github.com/intel/ai-reference-models/tree/main/models_v2/pytorch/ssd-resnet34/inference/cpu)  # NOQA: B950
 29from infer import dboxes_R34_coco
 30from ssd_r34 import SSD_R34
 31from utils import COCODetection, Encoder, SSDTransformer
 32
 33# isort: on
 34
 35# No GUI output
 36matplotlib.use("Agg")
 37
 38
 39# label_num = 81 and strides = [3, 3, 2, 2, 2, 2] are from original source in:
 40# https://github.com/intel/ai-reference-models/tree/main/models_v2/pytorch/ssd-resnet34/inference/cpu/infer.py
 41def load_model(
 42    path: str | os.PathLike,
 43    label_num: int = 81,
 44    strides: tuple[int, ...] = (3, 3, 2, 2, 2, 2),
 45) -> torch.nn.Module:
 46    # Create class object
 47    model = SSD_R34(label_num, strides=strides)
 48    # Load pretrained model's parameters
 49    model.load_state_dict(
 50        torch.load(path, map_location=lambda storage, loc: storage)["model"]
 51    )
 52    assert isinstance(model, torch.nn.Module)
 53    return model
 54
 55
 56# Load an input image given as a path
 57def load_image(
 58    img_path: str | os.PathLike,
 59    img_size: tuple[int, int] = (1200, 1200),
 60) -> tuple[ImageFile.ImageFile, torch.Tensor]:
 61    # Resize input img for model
 62    orig_img = Image.open(img_path)
 63
 64    # Convert img to tensor and change shape
 65    resized_img = orig_img.resize(img_size)
 66    converted_img_np = np.array(resized_img).transpose(
 67        2, 0, 1
 68    )  # (C,H,W), assuming dtype=uint8
 69    converted_img = (
 70        torch.from_numpy(converted_img_np).contiguous().unsqueeze(dim=0)
 71    )  # (1,C,H,W)
 72
 73    return orig_img, converted_img
 74
 75
 76# Decode a model's output for an input image
 77def decode_outputs(
 78    out_locs: torch.Tensor,
 79    out_labels: torch.Tensor,
 80    encoder: Encoder,
 81    criteria: float = 0.50,
 82    max_output: int = 200,
 83    device: int = 0,
 84) -> list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
 85    try:
 86        decoded_outputs = encoder.decode_batch(
 87            out_locs,
 88            out_labels,
 89            criteria=criteria,
 90            max_output=max_output,
 91            device=device,
 92        )
 93    except Exception as e:
 94        print(f"Error in decode_outputs: {type(e)} - {e}")
 95        return []
 96
 97    results = []
 98    idx = 0
 99    for i in range(decoded_outputs[3].size(0)):
100        detection_num = decoded_outputs[3][i].item()
101        idx_range = idx + detection_num
102        results.append(
103            (
104                decoded_outputs[0][idx:idx_range],
105                decoded_outputs[1][idx:idx_range],
106                decoded_outputs[2][idx:idx_range],
107            )
108        )
109        idx += detection_num
110
111    return results
112
113
114# This function is originally from:
115# [ai-reference-models](https://github.com/intel/ai-reference-models/blob/main/models_v2/pytorch/ssd-resnet34/inference/cpu/utils.py)  # NOQA: B950
116# Drawing bboxes and labels for detected objects
117def draw_patches(  # NOQA: CFQ002
118    img: ImageFile.ImageFile,
119    img_path: Path,
120    bboxes: torch.Tensor,
121    labels: torch.Tensor,
122    order: str = "ltrb",
123    label_map: Mapping[int, str] | None = None,
124    bbox_alpha: float = 1.0,
125    bbox_linewidth: int = 1,
126    label_alpha: float = 0.3,
127    label_linewidth: int = 2,
128    text_size: int = 10,
129) -> None:
130    img_np = np.array(img)
131    labels_np = labels.detach().numpy()
132    bboxes_np = bboxes.detach().numpy()
133
134    # From labels_np (ndarray[int]) to labels_list (list[str])
135    if label_map is not None:
136        labels_list = [
137            label_map.get(int(label_i), "(no label)") for label_i in labels_np
138        ]
139    else:
140        # Use an int value as a label str
141        labels_list = [str(int(label_i)) for label_i in labels_np]
142
143    if order == "ltrb":
144        xmin, ymin, xmax, ymax = (
145            bboxes_np[:, 0],
146            bboxes_np[:, 1],
147            bboxes_np[:, 2],
148            bboxes_np[:, 3],
149        )
150        cx, cy, w, h = (xmin + xmax) / 2, (ymin + ymax) / 2, xmax - xmin, ymax - ymin
151    else:
152        cx, cy, w, h = (
153            bboxes_np[:, 0],
154            bboxes_np[:, 1],
155            bboxes_np[:, 2],
156            bboxes_np[:, 3],
157        )
158
159    htot, wtot, _ = img_np.shape
160    cx *= wtot
161    cy *= htot
162    w *= wtot
163    h *= htot
164
165    plt.imshow(img_np)
166    ax = plt.gca()
167    for cx_i, cy_i, w_i, h_i, label_i in zip(cx, cy, w, h, labels_list):
168        if label_i == "background":
169            continue
170        ax.add_patch(
171            patches.Rectangle(
172                (cx_i - 0.5 * w_i, cy_i - 0.5 * h_i),
173                w_i,
174                h_i,
175                fill=False,
176                color="r",
177                alpha=bbox_alpha,
178                linewidth=bbox_linewidth,
179            )
180        )
181        bbox_props = dict(
182            boxstyle="square,pad=0",
183            fc="y",
184            ec="y",
185            alpha=label_alpha,
186            linewidth=label_linewidth,
187        )
188        ax.text(
189            cx_i - 0.5 * w_i,
190            cy_i - 0.5 * h_i,
191            label_i,
192            ha="left",
193            va="bottom",
194            size=text_size,
195            bbox=bbox_props,
196        )
197
198    plt.savefig(img_path)
199    plt.clf()
200    plt.close()
201
202
203# Fetch results satisfying threshold and draw bounding box on the given input image
204def draw_detection_result(  # NOQA: CFQ002
205    out_locs: torch.Tensor,
206    out_labels: torch.Tensor,
207    img: ImageFile.ImageFile,
208    img_path: Path,
209    encoder: Encoder,
210    threshold: float,
211    label_info: dict[int, str],
212) -> None:
213
214    decoded_output = decode_outputs(out_locs, out_labels, encoder)
215    if not decoded_output:
216        print("no objects have been detected")
217        return
218
219    bboxes, labels, scores = decoded_output[0]
220    detection_mask = scores > threshold
221    fetched_bboxes = bboxes[detection_mask]
222    fetched_labels = labels[detection_mask]
223    draw_patches(img, img_path, fetched_bboxes, fetched_labels, label_map=label_info)
224
225
226def calc_ssim(
227    lhs_img_path: Path,
228    rhs_img_path: Path,
229) -> float:
230    lhs_img = cv2.imread(lhs_img_path)
231    rhs_img = cv2.imread(rhs_img_path)
232    assert lhs_img.shape == rhs_img.shape
233
234    # Convert to gray scale (only image structure is necessary for SSIM)
235    lhs_img_gray = cv2.cvtColor(lhs_img, cv2.COLOR_BGR2GRAY)
236    rhs_img_gray = cv2.cvtColor(rhs_img, cv2.COLOR_BGR2GRAY)
237
238    return float(ssim(lhs_img_gray, rhs_img_gray))  # type: ignore[no-untyped-call]
239
240
241def run_infer(  # NOQA: CFQ002
242    *,
243    img_path: Path,
244    model_path: Path,
245    coco_data_path: Path,
246    coco_annotation_path: Path,
247    out_img_dir: Path,
248    threshold: float,
249    device_name: str,
250    outdir: str,
251    option_json_path: Optional[Path] = None,
252) -> None:
253    # Create Dataset object
254    IMG_SIZE = [1200, 1200]
255    STRIDES = [3, 3, 2, 2, 2, 2]
256    default_boxes = dboxes_R34_coco(IMG_SIZE, STRIDES)
257    transformer = SSDTransformer(default_boxes, tuple(IMG_SIZE), val=True)
258    coco = COCODetection(coco_data_path, coco_annotation_path, transformer)
259
260    # Create encoder for decode process
261    encoder = Encoder(default_boxes)
262
263    # Load and prepare images
264    orig_img, converted_img = load_image(img_path)
265    infer_input = {"image": converted_img}
266    img_name = img_path.name
267
268    # Create pretrained model object
269    model = load_model(model_path)
270    model.eval()
271
272    # Inference function for Context.compile
273    def infer_fn(sample: dict[str, TensorLike]) -> dict[str, TensorLike]:
274        with torch.no_grad():
275            locs, labels = model(sample["image"].float() / 255.0)
276        return {"locs": locs, "labels": labels}
277
278    device = MNDevice(device_name)
279    context = Context(device)
280    Context.switch_context(context)
281
282    context.registry.register("model", model)
283
284    compile_options = {}
285    if option_json_path is not None:
286        compile_options = {"option_json": str(option_json_path)}
287
288    compiled_infer_fn = context.compile(
289        infer_fn,
290        infer_input,
291        storage.path(outdir),
292        options=compile_options,
293        cache_options=CacheOptions(outdir + "/cache"),
294    )
295
296    out_mncore = compiled_infer_fn(infer_input)
297    out_locs_mncore, out_labels_mncore = (
298        out_mncore["locs"].cpu(),
299        out_mncore["labels"].cpu(),
300    )
301
302    mncore_img_path = out_img_dir / ("out_mncore_" + img_name)
303    print(f"Drawing detection result => {mncore_img_path}")
304    draw_detection_result(
305        out_locs_mncore,
306        out_labels_mncore,
307        orig_img,
308        mncore_img_path,
309        encoder,
310        threshold,
311        label_info=coco.label_info,
312    )
313
314    out_torch = infer_fn(infer_input)
315    out_locs_torch, out_labels_torch = (
316        out_torch["locs"].cpu(),
317        out_torch["labels"].cpu(),
318    )
319
320    torch_img_path = out_img_dir / ("out_torch_" + img_name)
321    print(f"Drawing detection result => {torch_img_path}")
322    draw_detection_result(
323        out_locs_torch,
324        out_labels_torch,
325        orig_img,
326        torch_img_path,
327        encoder,
328        threshold,
329        label_info=coco.label_info,
330    )
331
332    score = calc_ssim(mncore_img_path, torch_img_path)
333    print(f"SSIM score: {score}")
334    assert score > 0.99, "Generated images differ."
335
336
337def main() -> None:
338    parser = argparse.ArgumentParser(
339        description="Run SSD-ResNet34-1200 model for input image"
340    )
341    parser.add_argument("img_path", type=Path, help="Path to input image")
342    parser.add_argument(
343        "--model_path",
344        type=Path,
345        required=True,
346        help="Path to trained SSD model (e.g. resnet34-ssd1200.pth)",
347    )
348    parser.add_argument(
349        "--coco_data_path", type=Path, required=True, help="Path to the COCO dataset"
350    )
351    parser.add_argument(
352        "--coco_annotation_path",
353        type=Path,
354        required=True,
355        help="Path to annotation data (JSON) corresponding to the COCO dataset",
356    )
357    parser.add_argument(
358        "--out_img_dir",
359        type=Path,
360        default=Path("."),
361        help="Path to directory to output detection result images",
362    )
363    parser.add_argument(
364        "--threshold",
365        type=float,
366        default=0.4,
367        help="""
368        Detection threshold (0.0-1.0), a smaller threshold make object detecting
369        more sensitive""",
370    )
371    parser.add_argument("--device", type=str, default="mncore2:auto")
372    parser.add_argument("--outdir", type=str, default="/tmp/mlsdk_ssd_inference/out")
373    parser.add_argument(
374        "--option_json",
375        type=Path,
376        default="/opt/pfn/pfcomp/codegen/preset_options/O1.json",
377    )
378    args = parser.parse_args()
379
380    # Validate arguments
381    assert 0 <= args.threshold and args.threshold <= 1.0
382    assert not args.out_img_dir.is_file(), "Please specify directory path."
383
384    args.out_img_dir.mkdir(parents=True, exist_ok=True)
385
386    run_infer(
387        img_path=args.img_path,
388        model_path=args.model_path,
389        coco_data_path=args.coco_data_path,
390        coco_annotation_path=args.coco_annotation_path,
391        out_img_dir=args.out_img_dir,
392        threshold=args.threshold,
393        device_name=args.device,
394        outdir=args.outdir,
395        option_json_path=args.option_json,
396    )
397
398
399if __name__ == "__main__":
400    main()
リスト 8.20 /opt/pfn/pfcomp/codegen/MLSDK/examples/ssd_inference/requirements.txt
1matplotlib
2pycocotools
3defusedxml
リスト 8.21 /opt/pfn/pfcomp/codegen/MLSDK/examples/ssd_inference/coco_preparation.py
 1import argparse
 2import json
 3import os
 4from typing import Any
 5
 6
 7def fetch_imgs_and_ids(
 8    json_obj: dict[str, Any], license_id_list: list[int]
 9) -> tuple[list[list[dict[str, int | str]]], list[int]]:
10    img_list = []
11    id_list = []
12    for i in json_obj["images"]:
13        if i["license"] in license_id_list:
14            img_list.append(i)
15            id_list.append(i["id"])
16
17    return img_list, id_list
18
19
20def fetch_annotations(json_obj: dict[str, Any], id_list: list[int]) -> list[Any]:
21    ann_list = []
22    for i in json_obj["annotations"]:
23        if i["image_id"] in id_list:
24            ann_list.append(i)
25
26    return ann_list
27
28
29def remove_wasted_imgs(
30    json_obj: dict[str, Any], fetched_dict: dict[str, Any], img_dir: str | os.PathLike
31) -> None:
32    org_file_names = set([i["file_name"] for i in json_obj["images"]])
33    fetched_file_name = set([i["file_name"] for i in fetched_dict["images"]])
34    diff_set = org_file_names - fetched_file_name
35    for i in diff_set:
36        wasted_img_path = os.path.join(img_dir, i)
37        if os.path.exists(wasted_img_path):
38            os.remove(wasted_img_path)
39
40
41def main() -> None:
42    parser = argparse.ArgumentParser()
43    parser.add_argument(
44        "-a",
45        "--annotation_file",
46        type=str,
47        default="./coco/annotations/instances_val2017.json",
48    )
49    parser.add_argument("-i", "--images_dir", type=str, default="./coco/val2017/")
50    parser.add_argument(
51        "-f",
52        "--fetched_annotation_file",
53        type=str,
54        default="./coco/annotations/fetched_annotations.json",
55    )
56    args = parser.parse_args()
57
58    # fetch images' infomation with no license problems
59    json_obj = None
60    with open(args.annotation_file, mode="r") as f:
61        json_obj = json.load(f)
62    license_id_list = [4]  # 4: CC-BY 2.0
63
64    img_list, id_list = fetch_imgs_and_ids(json_obj, license_id_list)
65    ann_list = fetch_annotations(json_obj, id_list)
66    fetched_dict = (
67        {"info": json_obj["info"]}
68        | {"licenses": json_obj["licenses"]}
69        | {"images": img_list}
70        | {"annotations": ann_list}
71        | {"categories": json_obj["categories"]}
72    )
73
74    # dump fetched infomations
75    with open(args.fetched_annotation_file, mode="w") as f:
76        json.dump(fetched_dict, f)
77
78    # remove files unused in inference
79    remove_wasted_imgs(json_obj, fetched_dict, args.images_dir)
80
81
82if __name__ == "__main__":
83    main()