7.2.5. Example: Image Segmentation using DETR

Sample program demonstrating inference using the DETR trained model on the COCO dataset

Note

The trained model and some of the source code used in this example are taken from facebookresearch/detr, in some cases with modifications. All of these components are licensed under the Apache License, Version 2.0.

The COCO dataset is licensed under CC BY 4.0, and we exclusively use images licensed under CC BY 2.0.

Execution Method

The first execution performs the following downloads (subsequent runs will skip these steps). By default, the download location is /tmp/mlsdk_detr_inference/.

$ cd /opt/pfn/pfcomp/codegen/MLSDK/examples/detr_inference
$ ./run_detr_inference.sh /tmp/mlsdk_detr_inference/coco/val2017/000000000785.jpg

Expected Output

A segmentation result will be saved in the output directory (/tmp/mlsdk_detr_inference/out/ by default, configurable with --outdir).

  • Segmentation result (/tmp/mlsdk_detr_inference/out/000000000785.png)

Segmentation result using DETR on MN-Core 2

Fig. 7.5 Segmentation result using DETR on MN-Core 2

Scripts

Listing 7.23 /opt/pfn/pfcomp/codegen/MLSDK/examples/detr_inference/run_detr_inference.sh
 1#! /bin/bash
 2
 3set -eux -o pipefail
 4
 5EXAMPLE_NAME="mlsdk_detr_inference"
 6VENV_DIR=${VENV_DIR:-"/tmp/${EXAMPLE_NAME}/venv"}
 7EXTERNAL_DIR=${EXTERNAL_DIR:-"/tmp/${EXAMPLE_NAME}/external"}
 8COCO_DIR=${COCO_DIR:-"/tmp/${EXAMPLE_NAME}/coco"}
 9OUT_DIR=${OUT_DIR:-"/tmp/${EXAMPLE_NAME}/out"}
10
11CURRENT_DIR=$(realpath $(dirname $0))
12CODEGEN_DIR=$(realpath ${CURRENT_DIR}/../../../)
13BUILD_DIR="${CODEGEN_DIR}/build"
14
15### Prepare and source venv/
16
17if [[ ! -d ${VENV_DIR} ]]; then
18    python3 -m venv --system-site-packages ${VENV_DIR}
19    source ${VENV_DIR}/bin/activate
20    pip3 install -r ${CURRENT_DIR}/requirements.txt
21else
22    source ${VENV_DIR}/bin/activate
23fi
24
25### Prepare external/ items
26
27mkdir -p ${EXTERNAL_DIR}
28pushd ${EXTERNAL_DIR}
29if [[ ! -d detr ]]; then
30    git clone https://github.com/facebookresearch/detr.git --depth 1
31fi
32popd
33
34TARGET_FILES=(
35    "models/detr.py"
36    "models/matcher.py"
37)
38
39for REL_PATH in "${TARGET_FILES[@]}"; do
40    BASE_NAME=$(basename "$REL_PATH" .py)
41    PATCH_TARGET="${EXTERNAL_DIR}/detr/${REL_PATH}"
42    PATCH_FILE="${CURRENT_DIR}/patches/${BASE_NAME}.patch"
43    patch --forward --backup -i "$PATCH_FILE" "$PATCH_TARGET" || [ $? -eq 1 ]
44done
45
46cp ${CURRENT_DIR}/lsa.py ${EXTERNAL_DIR}/detr/models/
47
48### Run detr_inference.py
49
50export PYTHONPATH="${EXTERNAL_DIR}/detr${PYTHONPATH:+:${PYTHONPATH}}"
51echo PYTHONPATH
52
53source "${BUILD_DIR}/codegen_pythonpath.sh"
54
55export MNCORE_USE_EXTERNAL_DATA_FORMAT=1
56
57python3 ${CURRENT_DIR}/detr_inference.py ${@}
Listing 7.24 /opt/pfn/pfcomp/codegen/MLSDK/examples/detr_inference/detr_inference.py
 1import argparse
 2from pathlib import Path
 3from typing import Any
 4
 5import torch
 6import torchvision.transforms as T
 7from detr_eval import DETRSegmToBBoxAttn, MaskHeadPart, run_eval
 8from mlsdk import Context, MNDevice
 9from PIL import Image
10from utility import apply_toml_defaults
11
12
def prepare_task_components(args: argparse.Namespace) -> dict[str, Any]:
    """Load the pretrained panoptic DETR model and preprocess the input image.

    Returns a dict with keys:
        "model"            -- pretrained DETR ResNet-50 panoptic model
        "postprocessors"   -- matching post-processor from torch.hub
        "orig_target_size" -- original (H, W) of the image, shape (1, 2)
        "image"            -- normalized input tensor with a batch dimension
    """
    # Fetch the pretrained model and its post-processor from torch.hub.
    model, postprocessors = torch.hub.load(
        "facebookresearch/detr",
        "detr_resnet50_panoptic",
        pretrained=True,
        return_postprocessor=True,
    )

    # Standard PyTorch mean-std input image normalization.
    preprocess = T.Compose(
        [
            T.Resize((800, 800)),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )

    source_image = Image.open(args.img_path)
    # Keep the pre-resize size so predictions can be mapped back later.
    original_hw = torch.as_tensor(
        T.functional.to_tensor(source_image).shape[-2:]
    ).unsqueeze(0)

    return {
        "model": model,
        "postprocessors": postprocessors,
        "orig_target_size": original_hw,
        "image": preprocess(source_image).unsqueeze(0),
    }
39
40
def main(args: argparse.Namespace) -> None:
    """Compile and run split DETR inference on the configured device."""
    # Register the target device with a fresh MLSDK context.
    context = Context(MNDevice(args.device_name))
    Context.switch_context(context)

    components = prepare_task_components(args)
    # Split the model in two: the mask head is compiled and run separately
    # (see detr_eval.py); the remainder runs up to the bbox attention.
    components["mask_head"] = MaskHeadPart(
        components["model"], num_split=args.num_split
    )
    components["model"] = DETRSegmToBBoxAttn(components["model"])

    run_eval(args, components, context)
54
55
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser, including defaults taken from configs.toml."""
    parser = argparse.ArgumentParser()

    parser.add_argument("img_path", type=Path, help="Path to input image")
    parser.add_argument("--device", type=str, default="mncore2:auto")
    parser.add_argument("--outdir", type=str, default="/tmp/mlsdk_detr_inference/out")
    parser.add_argument(
        "--option_json",
        type=Path,
        default="/opt/pfn/pfcomp/codegen/preset_options/O1.json",
    )

    # load configs from toml file
    apply_toml_defaults(Path(__file__).resolve().parent / "configs.toml", parser)
    return parser


if __name__ == "__main__":
    args = _build_parser().parse_args()

    # The detr modules read args.device and must run on CPU, so keep the
    # MN-Core device string separately under args.device_name.
    args.device, args.device_name = "cpu", args.device

    main(args)
Listing 7.25 /opt/pfn/pfcomp/codegen/MLSDK/examples/detr_inference/detr_eval.py
  1import io
  2import os
  3from argparse import Namespace
  4from pathlib import Path
  5from typing import Any
  6
  7import numpy
  8import torch
  9from detectron2.data import MetadataCatalog
 10from detectron2.utils.visualizer import Visualizer
 11from mlsdk import CompiledFunction, Context
 12from models.segmentation import DETRsegm
 13from panopticapi.utils import rgb2id
 14from PIL import Image
 15from util.misc import nested_tensor_from_tensor_list
 16from utility import compile_fn
 17
 18
 19class DETRSegmToBBoxAttn(torch.nn.Module):
 20    def __init__(self, detr_segm: DETRsegm) -> None:
 21        super().__init__()
 22        self.detr = detr_segm.detr
 23        self.bbox_attention = detr_segm.bbox_attention
 24
 25    def forward(self, sample: torch.Tensor) -> dict[str, torch.Tensor]:
 26        out = {}
 27
 28        sample = nested_tensor_from_tensor_list(sample)
 29        features, pos = self.detr.backbone(sample)
 30
 31        src, mask = features[-1].decompose()
 32        src_proj = self.detr.input_proj(src)
 33        hs, memory = self.detr.transformer(
 34            src_proj, mask, self.detr.query_embed.weight, pos[-1]
 35        )
 36
 37        outputs_class = self.detr.class_embed(hs)
 38        outputs_coord = self.detr.bbox_embed(hs).sigmoid()
 39
 40        # FIXME h_boxes takes the last one computed, keep this in mind
 41        bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask)
 42
 43        out.update(
 44            feat2=features[2].tensors,
 45            feat1=features[1].tensors,
 46            feat0=features[0].tensors,
 47            src_proj=src_proj,
 48            pred_logits=outputs_class[-1],
 49            pred_boxes=outputs_coord[-1],
 50            bbox_mask=bbox_mask,
 51        )
 52
 53        return out
 54
 55
 56class MaskHeadPart(torch.nn.Module):
 57    def __init__(self, detr_segm: DETRsegm, num_split: int = 100) -> None:
 58        super().__init__()
 59
 60        self.mask_head = detr_segm.mask_head
 61        self.num_queries = detr_segm.detr.num_queries  # 100
 62        self.num_split = num_split
 63
 64    def forward(
 65        self, x: torch.Tensor, bbox_mask: torch.Tensor, fpns: list[torch.Tensor]
 66    ) -> torch.Tensor:
 67        seg_mask = self.mask_head(x, bbox_mask, fpns)
 68        output_seg_mask = seg_mask.view(
 69            x.shape[0],
 70            self.num_queries // self.num_split,
 71            seg_mask.shape[-2],
 72            seg_mask.shape[-1],
 73        )
 74
 75        return output_seg_mask
 76
 77
 78def visualize_prediction(
 79    args: Namespace,
 80    task_components: dict[str, Any],
 81    outputs: dict[str, torch.Tensor],
 82) -> None:
 83
 84    image = numpy.array(Image.open(args.img_path))[:, :, ::-1]
 85    result = task_components["postprocessors"](outputs, [image.shape[:2]])[0]
 86
 87    # Panoptic predictions are stored in a special format png
 88    panoptic_seg = Image.open(io.BytesIO(result["png_string"]))
 89
 90    # We convert the png into an segment id map
 91    panoptic_seg = numpy.array(panoptic_seg, dtype=numpy.uint8)
 92    panoptic_seg = torch.from_numpy(rgb2id(panoptic_seg))
 93
 94    # Detectron2 uses a different numbering of coco classes,
 95    # here we convert the class ids accordingly
 96    meta = MetadataCatalog.get("coco_2017_val_panoptic_separated")
 97    for info in result["segments_info"]:
 98        c = info["category_id"]
 99        if info["isthing"]:
100            info["category_id"] = meta.thing_dataset_id_to_contiguous_id[c]
101        else:
102            info["category_id"] = meta.stuff_dataset_id_to_contiguous_id[c]
103
104    # Finally we visualize the prediction
105    v = Visualizer(image, meta)
106    v._default_font_size = 20
107    v = v.draw_panoptic_seg_predictions(panoptic_seg, result["segments_info"])
108    v.save(os.path.join(args.outdir, f"{Path(args.img_path).stem}.png"))
109
110
def compile_eval_fn(
    args: Namespace,
    task_components: dict[str, Any],
    context: Context,
) -> tuple[CompiledFunction, CompiledFunction]:
    """Compile both halves of the split DETR model.

    On the mncore2 backend the DETR model is separated into two halves to
    avoid LM out-of-memory and reshape errors: the first half calculates
    the bbox_masks, and the second half calculates the remaining MaskHead
    part.

    Returns:
        (eval_fn, mask_fn): compiled first half and compiled mask head.

    BUG FIX: the return annotation previously declared a 3-tuple including
    a dict of tensors, although only two values are returned.
    """

    def eval_to_bbox_or_full(
        sample: dict[str, torch.Tensor],
    ) -> dict[str, torch.Tensor]:  # eval_all_or_bbox
        with torch.no_grad():
            return task_components["model"](sample["image"])

    def eval_mask_head(sample: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        with torch.no_grad():
            pred_mask = task_components["mask_head"](
                sample["src_proj"],
                sample["bbox_mask"],
                [sample["feat2"], sample["feat1"], sample["feat0"]],
            )

        return {"pred_mask": pred_mask}

    # Compile the first half against the real preprocessed image.
    sample = {"image": task_components["image"]}

    eval_fn = compile_fn(
        context,
        eval_to_bbox_or_full,
        task_components["model"],
        sample,
        os.path.join(args.outdir, "to_bbox"),
        model_name="detr_to_bbox",
        option_json=args.option_json,
    )

    # Compile the mask head against dummy tensors matching its inputs;
    # bbox_mask is chunked along the query dimension by num_split.
    sample.update(
        src_proj=torch.randn(1, 256, 25, 25),
        bbox_mask=torch.rand(
            1, task_components["mask_head"].num_queries // args.num_split, 8, 25, 25
        ),  # originally, 1, 100, 8, 25, 25
        feat2=torch.randn(1, 1024, 50, 50),
        feat1=torch.randn(1, 512, 100, 100),
        feat0=torch.randn(1, 256, 200, 200),
    )

    mask_fn = compile_fn(
        context,
        eval_mask_head,
        task_components["mask_head"],
        sample,
        os.path.join(args.outdir, "mask_head"),
        model_name="detr_mask_head",
        option_json=args.option_json,
    )

    return eval_fn, mask_fn
174
175
@torch.no_grad()
def evaluate(
    args: Namespace,
    eval_fn: CompiledFunction,
    mask_fn: CompiledFunction,
    task_components: dict[str, Any],
) -> dict[str, torch.Tensor]:
    """Run both compiled halves and assemble the final prediction dict."""

    sample = {"image": task_components["image"]}
    first_half = eval_fn(sample)

    # Feed the bbox attention maps to the mask head in args.num_split
    # chunks along the query dimension (dim=1).
    num_queries = first_half["bbox_mask"].shape[1]
    chunks = first_half["bbox_mask"].split(num_queries // args.num_split, dim=1)
    sample.update(
        src_proj=first_half["src_proj"],
        feat2=first_half["feat2"],
        feat1=first_half["feat1"],
        feat0=first_half["feat0"],
    )

    mask_chunks = []
    for chunk in chunks:
        sample["bbox_mask"] = chunk
        mask_chunks.append(mask_fn(sample)["pred_mask"].cpu())
    all_masks = torch.cat(mask_chunks, dim=0)

    # Recombine the per-chunk masks into one (1, num_queries, H, W) tensor.
    return {
        "pred_logits": first_half["pred_logits"],
        "pred_boxes": first_half["pred_boxes"],
        "pred_masks": all_masks.view(
            1, num_queries, all_masks.shape[-2], all_masks.shape[-1]
        ),
    }
212
213
def run_eval(
    args: Namespace, task_components: dict[str, Any], context: Context
) -> None:
    """End-to-end: compile both model halves, run inference, save the PNG."""

    # Inference only: put every module into eval mode.
    for key in ("model", "mask_head", "postprocessors"):
        task_components[key].eval()

    eval_fn, mask_fn = compile_eval_fn(args, task_components, context)
    predictions = evaluate(args, eval_fn, mask_fn, task_components)
    visualize_prediction(args, task_components, predictions)
Listing 7.26 /opt/pfn/pfcomp/codegen/MLSDK/examples/detr_inference/utility.py
  1import argparse
  2import os
  3import sys
  4from collections.abc import Callable
  5from pathlib import Path
  6
  7import tomllib
  8import torch
  9from mlsdk import (
 10    CompiledFunction,
 11    Context,
 12    get_tensor_name,
 13    set_tensor_name_in_module,
 14    storage,
 15)
 16
 17
 18def register_model(
 19    context: Context,
 20    name: str,
 21    model: torch.nn.Module,
 22) -> None:
 23    if (
 24        get_tensor_name(next(model.parameters())) is None
 25    ):  # in case the model obj isn't registered to the context
 26        set_tensor_name_in_module(model, name)
 27        for p in model.parameters():
 28            context.register_param(p)
 29        for b in model.buffers():
 30            context.register_buffer(b)
 31
 32
 33def compile_fn(  # noqa: CFQ002
 34    context: Context,
 35    target_fn: Callable[
 36        [
 37            dict[str, torch.Tensor],
 38        ],
 39        dict[str, torch.Tensor],
 40    ],  # compiled fn
 41    model: torch.nn.Module,
 42    sample_input: dict[str, torch.Tensor],
 43    outdir: str = "/tmp/example_output",
 44    model_name: str = "example",
 45    option_json: Path | None = None,
 46) -> CompiledFunction:
 47
 48    if option_json is None:
 49        option_json = Path("/opt/pfn/pfcomp/codegen/preset_options/O1.json")
 50
 51    compile_options = {"option_json": str(option_json)}
 52
 53    compile_args = {
 54        "function": target_fn,
 55        "inputs": sample_input,
 56        "options": compile_options,
 57    }
 58
 59    codegen_base_dir = storage.path(outdir)
 60    compile_args["codegen_dir"] = codegen_base_dir / model_name
 61
 62    register_model(context, "model", model)
 63
 64    return context.compile(**compile_args)
 65
 66
 67# for type hint of the configs from toml
 68class TomlValue:
 69    str | int | float | bool | list["TomlValue"] | dict[str, "TomlValue"]
 70
 71
 72class TomlDict:
 73    dict[str, TomlValue]
 74
 75
 76def read_configs_from_toml(
 77    toml_path: str,
 78) -> TomlDict:
 79
 80    configs_dict = None
 81    with open(toml_path, mode="rb") as f:
 82        configs_dict = tomllib.load(f)
 83
 84    return configs_dict
 85
 86
 87def apply_toml_defaults(
 88    configs: TomlDict | str | os.PathLike,
 89    parser: argparse.ArgumentParser,
 90) -> None:
 91
 92    if isinstance(configs, dict):
 93        for k, v in configs.items():
 94            if isinstance(v, dict):  # in case v is (nested) dict
 95                apply_toml_defaults(v, parser)
 96            else:
 97                parser.add_argument(f"--{k}", default=v, type=type(v))
 98    elif isinstance(configs, str) or isinstance(configs, os.PathLike):
 99        configs_dict = read_configs_from_toml(configs)
100
101        apply_toml_defaults(configs_dict, parser)
102    else:
103        sys.exit("")
Listing 7.27 /opt/pfn/pfcomp/codegen/MLSDK/examples/detr_inference/requirements.txt
1Cython==3.1.3
2matplotlib==3.10.5
3pycocotools==2.0.10
4scipy==1.16.1
5git+https://github.com/cocodataset/panopticapi.git
6git+https://github.com/facebookresearch/detectron2.git
Listing 7.28 /opt/pfn/pfcomp/codegen/MLSDK/examples/detr_inference/configs.toml
 1title = "detr_inference"
 2
 3[model.backbone]
 4backbone = "resnet50"
 5dilation = false
 6
 7[model.transformer]
 8enc_layers      = 6
 9dec_layers      = 6
10dim_feedforward = 2048
11hidden_dim      = 256
12dropout         = 0.1
13nheads          = 8
14num_queries     = 100
15pre_norm        = false
16
17[model.loss.matcher]
18set_cost_class = 1
19set_cost_bbox  = 5
20set_cost_giou  = 2
21
22[model.loss.loss_coefficients]
23eos_coef       = 0.1
24mask_loss_coef = 1
25dice_loss_coef = 1
26bbox_loss_coef = 5
27giou_loss_coef = 2
28
29
30[data]
31max_num_segms      = 90                # max num of segments in the ground truth labels
32num_workers        = 2
33
34
35[training]
36lr             = 1e-4
37lr_backbone    = 1e-5
38weight_decay   = 1e-4
39epochs         = 300
40lr_drop        = 200   # epoch at which lr drops
41clip_max_norm  = 0.1   # gradient clipping max norm
42frozen_weights = ""    # path to the pretrained model. if set, only the mask head will be trained
43resume         = ""    # checkpoint to resume from
44masks          = true  # create DETRSegm obj
45aux_loss       = false
46
47
48[evaluation]
49num_split          = 100
50position_embedding = "sine"
51dataset_file       = "coco_panoptic"
52
53
54[misc]
55batch_size = 1