8.2.6. Example: Recommendation using NCF

MovieLens 1M Dataset を用いて Neural Collaborative Filtering（NCF）モデルの学習方法を示すサンプルプログラム

注釈

本例で使用した学習済みモデルおよび一部のソースコードは、mlcommons/training から部分的に修正または直接引用されています。これらのコンポーネントはすべて Apache License, Version 2.0 のライセンス下で提供されています。

Execution Method

初回実行時は以下のダウンロードを行います（次回以降の実行では自動的にスキップされます）。デフォルトでは、ダウンロード先は /tmp/ncf_training/ に設定されています。

$ cd /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training
$ ./preparation.sh
$ ./ncf_training.sh --option_json ../../../preset_options/debug.json

Output

評価結果は以下の形式で出力されます。Hit Rate（HR）と Normalized Discounted Cumulative Gain（NDCG）が表示されます。

HR@10 = 0.1628, NDCG@10 = 0.0808

Scripts

リスト 8.32 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/ncf_training.py

import argparse
import os
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from alias_generator import AliasSample
from mlsdk import Context, MNCoreAdam, MNCoreOptimizer, MNDevice
from ncf_eval import run_eval
from ncf_utils import (
    generate_neg_dataset,
    generate_padding,
    load_eval_data,
    load_model,
    load_train_pos_data,
    save_model,
)

# import from externals/mlcommons-ncf/recommendation/pytorch
from neumf import NeuMF
from torch.utils.data import ConcatDataset, DataLoader
from utility import apply_toml_defaults, compile_fn, set_deterministic_mode


def run_train(  # noqa: CFQ002
    args: argparse.Namespace,
    model: nn.Module,
    device: str,
    context: Context,
    optimizer: MNCoreOptimizer,
    loss_fn: nn.BCEWithLogitsLoss,
    outdir: str,
    pos_dataset: torch.utils.data.Dataset,
    neg_sampler: AliasSample,
    num_items: np.int64,
) -> None:
    """Compile a single training step and run it for ``args.epoch`` epochs.

    Every epoch draws a fresh set of negative samples with
    ``generate_neg_dataset``, mixes them with ``pos_dataset`` and feeds the
    shuffled batches to the compiled step, printing the loss of every
    iteration. ``context.synchronize()`` is called once after the last epoch.

    Args:
        args: Parsed options; reads ``train_batch_size``, ``train_neg_ratio``,
            ``epoch``, ``allow_collision_with_pos``, ``loader_num_workers``
            and ``option_json``.
        model: Network that is called as ``model(user, item)``.
        device: Not used inside this function.
        context: Context handed to ``compile_fn`` and synchronized at the end.
        optimizer: Optimizer that is zeroed and stepped inside the compiled
            function.
        loss_fn: Loss applied to the model outputs and the labels.
        outdir: Output directory forwarded to ``compile_fn``.
        pos_dataset: Positive samples; must expose ``.tensors`` as
            ``(users, items, labels)``.
        neg_sampler: Sampler forwarded to ``generate_neg_dataset``.
        num_items: Number of items, forwarded to ``generate_neg_dataset``.
    """

    # Define training functions
    def train_fx2onnx(sample_d: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        optimizer.zero_grad()

        outputs = model(sample_d["user"], sample_d["item"])
        loss = loss_fn(outputs, sample_d["label"]).float()
        # Flatten the loss values and reduce them to a single mean value
        loss = torch.mean(loss.view(-1), 0)

        loss.backward()
        optimizer.step()
        return {"result": loss}

    # Dummy input for compilation (only used as a sample of one training batch)
    samples = {
        "user": torch.randint(1, (args.train_batch_size,), dtype=torch.int64),
        "item": torch.randint(1, (args.train_batch_size,), dtype=torch.int64),
        "label": torch.rand(args.train_batch_size).view(-1, 1),
    }

    train_fn = compile_fn(
        context,
        train_fx2onnx,
        model,
        samples,
        outdir=outdir,
        model_name="ncf_train",
        is_train=True,
        optimizers=[optimizer],
        option_json=str(args.option_json),
    )

    model.train()

    # Iterations per epoch, used for the progress message only; integer
    # division matches drop_last=True of the DataLoader below
    total_iter = (
        len(pos_dataset) * (1 + args.train_neg_ratio)
    ) // args.train_batch_size

    for epoch in range(args.epoch):
        # Negative samples are regenerated at the beginning of every epoch
        pos_users, _, pos_labels = pos_dataset.tensors
        neg_dataset = generate_neg_dataset(
            pos_users,
            pos_labels.size(),
            args.train_neg_ratio,
            num_items,
            args.allow_collision_with_pos,
            neg_sampler,
        )
        dataloader = DataLoader(
            ConcatDataset([pos_dataset, neg_dataset]),
            batch_size=args.train_batch_size,
            shuffle=True,
            num_workers=args.loader_num_workers,
            drop_last=True,
        )

        for num_batch, (users, items, labels) in enumerate(dataloader):
            samples["user"] = users
            samples["item"] = items
            samples["label"] = labels.view(-1, 1)

            output = train_fn(samples)
            # Read the loss into a local first: reusing the enclosing quote
            # character inside an f-string is a syntax error before Python 3.12
            loss_value = output["result"].item()
            epoch_str = f"Epoch {epoch + 1}/{args.epoch}"
            iteration_str = f"Iteration {num_batch + 1}/{total_iter}"
            loss_str = f"Loss: {loss_value:.4}"
            print(f"{epoch_str}, {iteration_str}, {loss_str}")

    # Synchronize tensors on MN-Core 2's DRAM and PyTorch tensors
    context.synchronize()


def main(args: argparse.Namespace) -> None:  # noqa: CFQ001
    """Train a NeuMF model, optionally save it, then evaluate it.

    The steps run in this order: check ``save_path``, fix the random seeds,
    load the positive training data, build model / loss / optimizer, create
    the training and evaluation contexts, optionally restore a checkpoint,
    train, optionally save, load the evaluation data and evaluate.

    Args:
        args: Parsed command line / TOML options.

    Raises:
        ValueError: If ``save_path`` is set and its parent directory is not
            writable.
    """
    # Fail before any expensive work when the checkpoint cannot be written.
    # NOTE(review): this inspects dirname(save_path) as given, while
    # save_model() is later called with outdir and joins the two paths --
    # confirm which directory is meant to be checked.
    if args.save_path != "":
        parent_dir = os.path.dirname(args.save_path) or "."
        writable = os.access(parent_dir, os.W_OK)
        if not writable:
            raise ValueError("Parent directory of save_path is not writable.")

    # Fix seed values for reproducibility
    set_deterministic_mode(args.seed)

    # Output directory shared by training, saving and evaluation
    outdir = args.outdir

    # Location of the scaled dataset
    data_dir = f"/tmp/ncf_training/{args.dataset}"
    scaled_data_dir = (
        f"{data_dir}/{args.dataset}x{args.user_scaling}x{args.item_scaling}"
    )

    # Positive interactions for training, plus the sampler for negatives
    loaded = load_train_pos_data(
        scaled_data_dir, args.user_scaling, args.item_scaling
    )
    train_pos_dataset, num_users, num_items, neg_sampler = loaded

    # Model: every regularization coefficient is set to zero
    layer_sizes = args.mlp_layers
    model = NeuMF(
        num_users,
        num_items,
        mf_dim=args.factors,
        mf_reg=0.0,
        mlp_layer_sizes=layer_sizes,
        mlp_layer_regs=([0.0] * len(layer_sizes)),
    )

    # Element-wise loss; run_train reduces it to a mean value
    loss_fn = nn.BCEWithLogitsLoss(reduction="none")

    optimizer = MNCoreAdam(
        model.parameters(),
        lr=args.learning_rate,
        chainer_use_torch=True,
    )

    # One context for training and one for evaluation, both on the same device
    device = MNDevice(args.device)
    train_context = Context(device)
    eval_context = Context(device)
    Context.switch_context(train_context)

    # Optionally start from a pre-trained checkpoint
    if args.load_path != "":
        load_model(model, optimizer, model_path=args.load_path)

    run_train(
        args,
        model,
        device,
        train_context,
        optimizer,
        loss_fn,
        outdir,
        train_pos_dataset,
        neg_sampler,
        num_items,
    )

    # Optionally store the trained weights and the optimizer state
    if args.save_path != "":
        save_model(model, optimizer, outdir, model_path=args.save_path)

    # Evaluation data: each dataset row holds all samples of one user
    eval_dataset, samples_per_user = load_eval_data(
        scaled_data_dir,
        num_users,
        args.user_scaling,
        args.item_scaling,
        args.eval_neg_ratio,
    )

    # A batch contains whole users only; pad so that every batch is full
    users_per_eval_batch = max(args.eval_batch_size // samples_per_user, 1)
    padding = generate_padding(
        len(eval_dataset), users_per_eval_batch, samples_per_user
    )
    eval_dataloader = DataLoader(
        ConcatDataset([eval_dataset, padding]),
        batch_size=users_per_eval_batch,
        shuffle=False,
        num_workers=args.loader_num_workers,
    )

    # Switch to evaluation context
    Context.switch_context(eval_context)

    run_eval(
        args,
        model,
        device,
        eval_context,
        outdir,
        eval_dataloader,
        samples_per_user,
        num_users,
    )


if __name__ == "__main__":
    # Script entry point: build the command line parser, then run main()
    cli = argparse.ArgumentParser()

    # mlsdk options
    cli.add_argument("--device", default="mncore2:auto", type=str)
    cli.add_argument("--outdir", default="/tmp/mlsdk_ncf_training/out", type=str)
    cli.add_argument(
        "--option_json",
        default="/opt/pfn/pfcomp/codegen/preset_options/O1.json",
        type=Path,
    )

    # Every key of configs.toml (located next to this script) becomes an
    # additional option whose default is the value written in the file
    toml_file = Path(__file__).parent / "configs.toml"
    apply_toml_defaults(str(toml_file), cli)

    # Parse command line args and opts, then run training and evaluation
    main(cli.parse_args())

リスト 8.33 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/ncf_eval.py

import argparse
import math

import torch
from mlsdk import CompiledFunction, Context
from utility import compile_fn


# Measure inference accuracy as hit ratio (HR) and normalized discounted cumulative gain (NDCG)
def measure_acc(  # noqa: CFQ002
    args: argparse.Namespace,
    device: str,
    dataloader: torch.utils.data.DataLoader,
    samples_per_user: int,
    infer_fn: CompiledFunction,
    K: int,
    num_user: int,
) -> None:
    """Print HR@K and NDCG@K computed over all batches of ``dataloader``.

    Args:
        args: Not used inside this function.
        device: Not used inside this function.
        dataloader: Yields ``(user, item, dup_mask, pos_item_indices)`` batches.
        samples_per_user: Row length used to reshape the flat score vector.
        infer_fn: Callable that maps ``{"user", "item"}`` to ``{"result"}``.
        K: Number of top-scored candidates inspected per row.
        num_user: Divisor used to average the hit count and the gain.
    """
    ln2 = math.log(2)

    # Running totals over all batches
    total_hits = torch.tensor(0.0)
    total_gain = torch.tensor(0.0)

    with torch.no_grad():
        for batch in dataloader:
            user, item, dup_mask, pos_item_indices = batch

            flat_inputs = {"user": user.view(-1), "item": item.view(-1)}
            raw_scores = infer_fn(flat_inputs)["result"]
            # One row per user, one column per candidate item
            scores = raw_scores.detach().view(-1, samples_per_user)

            # Duplicate items get a score of -1 so that they drop out of the top-k
            scores[dup_mask.bool()] = -1
            top_k_indices = torch.topk(scores, K).indices

            # A "hit" is a row whose positive item index appears in its top-k
            hit_mask = top_k_indices.eq(pos_item_indices.unsqueeze(1))
            total_hits += hit_mask.sum().item()

            # Column of each hit inside the top-k list (0 = best rank);
            # its gain is log(2) / log(rank + 2)
            hit_ranks = hit_mask.nonzero()[:, 1].float()
            total_gain += (ln2 / torch.log(hit_ranks + 2)).sum()

    hit_rate = total_hits.item() / num_user
    ndcg = total_gain.item() / num_user
    print(f"HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}")


def run_eval(  # noqa: CFQ002
    args: argparse.Namespace,
    model: torch.nn.Module,
    device: str,
    context: Context,
    outdir: str,
    dataloader: torch.utils.data.DataLoader,
    samples_per_user: int,
    num_user: int,
) -> None:
    """Compile the inference function and report accuracy via ``measure_acc``.

    Args:
        args: Parsed options; reads ``option_json`` and ``topk``.
        model: Network called as ``model(user, item, sigmoid=True)``.
        device: Forwarded to ``measure_acc``.
        context: Context handed to ``compile_fn``.
        outdir: Output directory forwarded to ``compile_fn``.
        dataloader: Evaluation batches; the first one is also used as the
            sample input for compilation.
        samples_per_user: Forwarded to ``measure_acc``.
        num_user: Forwarded to ``measure_acc``.
    """

    # Define inference functions
    def infer_fx2onnx(sample_d: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        with torch.no_grad():
            scores = model(sample_d["user"], sample_d["item"], sigmoid=True)
        return {"result": scores}

    # The first two fields of the first batch serve as the compilation sample
    first_batch = next(iter(dataloader))
    user, item = first_batch[:2]
    sample = {"user": user.view(-1), "item": item.view(-1)}

    infer_fn = compile_fn(
        context,
        infer_fx2onnx,
        model,
        sample,
        outdir=outdir,
        model_name="ncf_eval",
        is_train=False,
        option_json=str(args.option_json),
    )

    model.eval()

    measure_acc(
        args,
        device,
        dataloader,
        samples_per_user,
        infer_fn,
        args.topk,
        num_user,
    )

リスト 8.34 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/ncf_utils.py

import os
import pickle

import numpy as np
import torch

# import from externals/mlcommons-ncf/recommendation/pytorch
from alias_generator import AliasSample
from convert import CACHE_FN, generate_negatives
from mlsdk import MNCoreOptimizer
from torch.utils.data import TensorDataset


def load_sampler(
    data_dir: str, user_scaling: int, item_scaling: int
) -> tuple[AliasSample, np.ndarray, np.ndarray, np.int64]:
    """Load the pickled sampler cache stored under ``data_dir``.

    The file name is ``CACHE_FN`` formatted with both scaling factors,
    followed by ``cached_sampler.pkl``.

    Returns:
        ``(sampler, pos_users, pos_items, num_items)``; the fifth element of
        the pickled tuple is ignored.

    Raises:
        ValueError: If ``data_dir`` does not exist.
    """
    cache_stem = CACHE_FN.format(user_scaling, item_scaling)
    sampler_cache = data_dir + "/" + cache_stem + "cached_sampler.pkl"

    # Only the directory is checked here; a missing file surfaces from open()
    if not os.path.exists(data_dir):
        raise ValueError(f"sampler directory does not exist: {data_dir}")

    print(f"Using alias file: {sampler_cache}")
    # NOTE(review): pickle.load executes code embedded in the file, so the
    # cache must come from a trusted source.
    with open(sampler_cache, "rb") as f:
        payload = pickle.load(f)

    sampler, pos_users, pos_items, num_items, _ = payload
    return (sampler, pos_users, pos_items, num_items)


def generate_neg_dataset(
    pos_users: torch.Tensor,
    label_size: torch.Size,
    neg_ratio: int,
    num_items: np.int64,
    allow_collision: bool,
    sampler: AliasSample,
) -> TensorDataset:
    """Build a dataset of negative ``(user, item, label)`` samples.

    Args:
        pos_users: User ids of the positive samples.
        label_size: Size of the positive label tensor.
        neg_ratio: Number of negative samples generated per positive sample.
        num_items: Exclusive upper bound of the random item ids.
        allow_collision: When true, item ids are drawn uniformly at random
            without consulting ``sampler``.
        sampler: Sampler passed to ``generate_negatives`` otherwise.

    Returns:
        Dataset whose labels are all zero.
    """
    if not allow_collision:
        # Use sampler which had been generated in convert.py
        # The sampler avoids collision of item id between positive and negative data
        pairs = torch.from_numpy(
            generate_negatives(sampler, neg_ratio, pos_users.numpy())
        )
        neg_users, neg_items = pairs[:, 0], pairs[:, 1]
    else:
        # Uniform random item ids in [0, num_items) for each repeated user
        neg_users = pos_users.repeat(neg_ratio)
        item_buffer = torch.empty_like(neg_users, dtype=torch.int64)
        neg_items = item_buffer.random_(0, int(num_items))

    zero_labels = torch.zeros(label_size, dtype=torch.float32)
    return TensorDataset(neg_users, neg_items, zero_labels.repeat(neg_ratio))


def load_train_pos_data(
    data_dir: str, user_scaling: int, item_scaling: int
) -> tuple[TensorDataset, int, np.int64, AliasSample]:
    """Load the positive training samples and wrap them in a dataset.

    Returns:
        ``(dataset, num_users, num_items, sampler)`` where ``dataset`` holds
        ``(users, items, labels)`` with every label set to one and
        ``num_users`` is ``len(sampler.num_regions)``.
    """
    sampler, raw_users, raw_items, num_items = load_sampler(
        data_dir, user_scaling, item_scaling
    )
    num_users = len(sampler.num_regions)

    # Ids are stored as int64 tensors, labels as float32
    user_ids = torch.from_numpy(raw_users).long()
    item_ids = torch.from_numpy(raw_items).long()
    ones = torch.ones_like(user_ids, dtype=torch.float32)

    return TensorDataset(user_ids, item_ids, ones), num_users, num_items, sampler


def load_eval_data(
    data_dir: str, num_users: int, user_scaling: int, item_scaling: int, neg_ratio: int
) -> tuple[TensorDataset, int]:
    """Load the evaluation items and build one dataset row per user.

    For every chunk id in ``range(user_scaling)`` a ``test`` file (positive
    items) and a ``test_neg`` file (negative items) are read from
    ``data_dir``; column 1 of ``arr_0`` is used as the item id.

    Returns:
        ``(dataset, samples_per_user)``. The dataset holds
        ``(users, items, dup_mask, pos_item_indices)``; each row of ``items``
        is the negative items followed by the positive item.
    """
    # Positive items: one column per row
    positives = [
        torch.from_numpy(
            np.load(
                f"{data_dir}/testx{user_scaling}x{item_scaling}_{chunk_id}.npz",
                encoding="bytes",
            )["arr_0"]
        )[:, 1].reshape(-1, 1)
        for chunk_id in range(user_scaling)
    ]

    # Negative items: neg_ratio columns per row
    negatives = [
        torch.from_numpy(
            np.load(
                f"{data_dir}/test_negx{user_scaling}x{item_scaling}_{chunk_id}.npz",
                encoding="bytes",
            )["arr_0"]
        )[:, 1].reshape(-1, neg_ratio)
        for chunk_id in range(user_scaling)
    ]

    item_parts = []
    mask_parts = []
    index_parts = []
    for neg_block, pos_block in zip(negatives, positives):
        candidates = torch.cat((neg_block, pos_block), dim=1)
        item_parts.append(candidates)

        # Index of the first column that equals the positive item
        matches = (candidates == pos_block).long()
        index_parts.append(torch.argmax(matches, dim=1))

        # Mark every repeated occurrence of an item: after a stable sort a
        # duplicate is an entry equal to its left neighbour, so the earliest
        # occurrence in the original order stays unmarked
        order = torch.argsort(candidates, dim=1, stable=True)
        ranked = torch.gather(candidates, 1, order)
        repeats = ranked[:, 1:] == ranked[:, :-1]
        first_column = torch.zeros(repeats.shape[0], 1, dtype=torch.bool)
        mask_sorted = torch.cat([first_column, repeats], dim=1)

        # Unsort the mask back to the original item order
        restore = torch.argsort(order, dim=1)
        mask_parts.append(torch.gather(mask_sorted, 1, restore))

    # Concatenate all chunks into final Tensors
    items = torch.cat(item_parts, dim=0).long()
    dup_mask = torch.cat(mask_parts, dim=0)
    pos_item_indices = torch.cat(index_parts, dim=0)

    # Row i carries user id i, repeated once per candidate item
    samples_per_user = items.size(1)
    users = (
        torch.arange(num_users, dtype=torch.long)
        .unsqueeze(1)
        .repeat(1, items.shape[1])
    )

    return TensorDataset(users, items, dup_mask, pos_item_indices), samples_per_user


def generate_padding(
    data_len: int, users_per_batch: int, samples_per_user: int
) -> TensorDataset:
    """Create dummy rows that fill ``data_len`` up to a multiple of ``users_per_batch``.

    Returns:
        Dataset of ``(users, items, dup_mask, pos_item_indices)`` whose rows
        are all zeros / ``False``, with a positive item index of -1. It is
        empty when ``data_len`` is already a multiple of ``users_per_batch``.
    """
    leftover = data_len % users_per_batch
    pad_rows = 0
    if leftover > 0:
        pad_rows = users_per_batch - leftover

    block_shape = (pad_rows, samples_per_user)
    dummy_users = torch.zeros(block_shape, dtype=torch.long)
    dummy_items = torch.zeros(block_shape, dtype=torch.long)
    dummy_dup_mask = torch.zeros(block_shape, dtype=torch.bool)
    # -1 is not a valid column index, so a dummy row cannot match a top-k index
    no_positive = torch.full((pad_rows,), -1, dtype=torch.long)

    return TensorDataset(dummy_users, dummy_items, dummy_dup_mask, no_positive)


def load_model(
    model: torch.nn.Module,
    optimizer: MNCoreOptimizer | None = None,
    model_path: str | os.PathLike = "./ncf_model.pth",
) -> None:
    """Restore ``model`` (and ``optimizer``, when given) from a checkpoint.

    The checkpoint is read with ``weights_only=True`` and must contain a
    ``"model"`` entry; an ``"optimizer"`` entry is read only when
    ``optimizer`` is not ``None``.
    """
    print(f"loading model from {model_path}")
    checkpoint = torch.load(model_path, weights_only=True)

    model.load_state_dict(checkpoint["model"])
    if optimizer is None:
        return
    optimizer.load_state_dict(checkpoint["optimizer"])


def save_model(
    model: torch.nn.Module,
    optimizer: MNCoreOptimizer,
    outdir: str,
    model_path: str | os.PathLike = "ncf_model.pth",
) -> None:
    """Save the model and optimizer state dicts to ``outdir``/``model_path``.

    Args:
        model: Module whose ``state_dict()`` is stored under ``"model"``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``"optimizer"``.
        outdir: Directory that ``model_path`` is joined to.
        model_path: File name or path of the checkpoint. Because the two are
            combined with ``os.path.join``, an absolute ``model_path`` is
            used as-is.
    """
    target_path = os.path.join(outdir, model_path)

    # Create the destination directory so that a missing folder does not
    # throw away the result of a finished training run
    os.makedirs(os.path.dirname(target_path) or ".", exist_ok=True)

    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        target_path,
    )

    # Report the path that was actually written, including outdir
    print(f"model saved to {target_path}")

リスト 8.35 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/utility.py

import argparse
import os
import pathlib
import random
import sys
from collections.abc import Callable
from typing import Any

import numpy as np
import tomllib
import torch
from fx2onnx import set_tensor_name
from mlsdk import (
    CacheOptions,
    CompiledFunction,
    Context,
    MNCoreOptimizer,
    get_tensor_name,
    set_buffer_name_in_optimizer,
    set_tensor_name_in_module,
    storage,
)


def set_deterministic_mode(seed: int) -> None:
    """Seed every random number generator in use and request deterministic execution."""
    # Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Turn off cuDNN benchmark mode and require deterministic algorithms
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)


def register_model(
    context: Context,
    name: str,
    model: torch.nn.Module,
) -> None:
    """Name the tensors of ``model`` and register them with ``context``.

    Nothing happens when the first parameter of ``model`` already has a
    tensor name, which is taken to mean that the model was registered before.
    """
    first_param = next(model.parameters())
    if get_tensor_name(first_param) is not None:
        return

    # The model object is not registered to the context yet
    set_tensor_name_in_module(model, name)
    for param in model.parameters():
        context.register_param(param)
    for buffer in model.buffers():
        context.register_buffer(buffer)


def compile_fn(  # noqa: CFQ002
    context: Context,
    target_fn: Callable[
        [
            dict[str, torch.Tensor],
        ],
        dict[str, torch.Tensor],
    ],  # compiled fn
    model: torch.nn.Module | dict[str, torch.nn.Module],
    sample_input: dict[str, torch.Tensor],
    outdir: str = "/tmp/example_output",
    model_name: str = "example",
    is_train: bool = True,
    optimizers: (
        list[MNCoreOptimizer] | None
    ) = None,  # list[] is for multiple optimizers
    option_json: str = "/opt/pfn/pfcomp/codegen/preset_options/O1.json",
    preset_options_dir: str | None = None,
    enable_cache: bool = False,
    **kwargs: Any,  # used in `compile_args` in Context.compile()
) -> CompiledFunction:
    """Register the model (and training state) with ``context`` and compile ``target_fn``.

    Args:
        context: Context that receives the registrations and performs the
            compilation.
        target_fn: Function to compile; takes and returns a dict of tensors.
        model: A single module, or a dict mapping names to modules.
        sample_input: Example input passed to ``Context.compile`` as ``inputs``.
        outdir: Base directory; generated code goes to ``outdir/model_name``.
        model_name: Name used for the tensors of a single module and for the
            codegen sub-directory.
        is_train: When true, gradients or optimizer buffers are registered too.
        optimizers: Optimizers whose buffers are registered. When ``None`` in
            training mode, a zero-filled gradient is created and registered
            for every parameter instead.
        option_json: Path stored in the compile options as ``"option_json"``.
        preset_options_dir: Resolved to a default below but not otherwise
            used in this function.
        enable_cache: When true, ``CacheOptions`` with all caches enabled are
            passed, rooted at ``outdir/model_name/cache``.
        **kwargs: Extra arguments for ``Context.compile``; merged last, so
            they override the entries prepared here.

    Returns:
        The object returned by ``context.compile``.
    """

    # NOTE(review): preset_options_dir is assigned here but never read
    # afterwards in this function -- confirm whether it can be dropped.
    if preset_options_dir is None:
        preset_options_dir = pathlib.Path.cwd().parent.parent.parent / "preset_options"

    compile_options = {"option_json": option_json}

    compile_args = {
        "function": target_fn,
        "inputs": sample_input,
        "options": compile_options,
    }

    # Generated code is written to <outdir>/<model_name>
    codegen_base_dir = storage.path(outdir)
    compile_args["codegen_dir"] = codegen_base_dir / model_name

    if enable_cache:
        compile_args["cache_options"] = CacheOptions(
            f"{outdir}/{model_name}/cache",
            enable_app_cache=True,
            enable_onnx_cache=True,
            enable_codegen_cache=True,
            enable_gpfn2obj_cache=True,
        )

    # Register parameters and buffers of every model (skipped by
    # register_model for models that already carry tensor names)
    if isinstance(model, torch.nn.Module):
        register_model(context, model_name, model)
    else:  # if isinstance(models, dict[str, torch.nn.Module]):
        for name, actual_model in model.items():
            register_model(context, name, actual_model)

    if is_train:
        if optimizers is None:  # in case that optimizer.step() will be done at the host
            # Give every parameter a zero-filled gradient tensor, name it and
            # register it with the context
            if isinstance(model, torch.nn.Module):
                for n, p in model.named_parameters():
                    p.grad = torch.nn.Parameter(
                        torch.zeros_like(p), requires_grad=p.requires_grad
                    )
                    set_tensor_name(p.grad, f"{model_name}@{n}@grad".replace(".", "_"))
                    context.register_param(p.grad)
            else:
                for name, actual_model in model.items():
                    for n, p in actual_model.named_parameters():
                        p.grad = torch.nn.Parameter(
                            torch.zeros_like(p), requires_grad=p.requires_grad
                        )
                        # NOTE(review): unlike the single-module branch above,
                        # this name has no "@grad" suffix -- confirm whether
                        # the difference is intended.
                        set_tensor_name(p.grad, f"{name}@{n}".replace(".", "_"))
                        context.register_param(p.grad)
        else:
            # Optimizer buffers are named "optimizer<index>" and registered
            for idx, optimizer in enumerate(optimizers):
                optimizer_name = "optimizer" + str(idx)
                set_buffer_name_in_optimizer(optimizer, optimizer_name)
                context.register_optimizer_buffers(optimizer)

    # Caller-supplied keyword arguments take precedence over the defaults above
    compile_args.update(kwargs)

    return context.compile(**compile_args)


# for type hint of the configs from toml
class TomlValue:
    str | int | float | bool | list["TomlValue"] | dict[str, "TomlValue"]


class TomlDict:
    dict[str, TomlValue]


def read_configs_from_toml(
    toml_path: str,
) -> TomlDict:

    configs_dict = None
    with open(toml_path, mode="rb") as f:
        configs_dict = tomllib.load(f)

    return configs_dict


def str2bool(v: bool | str) -> bool:
    if v.lower() in ("yes", "true", "on", "enable", "y", "t", "1"):
        return True
    elif v.lower() in ("no", "false", "off", "disable", "n", "f", "0"):
        return False
    elif isinstance(v, str | bool):
        return v
    else:
        raise argparse.ArgumentTypeError("Str or boolean value expected")


def apply_toml_defaults(
    configs: TomlDict | str | os.PathLike,
    parser: argparse.ArgumentParser,
) -> None:

    if isinstance(configs, dict):
        for k, v in configs.items():
            if isinstance(v, dict):  # in case v is (nested) dict
                apply_toml_defaults(v, parser)
            else:
                # just checking whether v is list is enough for array args
                # because array in toml is converted to the list by tomllib
                args_type = None
                if isinstance(v, list):
                    args_type = type(v[0])
                elif isinstance(v, bool):
                    args_type = str2bool
                else:
                    args_type = type(v)
                parser.add_argument(
                    f"--{k}",
                    default=v,
                    type=args_type,
                    nargs="*" if isinstance(v, list) else "?",
                )
    elif isinstance(configs, str | os.PathLike):
        configs_dict = read_configs_from_toml(configs)

        apply_toml_defaults(configs_dict, parser)
    else:
        sys.exit("")

リスト 8.36 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/requirements_datagen.txt

absl-py==0.7.0
numpy==1.16.2
pandas==0.24.2
protobuf==3.19.6
scikit-image==0.14.2
scikit-learn==0.20.3
scipy==1.2.1
six==1.12.0
tensorflow==1.13.1

リスト 8.37 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/requirements_training.txt

scipy==1.16.0
torch==2.9.0
numpy==2.3.0
numpy_indexed==0.3.7
pandas==2.3.0
mlperf_compliance==0.0.10

リスト 8.38 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/configs.toml

title = "ncf_training"


[model]
factors    = 64
mlp_layers = [256, 256, 128, 64]
save_path  = "./ncf_model.pth"
load_path  = ""
model_path = "./ncf_model.pth"


[dataset]
dataset                  = "ml-1m"
user_scaling             = 1       # this value must be the same as specified in preparation.sh
item_scaling             = 1       # this value must be the same as specified in preparation.sh
train_neg_ratio          = 4
eval_neg_ratio           = 999
allow_collision_with_pos = false
loader_num_workers       = 2


[training]
epoch         = 20
learning_rate = 0.0002


[evaluation]
topk = 10


[misc]
seed             = 0
train_batch_size = 65536 # 2**16
eval_batch_size  = 16384 # 2**14