8.2.7. Example: Recommendation using NCF

Sample program demonstrating training of the Neural Collaborative Filtering (NCF) model on the MovieLens 1M Dataset

Note

The trained model and some source code used in this example have been partially modified or directly sourced from mlcommons/training. All of these components are licensed under the Apache License, Version 2.0.

Execution Method

The first execution performs the following downloads (subsequent runs will skip these steps). By default, the download location is /tmp/ncf_training/.

$ cd /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training
$ ./preparation.sh
$ ./ncf_training.sh --option_json ../../../preset_options/debug.json

Output

The evaluation results are output in the following format. They show the hit rate (HR) and the normalized discounted cumulative gain (NDCG).

HR@10 = 0.1628, NDCG@10 = 0.0808

Scripts

Listing 8.34 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/ncf_training.py
  1import argparse
  2import os
  3from pathlib import Path
  4
  5import numpy as np
  6import torch
  7import torch.nn as nn
  8from alias_generator import AliasSample
  9from mlsdk import Context, MNCoreAdam, MNCoreOptimizer, MNDevice
 10from ncf_eval import run_eval
 11from ncf_utils import (
 12    generate_neg_dataset,
 13    generate_padding,
 14    load_eval_data,
 15    load_model,
 16    load_train_pos_data,
 17    save_model,
 18)
 19
 20# import from externals/mlcommons-ncf/recommendation/pytorch
 21from neumf import NeuMF
 22from torch.utils.data import ConcatDataset, DataLoader
 23from utility import apply_toml_defaults, compile_fn, set_deterministic_mode
 24
 25
 26def run_train(  # noqa: CFQ002
 27    args: argparse.Namespace,
 28    model: nn.Module,
 29    device: str,
 30    context: Context,
 31    optimizer: MNCoreOptimizer,
 32    loss_fn: nn.BCEWithLogitsLoss,
 33    outdir: str,
 34    pos_dataset: torch.utils.data.Dataset,
 35    neg_sampler: AliasSample,
 36    num_items: np.int64,
 37) -> None:
 38    # Define training functions
 39    def train_fx2onnx(sample_d: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
 40        optimizer.zero_grad()
 41
 42        outputs = model(sample_d["user"], sample_d["item"])
 43        loss = loss_fn(outputs, sample_d["label"]).float()
 44        loss = torch.mean(loss.view(-1), 0)
 45
 46        loss.backward()
 47        optimizer.step()
 48        return {"result": loss}
 49
 50    # Dummy input for compilation
 51    samples = {
 52        "user": torch.randint(1, (args.train_batch_size,), dtype=torch.int64),
 53        "item": torch.randint(1, (args.train_batch_size,), dtype=torch.int64),
 54        "label": torch.rand(args.train_batch_size).view(-1, 1),
 55    }
 56
 57    train_fn = compile_fn(
 58        context,
 59        train_fx2onnx,
 60        model,
 61        samples,
 62        outdir=outdir,
 63        model_name="ncf_train",
 64        is_train=True,
 65        optimizers=[optimizer],
 66        option_json=str(args.option_json),
 67    )
 68
 69    model.train()
 70
 71    total_iter = (
 72        len(pos_dataset) * (1 + args.train_neg_ratio)
 73    ) // args.train_batch_size
 74
 75    for epoch in range(args.epoch):
 76        pos_users, _, pos_labels = pos_dataset.tensors
 77        neg_dataset = generate_neg_dataset(
 78            pos_users,
 79            pos_labels.size(),
 80            args.train_neg_ratio,
 81            num_items,
 82            args.allow_collision_with_pos,
 83            neg_sampler,
 84        )
 85        dataloader = DataLoader(
 86            ConcatDataset([pos_dataset, neg_dataset]),
 87            batch_size=args.train_batch_size,
 88            shuffle=True,
 89            num_workers=args.loader_num_workers,
 90            drop_last=True,
 91        )
 92
 93        for num_batch, (users, items, labels) in enumerate(dataloader):
 94            samples["user"] = users
 95            samples["item"] = items
 96            samples["label"] = labels.view(-1, 1)
 97
 98            output = train_fn(samples)
 99            epoch_str = f"Epoch {epoch + 1}/{args.epoch}"
100            iteration_str = f"Iteration {num_batch + 1}/{total_iter}"
101            loss_str = f"Loss: {output["result"].item():.4}"
102            print(f"{epoch_str}, {iteration_str}, {loss_str}")
103
104    # Synchronize tensors on MN-Core 2"s DRAM and PyTorch tensors
105    context.synchronize()
106
107
def main(args: argparse.Namespace) -> None:  # noqa: CFQ001
    """Train the NCF model, optionally save a checkpoint, then evaluate HR/NDCG."""

    # Fail fast if the checkpoint destination cannot be written.
    if args.save_path != "":
        parent_dir = os.path.dirname(args.save_path) or "."
        if not os.access(parent_dir, os.W_OK):
            raise ValueError("Parent directory of save_path is not writable.")

    # Fix seed values for reproducibility
    set_deterministic_mode(args.seed)

    # Load positive training data and the negative sampler
    data_dir = f"/tmp/ncf_training/{args.dataset}"
    scaled_data_dir = (
        f"{data_dir}/{args.dataset}x{args.user_scaling}x{args.item_scaling}"
    )
    pos_dataset, num_users, num_items, neg_sampler = load_train_pos_data(
        scaled_data_dir, args.user_scaling, args.item_scaling
    )

    # Build the NeuMF model (no regularization on MF/MLP weights)
    model = NeuMF(
        num_users,
        num_items,
        mf_dim=args.factors,
        mf_reg=0.0,
        mlp_layer_sizes=args.mlp_layers,
        mlp_layer_regs=([0.0] * len(args.mlp_layers)),
    )

    # Per-sample BCE loss; the mean is taken inside the training step
    loss_fn = nn.BCEWithLogitsLoss(reduction="none")

    optimizer = MNCoreAdam(
        model.parameters(),
        lr=args.learning_rate,
        chainer_use_torch=True,
    )

    # One context per phase; start in the training context
    mn_device = MNDevice(args.device)
    train_context = Context(mn_device)
    eval_context = Context(mn_device)
    Context.switch_context(train_context)

    # Optionally resume from a pre-trained checkpoint
    if args.load_path != "":
        load_model(model, optimizer, model_path=args.load_path)

    run_train(
        args,
        model,
        mn_device,
        train_context,
        optimizer,
        loss_fn,
        args.outdir,
        pos_dataset,
        neg_sampler,
        num_items,
    )

    # Save trained model
    if args.save_path != "":
        save_model(model, optimizer, args.outdir, model_path=args.save_path)

    # Build the evaluation dataloader, padding so every batch is full
    eval_dataset, samples_per_user = load_eval_data(
        scaled_data_dir,
        num_users,
        args.user_scaling,
        args.item_scaling,
        args.eval_neg_ratio,
    )
    users_per_eval_batch = max(args.eval_batch_size // samples_per_user, 1)
    eval_dataset = ConcatDataset(
        [
            eval_dataset,
            generate_padding(len(eval_dataset), users_per_eval_batch, samples_per_user),
        ]
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        batch_size=users_per_eval_batch,
        shuffle=False,
        num_workers=args.loader_num_workers,
    )

    # Switch to evaluation context
    Context.switch_context(eval_context)

    run_eval(
        args,
        model,
        mn_device,
        eval_context,
        args.outdir,
        eval_dataloader,
        samples_per_user,
        num_users,
    )
215
216
if __name__ == "__main__":
    # mlsdk options are defined here; model/dataset/training defaults
    # come from configs.toml via apply_toml_defaults.
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument("--device", type=str, default="mncore2:auto")
    arg_parser.add_argument(
        "--outdir", type=str, default="/tmp/mlsdk_ncf_training/out"
    )
    arg_parser.add_argument(
        "--option_json",
        type=Path,
        default="/opt/pfn/pfcomp/codegen/preset_options/O1.json",
    )

    apply_toml_defaults(str(Path(__file__).parent / "configs.toml"), arg_parser)

    # Parse command line args and opts, then run
    main(arg_parser.parse_args())
Listing 8.35 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/ncf_eval.py
 1import argparse
 2import math
 3
 4import torch
 5from mlsdk import CompiledFunction, Context
 6from utility import compile_fn
 7
 8
 9# Measure inference accuracy as hit ratio (HR) and normalized discounted cumulative gain (NDCG)
def measure_acc(  # noqa: CFQ002
    args: argparse.Namespace,
    device: str,
    dataloader: torch.utils.data.DataLoader,
    samples_per_user: int,
    infer_fn: CompiledFunction,
    K: int,
    num_user: int,
) -> None:
    """Compute and print HR@K and NDCG@K over ``dataloader`` using ``infer_fn``."""
    ln2 = math.log(2)

    total_hits = torch.tensor(0.0)
    total_ndcg = torch.tensor(0.0)

    with torch.no_grad():
        for user, item, dup_mask, pos_item_indices in dataloader:
            batch = {
                "user": user.view(-1),
                "item": item.view(-1),
            }

            scores = infer_fn(batch)["result"].detach().view(-1, samples_per_user)

            # Duplicate items must never enter the top-k: force their score to -1
            scores[dup_mask.bool()] = -1
            top_k_indices = torch.topk(scores, K)[1]

            # A "hit" means the positive item made it into the top-k
            hit_mask = top_k_indices == pos_item_indices.unsqueeze(1)
            total_hits += hit_mask.sum().item()

            # NDCG credits hits by rank: log(2)/log(rank + 2) == 1/log2(rank + 2)
            hit_ranks = torch.nonzero(hit_mask)[:, 1].view(-1).to(torch.float)
            total_ndcg += (ln2 / (hit_ranks + 2).log_()).sum()

    hit_rate = total_hits.item() / num_user
    ndcg = total_ndcg.item() / num_user
    print(f"HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}")
48
49
def run_eval(  # noqa: CFQ002
    args: argparse.Namespace,
    model: torch.nn.Module,
    device: str,
    context: Context,
    outdir: str,
    dataloader: torch.utils.data.DataLoader,
    samples_per_user: int,
    num_user: int,
) -> None:
    """Compile the NCF inference step and report HR/NDCG on ``dataloader``."""

    # Inference step traced/compiled by MLSDK (sigmoid applied in the graph)
    def infer_fx2onnx(sample_d: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        with torch.no_grad():
            result = model(sample_d["user"], sample_d["item"], sigmoid=True)
        return {"result": result}

    # The first batch serves as the sample input for compilation
    first_user, first_item = next(iter(dataloader))[:2]
    sample = {
        "user": first_user.view(-1),
        "item": first_item.view(-1),
    }

    infer_fn = compile_fn(
        context,
        infer_fx2onnx,
        model,
        sample,
        outdir=outdir,
        model_name="ncf_eval",
        is_train=False,
        option_json=str(args.option_json),
    )

    model.eval()

    measure_acc(
        args, device, dataloader, samples_per_user, infer_fn, args.topk, num_user
    )
Listing 8.36 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/ncf_utils.py
  1import os
  2import pickle
  3
  4import numpy as np
  5import torch
  6
  7# import from externals/mlcommons-ncf/recommendation/pytorch
  8from alias_generator import AliasSample
  9from convert import CACHE_FN, generate_negatives
 10from mlsdk import MNCoreOptimizer
 11from torch.utils.data import TensorDataset
 12
 13
 14def load_sampler(
 15    data_dir: str, user_scaling: int, item_scaling: int
 16) -> tuple[AliasSample, np.ndarray, np.ndarray, np.int64]:
 17    fn_prefix = data_dir + "/" + CACHE_FN.format(user_scaling, item_scaling)
 18    sampler_cache = fn_prefix + "cached_sampler.pkl"
 19
 20    if os.path.exists(data_dir):
 21        print(f"Using alias file: {sampler_cache}")
 22        with open(sampler_cache, "rb") as f:
 23            sampler, pos_users, pos_items, num_items, _ = pickle.load(f)
 24    else:
 25        raise ValueError(f"sampler directory does not exist: {data_dir}")
 26
 27    return (sampler, pos_users, pos_items, num_items)
 28
 29
 30def generate_neg_dataset(
 31    pos_users: torch.Tensor,
 32    label_size: torch.Size,
 33    neg_ratio: int,
 34    num_items: np.int64,
 35    allow_collision: bool,
 36    sampler: AliasSample,
 37) -> TensorDataset:
 38    if allow_collision:
 39        neg_users = pos_users.repeat(neg_ratio)
 40        neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(
 41            0, int(num_items)
 42        )
 43    else:
 44        # Use sampler which had been generated in convert.py
 45        # The sampler avoids collision of item id between positive and negative data
 46        negatives = generate_negatives(sampler, neg_ratio, pos_users.numpy())
 47        negatives = torch.from_numpy(negatives)
 48        neg_users = negatives[:, 0]
 49        neg_items = negatives[:, 1]
 50    neg_labels = torch.zeros(label_size, dtype=torch.float32).repeat(neg_ratio)
 51
 52    return TensorDataset(neg_users, neg_items, neg_labels)
 53
 54
 55def load_train_pos_data(
 56    data_dir: str, user_scaling: int, item_scaling: int
 57) -> tuple[TensorDataset, int, np.int64, AliasSample]:
 58    sampler, pos_users, pos_items, num_items = load_sampler(
 59        data_dir, user_scaling, item_scaling
 60    )
 61
 62    num_users = len(sampler.num_regions)
 63    pos_users = torch.from_numpy(pos_users).type(torch.LongTensor)
 64    pos_items = torch.from_numpy(pos_items).type(torch.LongTensor)
 65    pos_labels = torch.ones_like(pos_users, dtype=torch.float32)
 66    dataset = TensorDataset(pos_users, pos_items, pos_labels)
 67
 68    return dataset, num_users, num_items, sampler
 69
 70
def load_eval_data(
    data_dir: str, num_users: int, user_scaling: int, item_scaling: int, neg_ratio: int
) -> tuple[TensorDataset, int]:
    """Build the evaluation dataset from pre-generated test chunk files.

    Each user row holds ``neg_ratio`` negative items followed by one positive
    item, plus a mask marking duplicate items and the column index of the
    positive item.

    Returns:
        (dataset, samples_per_user) where samples_per_user == neg_ratio + 1.
    """
    # Load positive items
    pos_item_chunks = []
    for chunk_id in range(user_scaling):
        pos_ratings = torch.from_numpy(
            np.load(
                f"{data_dir}/testx{user_scaling}x{item_scaling}_{chunk_id}.npz",
                encoding="bytes",
            )["arr_0"]
        )
        # column 1 holds the item id; one positive item per user row
        pos_item_chunks.append(pos_ratings[:, 1].reshape(-1, 1))

    # Load negative items
    neg_item_chunks = []
    for chunk_id in range(user_scaling):
        neg_ratings = torch.from_numpy(
            np.load(
                f"{data_dir}/test_negx{user_scaling}x{item_scaling}_{chunk_id}.npz",
                encoding="bytes",
            )["arr_0"]
        )
        neg_item_chunks.append(neg_ratings[:, 1].reshape(-1, neg_ratio))

    # Concat positive and negative items
    item_chunks = [
        torch.cat((negs, poses), dim=1)
        for negs, poses in zip(neg_item_chunks, pos_item_chunks)
    ]

    # Get indices of positive items in concatenated items
    pos_item_index_chunks = []
    for items, pos_items in zip(item_chunks, pos_item_chunks):
        is_positive_mask = items == pos_items
        # argmax over the long mask returns the first matching column
        pos_item_index_chunks.append(torch.argmax(is_positive_mask.long(), dim=1))

    # Create a mask to identify duplicate items to avoid them during evaluation
    dup_mask_chunks = []
    for items in item_chunks:
        # stable sort keeps equal items in their original relative order,
        # so the first occurrence of each item stays unmasked below
        stable_indices = torch.argsort(items, dim=1, stable=True)
        sorted_items = torch.gather(items, 1, stable_indices)

        # adjacent equal values in the sorted row are duplicates; the
        # prepended zero column leaves the first occurrence unmasked
        is_duplicate_sorted = sorted_items[:, 1:] == sorted_items[:, :-1]
        dup_mask_sorted = torch.cat(
            [
                torch.zeros(is_duplicate_sorted.shape[0], 1, dtype=torch.bool),
                is_duplicate_sorted,
            ],
            dim=1,
        )

        # Unsort the mask back to the original item order
        inverse_indices = torch.argsort(stable_indices, dim=1)
        dup_mask = torch.gather(dup_mask_sorted, 1, inverse_indices)
        dup_mask_chunks.append(dup_mask)

    # Concatenate all chunks into final Tensors
    items = torch.cat(item_chunks, dim=0).long()
    dup_mask = torch.cat(dup_mask_chunks, dim=0)
    pos_item_indices = torch.cat(pos_item_index_chunks, dim=0)

    # Replicate each user ID for the number of item samples they have
    users = torch.arange(num_users, dtype=torch.long).unsqueeze(1)
    users = users.repeat(1, items.shape[1])

    dataset = TensorDataset(users, items, dup_mask, pos_item_indices)
    samples_per_user = items.size(1)

    return dataset, samples_per_user
141
142
def generate_padding(
    data_len: int, users_per_batch: int, samples_per_user: int
) -> TensorDataset:
    """Dummy rows that pad the eval dataset to a multiple of ``users_per_batch``.

    Padding rows use positive-item index -1, so they can never count as hits.
    """
    shortfall = (-data_len) % users_per_batch  # 0 when already divisible

    pad_users = torch.zeros(shortfall, samples_per_user, dtype=torch.long)
    pad_items = torch.zeros(shortfall, samples_per_user, dtype=torch.long)
    pad_dup_mask = torch.zeros(shortfall, samples_per_user, dtype=torch.bool)
    pad_pos_indices = torch.full((shortfall,), -1, dtype=torch.long)

    return TensorDataset(pad_users, pad_items, pad_dup_mask, pad_pos_indices)
157
158
def load_model(
    model: torch.nn.Module,
    optimizer: MNCoreOptimizer | None = None,
    model_path: str | os.PathLike = "./ncf_model.pth",
) -> None:
    """Restore model (and optionally optimizer) state from ``model_path``."""
    print(f"loading model from {model_path}")
    # weights_only=True avoids executing arbitrary pickled code
    checkpoint = torch.load(model_path, weights_only=True)

    model.load_state_dict(checkpoint["model"])
    if optimizer is None:
        return
    optimizer.load_state_dict(checkpoint["optimizer"])
170
171
def save_model(
    model: torch.nn.Module,
    optimizer: MNCoreOptimizer,
    outdir: str,
    model_path: str | os.PathLike = "ncf_model.pth",
) -> None:
    """Save model and optimizer state dicts under ``outdir``.

    ``model_path`` is joined onto ``outdir`` (an absolute ``model_path``
    overrides ``outdir`` per ``os.path.join`` semantics).
    """
    save_path = os.path.join(outdir, model_path)
    torch.save(
        {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        save_path,
    )

    # Report the actual destination; previously only the relative
    # model_path was printed, hiding the outdir prefix.
    print(f"model saved to {save_path}")
Listing 8.37 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/utility.py
  1import argparse
  2import os
  3import pathlib
  4import random
  5import sys
  6from collections.abc import Callable
  7from typing import Any
  8
  9import numpy as np
 10import tomllib
 11import torch
 12from fx2onnx import set_tensor_name
 13from mlsdk import (
 14    CacheOptions,
 15    CompiledFunction,
 16    Context,
 17    MNCoreOptimizer,
 18    get_tensor_name,
 19    set_buffer_name_in_optimizer,
 20    set_tensor_name_in_module,
 21    storage,
 22)
 23
 24
 25def set_deterministic_mode(seed: int) -> None:
 26    # Set seed
 27    random.seed(seed)
 28    np.random.seed(seed)
 29    torch.manual_seed(seed)
 30    torch.cuda.manual_seed(seed)
 31
 32    # Set cudnn.benchmark mode and specify the use of deterministic algorithms
 33    torch.backends.cudnn.benchmark = False
 34    torch.use_deterministic_algorithms(True)
 35
 36
 37def register_model(
 38    context: Context,
 39    name: str,
 40    model: torch.nn.Module,
 41) -> None:
 42    if (
 43        get_tensor_name(next(model.parameters())) is None
 44    ):  # in case the model obj isn't registered to the context
 45        set_tensor_name_in_module(model, name)
 46        for p in model.parameters():
 47            context.register_param(p)
 48        for b in model.buffers():
 49            context.register_buffer(b)
 50
 51
def compile_fn(  # noqa: CFQ002
    context: Context,
    target_fn: Callable[
        [
            dict[str, torch.Tensor],
        ],
        dict[str, torch.Tensor],
    ],  # compiled fn
    model: torch.nn.Module | dict[str, torch.nn.Module],
    sample_input: dict[str, torch.Tensor],
    outdir: str = "/tmp/example_output",
    model_name: str = "example",
    is_train: bool = True,
    optimizers: (
        list[MNCoreOptimizer] | None
    ) = None,  # list[] is for multiple optimizers
    option_json: str = "/opt/pfn/pfcomp/codegen/preset_options/O1.json",
    preset_options_dir: str | None = None,
    enable_cache: bool = False,
    **kwargs: Any,  # used in `compile_args` in Context.compile()
) -> CompiledFunction:
    """Register the model(s) with ``context`` and compile ``target_fn``.

    When training without MN-Core optimizers, gradient tensors are created
    and registered so the optimizer step can run on the host; otherwise the
    optimizer buffers themselves are registered.

    Returns:
        The ``CompiledFunction`` produced by ``Context.compile``.
    """

    # NOTE(review): ``preset_options_dir`` is computed here but never read
    # below — presumably a leftover; confirm before removing.
    if preset_options_dir is None:
        preset_options_dir = pathlib.Path.cwd().parent.parent.parent / "preset_options"

    compile_options = {"option_json": option_json}

    compile_args = {
        "function": target_fn,
        "inputs": sample_input,
        "options": compile_options,
    }

    # Generated code is written to <outdir>/<model_name>
    codegen_base_dir = storage.path(outdir)
    compile_args["codegen_dir"] = codegen_base_dir / model_name

    # Optionally cache every stage of the compilation pipeline
    if enable_cache:
        compile_args["cache_options"] = CacheOptions(
            f"{outdir}/{model_name}/cache",
            enable_app_cache=True,
            enable_onnx_cache=True,
            enable_codegen_cache=True,
            enable_gpfn2obj_cache=True,
        )

    if isinstance(model, torch.nn.Module):
        register_model(context, model_name, model)
    else:  # if isinstance(models, dict[str, torch.nn.Module]):
        for name, actual_model in model.items():
            register_model(context, name, actual_model)

    if is_train:
        if optimizers is None:  # in case that optimizer.step() will be done at the host
            # Pre-allocate and register zero gradient tensors per parameter
            if isinstance(model, torch.nn.Module):
                for n, p in model.named_parameters():
                    p.grad = torch.nn.Parameter(
                        torch.zeros_like(p), requires_grad=p.requires_grad
                    )
                    set_tensor_name(p.grad, f"{model_name}@{n}@grad".replace(".", "_"))
                    context.register_param(p.grad)
            else:
                for name, actual_model in model.items():
                    for n, p in actual_model.named_parameters():
                        p.grad = torch.nn.Parameter(
                            torch.zeros_like(p), requires_grad=p.requires_grad
                        )
                        set_tensor_name(p.grad, f"{name}@{n}".replace(".", "_"))
                        context.register_param(p.grad)
        else:
            # Optimizer state lives on-device: register its buffers instead
            for idx, optimizer in enumerate(optimizers):
                optimizer_name = "optimizer" + str(idx)
                set_buffer_name_in_optimizer(optimizer, optimizer_name)
                context.register_optimizer_buffers(optimizer)

    compile_args.update(kwargs)

    return context.compile(**compile_args)
129
130
131# for type hint of the configs from toml
132class TomlValue:
133    str | int | float | bool | list["TomlValue"] | dict[str, "TomlValue"]
134
135
136class TomlDict:
137    dict[str, TomlValue]
138
139
def read_configs_from_toml(
    toml_path: str,
) -> TomlDict:
    """Parse ``toml_path`` and return its contents as a nested dict."""
    # tomllib requires the file to be opened in binary mode
    with open(toml_path, mode="rb") as toml_file:
        return tomllib.load(toml_file)
149
150
151def str2bool(v: bool | str) -> bool:
152    if v.lower() in ("yes", "true", "on", "enable", "y", "t", "1"):
153        return True
154    elif v.lower() in ("no", "false", "off", "disable", "n", "f", "0"):
155        return False
156    elif isinstance(v, str | bool):
157        return v
158    else:
159        raise argparse.ArgumentTypeError("Str or boolean value expected")
160
161
def apply_toml_defaults(
    configs: TomlDict | str | os.PathLike,
    parser: argparse.ArgumentParser,
) -> None:
    """Register every toml config entry as an argparse option with a default.

    ``configs`` may be a (possibly nested) dict or a path to a toml file.
    Nested tables are flattened: only leaf keys become ``--<key>`` options.

    Raises:
        SystemExit: if ``configs`` is neither a dict nor a path.
    """
    if isinstance(configs, dict):
        for key, value in configs.items():
            if isinstance(value, dict):  # recurse into (nested) tables
                apply_toml_defaults(value, parser)
                continue

            # Checking for list is enough to detect array args because
            # toml arrays are converted to lists by tomllib.
            if isinstance(value, list):
                # An empty array carries no element type; previously
                # type(value[0]) raised IndexError here. Fall back to str.
                args_type = type(value[0]) if value else str
            elif isinstance(value, bool):
                args_type = str2bool
            else:
                args_type = type(value)

            parser.add_argument(
                f"--{key}",
                default=value,
                type=args_type,
                nargs="*" if isinstance(value, list) else "?",
            )
    elif isinstance(configs, str | os.PathLike):
        apply_toml_defaults(read_configs_from_toml(configs), parser)
    else:
        # Previously exited with an empty message, leaving no diagnostic
        sys.exit(f"apply_toml_defaults: unsupported configs type {type(configs)!r}")
Listing 8.38 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/requirements_datagen.txt
1absl-py==0.7.0
2numpy==1.16.2
3pandas==0.24.2
4protobuf==3.19.6
5scikit-image==0.14.2
6scikit-learn==0.20.3
7scipy==1.2.1
8six==1.12.0
9tensorflow==1.13.1
Listing 8.39 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/requirements_training.txt
1scipy==1.16.0
2torch==2.9.0
3numpy==2.3.0
4numpy_indexed==0.3.7
5pandas==2.3.0
6mlperf_compliance==0.0.10
Listing 8.40 /opt/pfn/pfcomp/codegen/MLSDK/examples/ncf_training/configs.toml
 1title = "ncf_training"
 2
 3
 4[model]
 5factors    = 64
 6mlp_layers = [256, 256, 128, 64]
 7save_path  = "./ncf_model.pth"
 8load_path  = ""
 9model_path = "./ncf_model.pth"
10
11
12[dataset]
13dataset                  = "ml-1m"
14user_scaling             = 1       # this value must be the same as specified in preparation.sh
15item_scaling             = 1       # this value must be the same as specified in preparation.sh
16train_neg_ratio          = 4
17eval_neg_ratio           = 999
18allow_collision_with_pos = false
19loader_num_workers       = 2
20
21
22[training]
23epoch         = 20
24learning_rate = 0.0002
25
26
27[evaluation]
28topk = 10
29
30
31[misc]
32seed             = 0
33train_batch_size = 65536 # 2**16
34eval_batch_size  = 16384 # 2**14