unshackle/scripts/bench_subtitle_backends.py

#!/usr/bin/env python3
"""
Benchmark subtitle conversion backends to (re-)tune the preference ranks in
``unshackle/core/tracks/subtitle_convert.py``.

Runs every backend that can read each input file, converting to a target format (default
SRT), and reports cue count, leaked ASS override tags, and output size — so you can compare
fidelity per (source, target) pair on real files. Read-only: copies inputs to a temp dir.

Usage:
    uv run python scripts/bench_subtitle_backends.py <file-or-dir> [<file-or-dir> ...] [--target SRT]

Example:
    uv run python scripts/bench_subtitle_backends.py downloads/
"""

from __future__ import annotations

import argparse
import re
import shutil
import tempfile
from pathlib import Path

from unshackle.core.tracks import subtitle_convert as sc
from unshackle.core.tracks.subtitle import Subtitle

Codec = Subtitle.Codec

EXT_TO_CODEC = {
    ".srt": Codec.SubRip,
    ".vtt": Codec.WebVTT,
    ".ass": Codec.SubStationAlphav4,
    ".ssa": Codec.SubStationAlpha,
    ".ttml": Codec.TimedTextMarkupLang,
    ".smi": Codec.SAMI,
    ".sami": Codec.SAMI,
}


def gather(paths: list[str]) -> list[Path]:
    files: list[Path] = []
    for p in paths:
        path = Path(p)
        if path.is_dir():
            files.extend(f for f in path.rglob("*") if f.suffix.lower() in EXT_TO_CODEC)
        elif path.suffix.lower() in EXT_TO_CODEC:
            files.append(path)
    return sorted(files)


def metrics(text: str) -> tuple[int, int, int]:
    cues = len(re.findall(r"-->", text))
    ass_residue = len(re.findall(r"\{\\", text))
    return cues, ass_residue, len(text)


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("paths", nargs="+", help="subtitle files or directories")
    ap.add_argument("--target", default="SRT", help="target codec value (SRT, VTT, ASS, ...)")
    args = ap.parse_args()

    target = Codec(args.target.upper())
    files = gather(args.paths)
    if not files:
        print("No subtitle files found.")
        return

    tmp = Path(tempfile.mkdtemp(prefix="subbench_"))
    print(f"{'file':40} {'source':10} {'backend':12} {'ok':3} {'cues':>5} {'resid':>5} {'bytes':>7}")
    for f in files:
        source = EXT_TO_CODEC[f.suffix.lower()]
        if source == target:
            continue
        for backend in sc.REGISTRY:
            if not (backend.is_available() and backend.can_convert(source, target)):
                continue
            work = tmp / f"{f.stem}.{backend.name}{f.suffix}"
            shutil.copy2(f, work)
            sub = Subtitle(url="x", language="en", codec=source)
            sub.path = work
            try:
                # Call the backend directly so each row reflects only that backend (no fallback).
                out = work.with_suffix(f".{target.value.lower()}")
                backend.convert(sub, target, out)
                cues, resid, size = metrics(out.read_text("utf8", errors="replace"))
                print(
                    f"{f.name[:40]:40} {source.name[:10]:10} {backend.name:12} {'Y':3} {cues:>5} {resid:>5} {size:>7}"
                )
            except Exception as e:  # noqa: BLE001 - benchmark reports failures, does not raise
                print(
                    f"{f.name[:40]:40} {source.name[:10]:10} {backend.name:12} {'N':3} {'-':>5} {'-':>5} {'-':>7}  {type(e).__name__}"
                )


if __name__ == "__main__":
    main()