feat(subtitle): data-driven conversion registry + SubtitleEdit 5 support

Replace the hardcoded conversion if/elif in Subtitle.convert with a capability-matrix backend registry (subtitle_convert.py): each backend declares the source->target pairs it supports plus a rank, and run_conversion tries them in order as a real fallback chain. conversion_method pins a backend but still falls back (pin-then-fallback).

- Detect the cross-platform SubtitleEdit 5+ CLI (seconv) and use its --flag syntax for convert, SDH stripping, and reverse-RTL
- Protect styled ASS/SSA from automatic SRT downconversion; honor an explicit --sub-format / sidecar_format
- Read segmented fVTT (wvtt) and fTTML (stpp) directly from fragmented MP4
- Improve ASS/SSA font detection: inline \fn overrides, Format-located Fontname column, @-prefix strip, case-insensitive de-dup; covers SSA too
- Update SUBTITLE_CONFIG.md, example YAML, README; add regression tests and a backend benchmark script
2026-06-10 03:02:09 +00:00 · 2026-06-07 22:21:25 -06:00
parent e6613e8ed8
commit 29232925d5
11 changed files with 871 additions and 357 deletions
--- a/scripts/bench_subtitle_backends.py
+++ b/scripts/bench_subtitle_backends.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""
+Benchmark subtitle conversion backends to (re-)tune the preference ranks in
+``unshackle/core/tracks/subtitle_convert.py``.
+
+Runs every backend that can read each input file, converting to a target format (default
+SRT), and reports cue count, leaked ASS override tags, and output size — so you can compare
+fidelity per (source, target) pair on real files. Read-only: copies inputs to a temp dir.
+
+Usage:
+    uv run python scripts/bench_subtitle_backends.py <file-or-dir> [<file-or-dir> ...] [--target SRT]
+
+Example:
+    uv run python scripts/bench_subtitle_backends.py downloads/
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import shutil
+import tempfile
+from pathlib import Path
+
+from unshackle.core.tracks import subtitle_convert as sc
+from unshackle.core.tracks.subtitle import Subtitle
+
+Codec = Subtitle.Codec
+# Maps a lowercase input file extension to the source codec handed to the backends.
+EXT_TO_CODEC = {
+    ".srt": Codec.SubRip,
+    ".vtt": Codec.WebVTT,
+    ".ass": Codec.SubStationAlphav4,
+    ".ssa": Codec.SubStationAlpha,
+    ".ttml": Codec.TimedTextMarkupLang,
+    ".smi": Codec.SAMI,
+    ".sami": Codec.SAMI,
+}
+
+
+def gather(paths: list[str]) -> list[Path]:
+    """Return the sorted subtitle files named by *paths*, searching directories recursively."""
+    files: list[Path] = []
+    for path in map(Path, paths):
+        # rglob also yields directories; keep regular files only so a folder named "x.srt" is skipped.
+        # Explicit (non-directory) arguments are taken as given and only filtered by extension.
+        found = (f for f in path.rglob("*") if f.is_file()) if path.is_dir() else (path,)
+        files.extend(f for f in found if f.suffix.lower() in EXT_TO_CODEC)
+    return sorted(files)
+
+
+def metrics(text: str) -> tuple[int, int, int]:
+    """Return (number of "-->" cue arrows, number of leaked ASS override openers, len(text))."""
+    arrows, overrides = (len(re.findall(pattern, text)) for pattern in (r"-->", r"\{\\"))
+    return arrows, overrides, len(text)
+
+
+def main() -> None:
+    """Run every available backend on each input file and print one result row per attempt."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("paths", nargs="+", help="subtitle files or directories")
+    ap.add_argument("--target", default="SRT", help="target codec value (SRT, VTT, ASS, ...)")
+    args = ap.parse_args()
+    target = Codec(args.target.upper())
+    files = gather(args.paths)
+    if not files:
+        print("No subtitle files found.")
+        return
+
+    row = "{:40} {:10} {:12} {:3} {:>5} {:>5} {:>7}"
+    print(row.format("file", "source", "backend", "ok", "cues", "resid", "bytes"))
+    # Scratch copies live in a directory that is deleted on exit, even if an error escapes the loop.
+    with tempfile.TemporaryDirectory(prefix="subbench_") as tmp:
+        for f in files:
+            source = EXT_TO_CODEC[f.suffix.lower()]
+            if source == target:
+                continue
+            for backend in sc.REGISTRY:
+                if not (backend.is_available() and backend.can_convert(source, target)):
+                    continue
+                work = Path(tmp) / f"{f.stem}.{backend.name}{f.suffix}"
+                out = work.with_suffix(f".{target.value.lower()}")
+                try:
+                    # Copy inside the try so an unreadable input becomes an "N" row, not a crash.
+                    shutil.copy2(f, work)
+                    sub = Subtitle(url="x", language="en", codec=source)
+                    sub.path = work
+                    # Call the backend directly so each row reflects only that backend (no fallback).
+                    backend.convert(sub, target, out)
+                    cues, resid, _ = metrics(out.read_text("utf8", errors="replace"))
+                    # Report the real on-disk size; len(text) would count characters, not bytes.
+                    print(row.format(f.name[:40], source.name[:10], backend.name, "Y", cues, resid, out.stat().st_size))
+                except Exception as e:  # noqa: BLE001 - benchmark reports failures, does not raise
+                    print(row.format(f.name[:40], source.name[:10], backend.name, "N", "-", "-", "-") + f"  {type(e).__name__}")
+
+
+if __name__ == "__main__":
+    main()  # script entry point: parse CLI arguments and print the benchmark table