feat(subtitle): data-driven conversion registry + SubtitleEdit 5 support

Replace the hardcoded conversion if/elif in Subtitle.convert with a capability-matrix backend registry (subtitle_convert.py): each backend declares the source->target pairs it supports plus a rank, and run_conversion tries them in order as a real fallback chain. conversion_method pins a backend but still falls back (pin-then-fallback).

- Detect the cross-platform SubtitleEdit 5+ CLI (seconv) and use its --flag syntax for convert, SDH stripping, and reverse-RTL
- Protect styled ASS/SSA from automatic SRT downconversion; honor an explicit --sub-format / sidecar_format
- Read segmented fVTT (wvtt) and fTTML (stpp) directly from fragmented MP4
- Improve ASS/SSA font detection: inline \fn overrides, Format-located Fontname column, @-prefix strip, case-insensitive de-dup; covers SSA too
- Update SUBTITLE_CONFIG.md, example yaml, README; add regression tests and a backend benchmark script
This commit is contained in:
imSp4rky
2026-06-07 22:21:25 -06:00
parent e6613e8ed8
commit 29232925d5
11 changed files with 871 additions and 357 deletions

View File

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""
Benchmark subtitle conversion backends to (re-)tune the preference ranks in
``unshackle/core/tracks/subtitle_convert.py``.
Runs every backend that can read each input file, converting to a target format (default
SRT), and reports cue count, leaked ASS override tags, and output size — so you can compare
fidelity per (source, target) pair on real files. Read-only: copies inputs to a temp dir.
Usage:
uv run python scripts/bench_subtitle_backends.py <file-or-dir> [<file-or-dir> ...] [--target SRT]
Example:
uv run python scripts/bench_subtitle_backends.py downloads/
"""
from __future__ import annotations
import argparse
import re
import shutil
import tempfile
from pathlib import Path
from unshackle.core.tracks import subtitle_convert as sc
from unshackle.core.tracks.subtitle import Subtitle
# Short alias for the codec type nested on the project's Subtitle class.
Codec = Subtitle.Codec
# Maps a lower-cased file extension (including the leading dot) to the codec a file with
# that extension is assumed to contain. gather() uses the keys to decide which files to
# pick up; main() uses the values as the conversion source.
# Note: ".ass" maps to SubStationAlphav4 while ".ssa" maps to SubStationAlpha, and both
# ".smi" and ".sami" map to SAMI.
EXT_TO_CODEC = {
".srt": Codec.SubRip,
".vtt": Codec.WebVTT,
".ass": Codec.SubStationAlphav4,
".ssa": Codec.SubStationAlpha,
".ttml": Codec.TimedTextMarkupLang,
".smi": Codec.SAMI,
".sami": Codec.SAMI,
}
def gather(paths: list[str]) -> list[Path]:
    """Collect the subtitle files to benchmark from the given paths.

    Directories are searched recursively for regular files whose extension
    (compared case-insensitively) is a key of ``EXT_TO_CODEC``. A path that is
    not a directory is kept as given when its extension is a key of
    ``EXT_TO_CODEC``; anything else is ignored.

    Args:
        paths: File or directory paths, e.g. as passed on the command line.

    Returns:
        The matching paths, de-duplicated and sorted.
    """
    found: set[Path] = set()
    for raw in paths:
        path = Path(raw)
        if path.is_dir():
            # rglob("*") also yields directories; one named like "extras.srt" must not
            # be mistaken for a subtitle file (copying it later would fail).
            found.update(
                f for f in path.rglob("*") if f.is_file() and f.suffix.lower() in EXT_TO_CODEC
            )
        elif path.suffix.lower() in EXT_TO_CODEC:
            found.add(path)
    # A set drops files reached through more than one argument (e.g. a directory and a
    # file inside it) so each file is benchmarked once.
    return sorted(found)
def metrics(text: str) -> tuple[int, int, int]:
    """Summarise converted subtitle text as ``(cues, ass_residue, length)``.

    Args:
        text: The decoded content of a converted subtitle file.

    Returns:
        A tuple of: the number of ``-->`` occurrences (used as the cue count),
        the number of ``{\\`` occurrences (ASS override tags that leaked into
        the output), and the length of ``text`` in characters.
    """
    arrow_count = text.count("-->")
    # "{" immediately followed by a backslash opens an ASS override block such as {\an8}.
    override_count = text.count("{\\")
    return arrow_count, override_count, len(text)
def main() -> None:
    """Run every applicable backend on each input file and print one result row per run.

    Command-line arguments:
        paths: One or more subtitle files or directories (searched via ``gather``).
        --target: Target codec value (default ``SRT``); matched case-insensitively
            against the values of ``Codec``.

    For each input whose source codec differs from the target, every backend in
    ``sc.REGISTRY`` that is available and declares the (source, target) pair is called
    directly on a private copy of the input. A successful run prints the cue count,
    leaked ASS override count and output file size in bytes; a failed run prints the
    exception type instead. Inputs are never modified, and the scratch directory is
    removed when the run finishes.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("paths", nargs="+", help="subtitle files or directories")
    ap.add_argument("--target", default="SRT", help="target codec value (SRT, VTT, ASS, ...)")
    args = ap.parse_args()

    try:
        target = Codec(args.target.upper())
    except ValueError:
        # Report a bad --target as a usage error instead of a raw traceback.
        # ap.error() prints the message and exits with status 2.
        valid = ", ".join(str(c.value) for c in Codec)
        ap.error(f"unknown --target {args.target!r}; expected one of: {valid}")

    files = gather(args.paths)
    if not files:
        print("No subtitle files found.")
        return

    print(f"{'file':40} {'source':10} {'backend':12} {'ok':3} {'cues':>5} {'resid':>5} {'bytes':>7}")

    # The context manager removes the scratch directory (input copies and converted
    # outputs) on exit, including when the run is interrupted.
    with tempfile.TemporaryDirectory(prefix="subbench_") as tmp_root:
        tmp = Path(tmp_root)
        job = 0
        for f in files:
            source = EXT_TO_CODEC[f.suffix.lower()]
            if source == target:
                continue
            for backend in sc.REGISTRY:
                if not (backend.is_available() and backend.can_convert(source, target)):
                    continue
                job += 1
                try:
                    # Each run gets its own directory so inputs that share a stem
                    # (a.ass / a.vtt, or same-named files from different folders) can
                    # never be scored against output left behind by an earlier run.
                    job_dir = tmp / f"{job:05d}"
                    job_dir.mkdir()
                    work = job_dir / f"{f.stem}.{backend.name}{f.suffix}"
                    shutil.copy2(f, work)
                    sub = Subtitle(url="x", language="en", codec=source)
                    sub.path = work
                    # Call the backend directly so each row reflects only that backend (no fallback).
                    out = work.with_suffix(f".{target.value.lower()}")
                    backend.convert(sub, target, out)
                    cues, resid, _ = metrics(out.read_text("utf8", errors="replace"))
                    # The column is labelled "bytes": report the on-disk size rather
                    # than the length of the decoded text.
                    size = out.stat().st_size
                    print(
                        f"{f.name[:40]:40} {source.name[:10]:10} {backend.name:12} {'Y':3} {cues:>5} {resid:>5} {size:>7}"
                    )
                except Exception as e:  # noqa: BLE001 - benchmark reports failures, does not raise
                    print(
                        f"{f.name[:40]:40} {source.name[:10]:10} {backend.name:12} {'N':3} {'-':>5} {'-':>5} {'-':>7} {type(e).__name__}"
                    )


if __name__ == "__main__":
    main()