feat(dl): extract closed captions from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id, characteristics) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
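- Extract CCs even when other subtitle tracks exist if the manifest declares closed captions for the video track (embedded EIA captions without a manifest entry are still only extracted when no subtitle tracks exist), using the manifest CC language when available instead of guessing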
- Try ccextractor on the original file first to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips; repackage and retry only when that first attempt produces no output
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
This commit is contained in:
Andy
2026-03-05 15:57:29 -07:00
parent 7dd6323be5
commit 15acaea208
4 changed files with 156 additions and 103 deletions

View File

@@ -25,6 +25,7 @@ import click
import jsonpickle import jsonpickle
import yaml import yaml
from construct import ConstError from construct import ConstError
from langcodes import Language
from pymediainfo import MediaInfo from pymediainfo import MediaInfo
from pyplayready.cdm import Cdm as PlayReadyCdm from pyplayready.cdm import Cdm as PlayReadyCdm
from pyplayready.device import Device as PlayReadyDevice from pyplayready.device import Device as PlayReadyDevice
@@ -2025,49 +2026,6 @@ class dl:
dl_time = time_elapsed_since(dl_start_time) dl_time = time_elapsed_since(dl_start_time)
console.print(Padding(f"Track downloads finished in [progress.elapsed]{dl_time}[/]", (0, 5))) console.print(Padding(f"Track downloads finished in [progress.elapsed]{dl_time}[/]", (0, 5)))
video_track_n = 0
while (
not title.tracks.subtitles
and not no_subs
and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
and not video_only
and not no_video
and len(title.tracks.videos) > video_track_n
and any(
x.get("codec_name", "").startswith("eia_")
for x in ffprobe(title.tracks.videos[video_track_n].path).get("streams", [])
)
):
with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
try:
# TODO: Figure out the real language, it might be different
# EIA-CC tracks sadly don't carry language information :(
# TODO: Figure out if the CC language is original lang or not.
# Will need to figure out above first to do so.
video_track = title.tracks.videos[video_track_n]
track_id = f"ccextractor-{video_track.id}"
cc_lang = title.language or video_track.language
cc = video_track.ccextractor(
track_id=track_id,
out_path=config.directories.temp
/ config.filenames.subtitle.format(id=track_id, language=cc_lang),
language=cc_lang,
original=False,
)
if cc:
# will not appear in track listings as it's added after all times it lists
title.tracks.add(cc)
self.log.info(f"Extracted a Closed Caption from Video track {video_track_n + 1}")
else:
self.log.info(f"No Closed Captions were found in Video track {video_track_n + 1}")
except EnvironmentError:
self.log.error(
"Cannot extract Closed Captions as the ccextractor executable was not found..."
)
break
video_track_n += 1
# Subtitle output mode configuration (for sidecar originals) # Subtitle output mode configuration (for sidecar originals)
subtitle_output_mode = config.subtitle.get("output_mode", "mux") subtitle_output_mode = config.subtitle.get("output_mode", "mux")
sidecar_format = config.subtitle.get("sidecar_format", "srt") sidecar_format = config.subtitle.get("sidecar_format", "srt")
@@ -2133,6 +2091,57 @@ class dl:
if has_decrypted: if has_decrypted:
self.log.info(f"Decrypted tracks with {decrypt_tool}") self.log.info(f"Decrypted tracks with {decrypt_tool}")
# Extract Closed Captions from decrypted video tracks
if (
not no_subs
and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
and not video_only
and not no_video
):
for video_track_n, video_track in enumerate(title.tracks.videos):
has_manifest_cc = bool(getattr(video_track, "closed_captions", None))
has_eia_cc = (
not has_manifest_cc
and not title.tracks.subtitles
and any(
x.get("codec_name", "").startswith("eia_")
for x in ffprobe(video_track.path).get("streams", [])
)
)
if not has_manifest_cc and not has_eia_cc:
continue
with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
try:
cc_lang = (
Language.get(video_track.closed_captions[0]["language"])
if has_manifest_cc and video_track.closed_captions[0].get("language")
else title.language or video_track.language
)
track_id = f"ccextractor-{video_track.id}"
cc = video_track.ccextractor(
track_id=track_id,
out_path=config.directories.temp
/ config.filenames.subtitle.format(id=track_id, language=cc_lang),
language=cc_lang,
original=False,
)
if cc:
cc.cc = True
title.tracks.add(cc)
self.log.info(
f"Extracted a Closed Caption from Video track {video_track_n + 1}"
)
else:
self.log.info(
f"No Closed Captions were found in Video track {video_track_n + 1}"
)
except EnvironmentError:
self.log.error(
"Cannot extract Closed Captions as the ccextractor executable was not found..."
)
break
# Now repack the decrypted tracks # Now repack the decrypted tracks
with console.status("Repackaging tracks with FFMPEG..."): with console.status("Repackaging tracks with FFMPEG..."):
has_repacked = False has_repacked = False

View File

@@ -112,6 +112,15 @@ class HLS:
session_drm = HLS.get_all_drm(session_keys) session_drm = HLS.get_all_drm(session_keys)
audio_codecs_by_group_id: dict[str, Audio.Codec] = {} audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
for media in self.manifest.media:
if media.type == "CLOSED-CAPTIONS":
cc_by_group_id.setdefault(media.group_id, []).append({
"language": media.language,
"name": media.name,
"instream_id": media.instream_id,
"characteristics": media.characteristics,
})
tracks = Tracks() tracks = Tracks()
for playlist in self.manifest.playlists: for playlist in self.manifest.playlists:
@@ -161,6 +170,9 @@ class HLS:
width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None, width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None, height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
fps=playlist.stream_info.frame_rate, fps=playlist.stream_info.frame_rate,
closed_captions=cc_by_group_id.get(
(playlist.stream_info.closed_captions or "").strip('"'), []
),
) )
if primary_track_type is Video if primary_track_type is Video
else {} else {}

View File

@@ -103,53 +103,78 @@ class Tracks:
tree = Tree("", hide_root=True) tree = Tree("", hide_root=True)
for track_type in self.TRACK_ORDER_MAP: for track_type in self.TRACK_ORDER_MAP:
tracks = list(x for x in all_tracks if isinstance(x, track_type)) tracks = list(x for x in all_tracks if isinstance(x, track_type))
if not tracks: if tracks:
continue num_tracks = len(tracks)
num_tracks = len(tracks) track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "") tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}") for track in tracks:
for track in tracks: if add_progress and track_type not in (Chapter, Attachment):
if add_progress and track_type not in (Chapter, Attachment): progress = Progress(
progress = Progress( SpinnerColumn(finished_text=""),
SpinnerColumn(finished_text=""), BarColumn(),
BarColumn(), "",
"", TimeRemainingColumn(compact=True, elapsed_when_finished=True),
TimeRemainingColumn(compact=True, elapsed_when_finished=True), "",
"", TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
TextColumn("[progress.data.speed]{task.fields[downloaded]}"), console=console,
console=console, speed_estimate_period=10,
speed_estimate_period=10, )
task = progress.add_task("", downloaded="-")
state = {"total": 100.0}
def update_track_progress(
task_id: int = task,
_state: dict[str, float] = state,
_progress: Progress = progress,
**kwargs,
) -> None:
"""
Ensure terminal status states render as a fully completed bar.
Some downloaders can report completed slightly below total
before emitting the final "Downloaded" state.
"""
if "total" in kwargs and kwargs["total"] is not None:
_state["total"] = kwargs["total"]
downloaded_state = kwargs.get("downloaded")
if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
kwargs["completed"] = _state["total"]
_progress.update(task_id=task_id, **kwargs)
progress_callables.append(update_track_progress)
track_table = Table.grid()
track_table.add_row(str(track)[6:], style="text2")
track_table.add_row(progress)
tracks_tree.add(track_table)
else:
tracks_tree.add(str(track)[6:], style="text2")
# Show Closed Captions right after Subtitles (even if no subtitle tracks exist)
if track_type is Subtitle:
seen_cc: set[str] = set()
unique_cc: list[str] = []
for video in (x for x in all_tracks if isinstance(x, Video)):
for cc in getattr(video, "closed_captions", []):
lang = cc.get("language", "und")
name = cc.get("name", "")
instream_id = cc.get("instream_id", "")
key = f"{lang}|{instream_id}"
if key in seen_cc:
continue
seen_cc.add(key)
parts = [f"[CC] | {lang}"]
if name:
parts.append(name)
if instream_id:
parts.append(instream_id)
unique_cc.append(" | ".join(parts))
if unique_cc:
cc_tree = tree.add(
f"[repr.number]{len(unique_cc)}[/] Closed Caption{'s' if len(unique_cc) != 1 else ''}"
) )
task = progress.add_task("", downloaded="-") for cc_str in unique_cc:
state = {"total": 100.0} cc_tree.add(cc_str, style="text2")
def update_track_progress(
task_id: int = task,
_state: dict[str, float] = state,
_progress: Progress = progress,
**kwargs,
) -> None:
"""
Ensure terminal status states render as a fully completed bar.
Some downloaders can report completed slightly below total
before emitting the final "Downloaded" state.
"""
if "total" in kwargs and kwargs["total"] is not None:
_state["total"] = kwargs["total"]
downloaded_state = kwargs.get("downloaded")
if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
kwargs["completed"] = _state["total"]
_progress.update(task_id=task_id, **kwargs)
progress_callables.append(update_track_progress)
track_table = Table.grid()
track_table.add_row(str(track)[6:], style="text2")
track_table.add_row(progress)
tracks_tree.add(track_table)
else:
tracks_tree.add(str(track)[6:], style="text2")
return tree, progress_callables return tree, progress_callables

View File

@@ -200,6 +200,7 @@ class Video(Track):
height: Optional[int] = None, height: Optional[int] = None,
fps: Optional[Union[str, int, float]] = None, fps: Optional[Union[str, int, float]] = None,
scan_type: Optional[Video.ScanType] = None, scan_type: Optional[Video.ScanType] = None,
closed_captions: Optional[list[dict[str, Any]]] = None,
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
""" """
@@ -264,6 +265,7 @@ class Video(Track):
raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e)) raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))
self.scan_type = scan_type self.scan_type = scan_type
self.closed_captions: list[dict[str, Any]] = closed_captions or []
self.needs_duration_fix = False self.needs_duration_fix = False
def __str__(self) -> str: def __str__(self) -> str:
@@ -346,22 +348,27 @@ class Video(Track):
if not binaries.CCExtractor: if not binaries.CCExtractor:
raise EnvironmentError("ccextractor executable was not found.") raise EnvironmentError("ccextractor executable was not found.")
# ccextractor often fails in weird ways unless we repack
self.repackage()
out_path = Path(out_path) out_path = Path(out_path)
try: def _run_ccextractor() -> bool:
subprocess.run( try:
[binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path], subprocess.run(
check=True, [binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
stdout=subprocess.PIPE, check=True,
stderr=subprocess.PIPE, stdout=subprocess.PIPE,
) stderr=subprocess.PIPE,
except subprocess.CalledProcessError as e: )
out_path.unlink(missing_ok=True) except subprocess.CalledProcessError as e:
if not e.returncode == 10: # No captions found out_path.unlink(missing_ok=True)
raise if e.returncode != 10: # 10 = No captions found
raise
return out_path.exists()
# Try on the original file first (preserves container-level CC data like c608 boxes),
# then fall back to repacked file (ccextractor can fail on some container formats).
if not _run_ccextractor():
self.repackage()
_run_ccextractor()
if out_path.exists(): if out_path.exists():
cc_track = Subtitle( cc_track = Subtitle(