feat(dl): extract closed captions from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
This commit is contained in:
Andy
2026-03-05 15:57:29 -07:00
parent 7dd6323be5
commit 15acaea208
4 changed files with 156 additions and 103 deletions

View File

@@ -25,6 +25,7 @@ import click
import jsonpickle
import yaml
from construct import ConstError
from langcodes import Language
from pymediainfo import MediaInfo
from pyplayready.cdm import Cdm as PlayReadyCdm
from pyplayready.device import Device as PlayReadyDevice
@@ -2025,49 +2026,6 @@ class dl:
dl_time = time_elapsed_since(dl_start_time)
console.print(Padding(f"Track downloads finished in [progress.elapsed]{dl_time}[/]", (0, 5)))
video_track_n = 0
while (
not title.tracks.subtitles
and not no_subs
and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
and not video_only
and not no_video
and len(title.tracks.videos) > video_track_n
and any(
x.get("codec_name", "").startswith("eia_")
for x in ffprobe(title.tracks.videos[video_track_n].path).get("streams", [])
)
):
with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
try:
# TODO: Figure out the real language, it might be different
# EIA-CC tracks sadly don't carry language information :(
# TODO: Figure out if the CC language is original lang or not.
# Will need to figure out above first to do so.
video_track = title.tracks.videos[video_track_n]
track_id = f"ccextractor-{video_track.id}"
cc_lang = title.language or video_track.language
cc = video_track.ccextractor(
track_id=track_id,
out_path=config.directories.temp
/ config.filenames.subtitle.format(id=track_id, language=cc_lang),
language=cc_lang,
original=False,
)
if cc:
# will not appear in track listings as it's added after all times it lists
title.tracks.add(cc)
self.log.info(f"Extracted a Closed Caption from Video track {video_track_n + 1}")
else:
self.log.info(f"No Closed Captions were found in Video track {video_track_n + 1}")
except EnvironmentError:
self.log.error(
"Cannot extract Closed Captions as the ccextractor executable was not found..."
)
break
video_track_n += 1
# Subtitle output mode configuration (for sidecar originals)
subtitle_output_mode = config.subtitle.get("output_mode", "mux")
sidecar_format = config.subtitle.get("sidecar_format", "srt")
@@ -2133,6 +2091,57 @@ class dl:
if has_decrypted:
self.log.info(f"Decrypted tracks with {decrypt_tool}")
# Extract Closed Captions from decrypted video tracks
if (
not no_subs
and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
and not video_only
and not no_video
):
for video_track_n, video_track in enumerate(title.tracks.videos):
has_manifest_cc = bool(getattr(video_track, "closed_captions", None))
has_eia_cc = (
not has_manifest_cc
and not title.tracks.subtitles
and any(
x.get("codec_name", "").startswith("eia_")
for x in ffprobe(video_track.path).get("streams", [])
)
)
if not has_manifest_cc and not has_eia_cc:
continue
with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
try:
cc_lang = (
Language.get(video_track.closed_captions[0]["language"])
if has_manifest_cc and video_track.closed_captions[0].get("language")
else title.language or video_track.language
)
track_id = f"ccextractor-{video_track.id}"
cc = video_track.ccextractor(
track_id=track_id,
out_path=config.directories.temp
/ config.filenames.subtitle.format(id=track_id, language=cc_lang),
language=cc_lang,
original=False,
)
if cc:
cc.cc = True
title.tracks.add(cc)
self.log.info(
f"Extracted a Closed Caption from Video track {video_track_n + 1}"
)
else:
self.log.info(
f"No Closed Captions were found in Video track {video_track_n + 1}"
)
except EnvironmentError:
self.log.error(
"Cannot extract Closed Captions as the ccextractor executable was not found..."
)
break
# Now repack the decrypted tracks
with console.status("Repackaging tracks with FFMPEG..."):
has_repacked = False

View File

@@ -112,6 +112,15 @@ class HLS:
session_drm = HLS.get_all_drm(session_keys)
audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
for media in self.manifest.media:
if media.type == "CLOSED-CAPTIONS":
cc_by_group_id.setdefault(media.group_id, []).append({
"language": media.language,
"name": media.name,
"instream_id": media.instream_id,
"characteristics": media.characteristics,
})
tracks = Tracks()
for playlist in self.manifest.playlists:
@@ -161,6 +170,9 @@ class HLS:
width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
fps=playlist.stream_info.frame_rate,
closed_captions=cc_by_group_id.get(
(playlist.stream_info.closed_captions or "").strip('"'), []
),
)
if primary_track_type is Video
else {}

View File

@@ -103,8 +103,7 @@ class Tracks:
tree = Tree("", hide_root=True)
for track_type in self.TRACK_ORDER_MAP:
tracks = list(x for x in all_tracks if isinstance(x, track_type))
if not tracks:
continue
if tracks:
num_tracks = len(tracks)
track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
@@ -151,6 +150,32 @@ class Tracks:
else:
tracks_tree.add(str(track)[6:], style="text2")
# Show Closed Captions right after Subtitles (even if no subtitle tracks exist)
if track_type is Subtitle:
seen_cc: set[str] = set()
unique_cc: list[str] = []
for video in (x for x in all_tracks if isinstance(x, Video)):
for cc in getattr(video, "closed_captions", []):
lang = cc.get("language", "und")
name = cc.get("name", "")
instream_id = cc.get("instream_id", "")
key = f"{lang}|{instream_id}"
if key in seen_cc:
continue
seen_cc.add(key)
parts = [f"[CC] | {lang}"]
if name:
parts.append(name)
if instream_id:
parts.append(instream_id)
unique_cc.append(" | ".join(parts))
if unique_cc:
cc_tree = tree.add(
f"[repr.number]{len(unique_cc)}[/] Closed Caption{'s' if len(unique_cc) != 1 else ''}"
)
for cc_str in unique_cc:
cc_tree.add(cc_str, style="text2")
return tree, progress_callables
def exists(self, by_id: Optional[str] = None, by_url: Optional[Union[str, list[str]]] = None) -> bool:

View File

@@ -200,6 +200,7 @@ class Video(Track):
height: Optional[int] = None,
fps: Optional[Union[str, int, float]] = None,
scan_type: Optional[Video.ScanType] = None,
closed_captions: Optional[list[dict[str, Any]]] = None,
**kwargs: Any,
) -> None:
"""
@@ -264,6 +265,7 @@ class Video(Track):
raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))
self.scan_type = scan_type
self.closed_captions: list[dict[str, Any]] = closed_captions or []
self.needs_duration_fix = False
def __str__(self) -> str:
@@ -346,11 +348,9 @@ class Video(Track):
if not binaries.CCExtractor:
raise EnvironmentError("ccextractor executable was not found.")
# ccextractor often fails in weird ways unless we repack
self.repackage()
out_path = Path(out_path)
def _run_ccextractor() -> bool:
try:
subprocess.run(
[binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
@@ -360,8 +360,15 @@ class Video(Track):
)
except subprocess.CalledProcessError as e:
out_path.unlink(missing_ok=True)
if not e.returncode == 10: # No captions found
if e.returncode != 10: # 10 = No captions found
raise
return out_path.exists()
# Try on the original file first (preserves container-level CC data like c608 boxes),
# then fall back to repacked file (ccextractor can fail on some container formats).
if not _run_ccextractor():
self.repackage()
_run_ccextractor()
if out_path.exists():
cc_track = Subtitle(