mirror of
https://github.com/unshackle-dl/unshackle.git
synced 2026-03-12 17:39:01 +00:00
feat(dl): extract closed captions from HLS manifests and improve CC extraction
- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
This commit is contained in:
@@ -25,6 +25,7 @@ import click
|
|||||||
import jsonpickle
|
import jsonpickle
|
||||||
import yaml
|
import yaml
|
||||||
from construct import ConstError
|
from construct import ConstError
|
||||||
|
from langcodes import Language
|
||||||
from pymediainfo import MediaInfo
|
from pymediainfo import MediaInfo
|
||||||
from pyplayready.cdm import Cdm as PlayReadyCdm
|
from pyplayready.cdm import Cdm as PlayReadyCdm
|
||||||
from pyplayready.device import Device as PlayReadyDevice
|
from pyplayready.device import Device as PlayReadyDevice
|
||||||
@@ -2025,49 +2026,6 @@ class dl:
|
|||||||
dl_time = time_elapsed_since(dl_start_time)
|
dl_time = time_elapsed_since(dl_start_time)
|
||||||
console.print(Padding(f"Track downloads finished in [progress.elapsed]{dl_time}[/]", (0, 5)))
|
console.print(Padding(f"Track downloads finished in [progress.elapsed]{dl_time}[/]", (0, 5)))
|
||||||
|
|
||||||
video_track_n = 0
|
|
||||||
|
|
||||||
while (
|
|
||||||
not title.tracks.subtitles
|
|
||||||
and not no_subs
|
|
||||||
and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
|
|
||||||
and not video_only
|
|
||||||
and not no_video
|
|
||||||
and len(title.tracks.videos) > video_track_n
|
|
||||||
and any(
|
|
||||||
x.get("codec_name", "").startswith("eia_")
|
|
||||||
for x in ffprobe(title.tracks.videos[video_track_n].path).get("streams", [])
|
|
||||||
)
|
|
||||||
):
|
|
||||||
with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
|
|
||||||
try:
|
|
||||||
# TODO: Figure out the real language, it might be different
|
|
||||||
# EIA-CC tracks sadly don't carry language information :(
|
|
||||||
# TODO: Figure out if the CC language is original lang or not.
|
|
||||||
# Will need to figure out above first to do so.
|
|
||||||
video_track = title.tracks.videos[video_track_n]
|
|
||||||
track_id = f"ccextractor-{video_track.id}"
|
|
||||||
cc_lang = title.language or video_track.language
|
|
||||||
cc = video_track.ccextractor(
|
|
||||||
track_id=track_id,
|
|
||||||
out_path=config.directories.temp
|
|
||||||
/ config.filenames.subtitle.format(id=track_id, language=cc_lang),
|
|
||||||
language=cc_lang,
|
|
||||||
original=False,
|
|
||||||
)
|
|
||||||
if cc:
|
|
||||||
# will not appear in track listings as it's added after all times it lists
|
|
||||||
title.tracks.add(cc)
|
|
||||||
self.log.info(f"Extracted a Closed Caption from Video track {video_track_n + 1}")
|
|
||||||
else:
|
|
||||||
self.log.info(f"No Closed Captions were found in Video track {video_track_n + 1}")
|
|
||||||
except EnvironmentError:
|
|
||||||
self.log.error(
|
|
||||||
"Cannot extract Closed Captions as the ccextractor executable was not found..."
|
|
||||||
)
|
|
||||||
break
|
|
||||||
video_track_n += 1
|
|
||||||
|
|
||||||
# Subtitle output mode configuration (for sidecar originals)
|
# Subtitle output mode configuration (for sidecar originals)
|
||||||
subtitle_output_mode = config.subtitle.get("output_mode", "mux")
|
subtitle_output_mode = config.subtitle.get("output_mode", "mux")
|
||||||
sidecar_format = config.subtitle.get("sidecar_format", "srt")
|
sidecar_format = config.subtitle.get("sidecar_format", "srt")
|
||||||
@@ -2133,6 +2091,57 @@ class dl:
|
|||||||
if has_decrypted:
|
if has_decrypted:
|
||||||
self.log.info(f"Decrypted tracks with {decrypt_tool}")
|
self.log.info(f"Decrypted tracks with {decrypt_tool}")
|
||||||
|
|
||||||
|
# Extract Closed Captions from decrypted video tracks
|
||||||
|
if (
|
||||||
|
not no_subs
|
||||||
|
and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
|
||||||
|
and not video_only
|
||||||
|
and not no_video
|
||||||
|
):
|
||||||
|
for video_track_n, video_track in enumerate(title.tracks.videos):
|
||||||
|
has_manifest_cc = bool(getattr(video_track, "closed_captions", None))
|
||||||
|
has_eia_cc = (
|
||||||
|
not has_manifest_cc
|
||||||
|
and not title.tracks.subtitles
|
||||||
|
and any(
|
||||||
|
x.get("codec_name", "").startswith("eia_")
|
||||||
|
for x in ffprobe(video_track.path).get("streams", [])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not has_manifest_cc and not has_eia_cc:
|
||||||
|
continue
|
||||||
|
|
||||||
|
with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
|
||||||
|
try:
|
||||||
|
cc_lang = (
|
||||||
|
Language.get(video_track.closed_captions[0]["language"])
|
||||||
|
if has_manifest_cc and video_track.closed_captions[0].get("language")
|
||||||
|
else title.language or video_track.language
|
||||||
|
)
|
||||||
|
track_id = f"ccextractor-{video_track.id}"
|
||||||
|
cc = video_track.ccextractor(
|
||||||
|
track_id=track_id,
|
||||||
|
out_path=config.directories.temp
|
||||||
|
/ config.filenames.subtitle.format(id=track_id, language=cc_lang),
|
||||||
|
language=cc_lang,
|
||||||
|
original=False,
|
||||||
|
)
|
||||||
|
if cc:
|
||||||
|
cc.cc = True
|
||||||
|
title.tracks.add(cc)
|
||||||
|
self.log.info(
|
||||||
|
f"Extracted a Closed Caption from Video track {video_track_n + 1}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.log.info(
|
||||||
|
f"No Closed Captions were found in Video track {video_track_n + 1}"
|
||||||
|
)
|
||||||
|
except EnvironmentError:
|
||||||
|
self.log.error(
|
||||||
|
"Cannot extract Closed Captions as the ccextractor executable was not found..."
|
||||||
|
)
|
||||||
|
break
|
||||||
|
|
||||||
# Now repack the decrypted tracks
|
# Now repack the decrypted tracks
|
||||||
with console.status("Repackaging tracks with FFMPEG..."):
|
with console.status("Repackaging tracks with FFMPEG..."):
|
||||||
has_repacked = False
|
has_repacked = False
|
||||||
|
|||||||
@@ -112,6 +112,15 @@ class HLS:
|
|||||||
session_drm = HLS.get_all_drm(session_keys)
|
session_drm = HLS.get_all_drm(session_keys)
|
||||||
|
|
||||||
audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
|
audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
|
||||||
|
cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
|
||||||
|
for media in self.manifest.media:
|
||||||
|
if media.type == "CLOSED-CAPTIONS":
|
||||||
|
cc_by_group_id.setdefault(media.group_id, []).append({
|
||||||
|
"language": media.language,
|
||||||
|
"name": media.name,
|
||||||
|
"instream_id": media.instream_id,
|
||||||
|
"characteristics": media.characteristics,
|
||||||
|
})
|
||||||
tracks = Tracks()
|
tracks = Tracks()
|
||||||
|
|
||||||
for playlist in self.manifest.playlists:
|
for playlist in self.manifest.playlists:
|
||||||
@@ -161,6 +170,9 @@ class HLS:
|
|||||||
width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
|
width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
|
||||||
height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
|
height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
|
||||||
fps=playlist.stream_info.frame_rate,
|
fps=playlist.stream_info.frame_rate,
|
||||||
|
closed_captions=cc_by_group_id.get(
|
||||||
|
(playlist.stream_info.closed_captions or "").strip('"'), []
|
||||||
|
),
|
||||||
)
|
)
|
||||||
if primary_track_type is Video
|
if primary_track_type is Video
|
||||||
else {}
|
else {}
|
||||||
|
|||||||
@@ -103,53 +103,78 @@ class Tracks:
|
|||||||
tree = Tree("", hide_root=True)
|
tree = Tree("", hide_root=True)
|
||||||
for track_type in self.TRACK_ORDER_MAP:
|
for track_type in self.TRACK_ORDER_MAP:
|
||||||
tracks = list(x for x in all_tracks if isinstance(x, track_type))
|
tracks = list(x for x in all_tracks if isinstance(x, track_type))
|
||||||
if not tracks:
|
if tracks:
|
||||||
continue
|
num_tracks = len(tracks)
|
||||||
num_tracks = len(tracks)
|
track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
|
||||||
track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
|
tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
|
||||||
tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
|
for track in tracks:
|
||||||
for track in tracks:
|
if add_progress and track_type not in (Chapter, Attachment):
|
||||||
if add_progress and track_type not in (Chapter, Attachment):
|
progress = Progress(
|
||||||
progress = Progress(
|
SpinnerColumn(finished_text=""),
|
||||||
SpinnerColumn(finished_text=""),
|
BarColumn(),
|
||||||
BarColumn(),
|
"•",
|
||||||
"•",
|
TimeRemainingColumn(compact=True, elapsed_when_finished=True),
|
||||||
TimeRemainingColumn(compact=True, elapsed_when_finished=True),
|
"•",
|
||||||
"•",
|
TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
|
||||||
TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
|
console=console,
|
||||||
console=console,
|
speed_estimate_period=10,
|
||||||
speed_estimate_period=10,
|
)
|
||||||
|
task = progress.add_task("", downloaded="-")
|
||||||
|
state = {"total": 100.0}
|
||||||
|
|
||||||
|
def update_track_progress(
|
||||||
|
task_id: int = task,
|
||||||
|
_state: dict[str, float] = state,
|
||||||
|
_progress: Progress = progress,
|
||||||
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Ensure terminal status states render as a fully completed bar.
|
||||||
|
|
||||||
|
Some downloaders can report completed slightly below total
|
||||||
|
before emitting the final "Downloaded" state.
|
||||||
|
"""
|
||||||
|
if "total" in kwargs and kwargs["total"] is not None:
|
||||||
|
_state["total"] = kwargs["total"]
|
||||||
|
|
||||||
|
downloaded_state = kwargs.get("downloaded")
|
||||||
|
if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
|
||||||
|
kwargs["completed"] = _state["total"]
|
||||||
|
_progress.update(task_id=task_id, **kwargs)
|
||||||
|
|
||||||
|
progress_callables.append(update_track_progress)
|
||||||
|
track_table = Table.grid()
|
||||||
|
track_table.add_row(str(track)[6:], style="text2")
|
||||||
|
track_table.add_row(progress)
|
||||||
|
tracks_tree.add(track_table)
|
||||||
|
else:
|
||||||
|
tracks_tree.add(str(track)[6:], style="text2")
|
||||||
|
|
||||||
|
# Show Closed Captions right after Subtitles (even if no subtitle tracks exist)
|
||||||
|
if track_type is Subtitle:
|
||||||
|
seen_cc: set[str] = set()
|
||||||
|
unique_cc: list[str] = []
|
||||||
|
for video in (x for x in all_tracks if isinstance(x, Video)):
|
||||||
|
for cc in getattr(video, "closed_captions", []):
|
||||||
|
lang = cc.get("language", "und")
|
||||||
|
name = cc.get("name", "")
|
||||||
|
instream_id = cc.get("instream_id", "")
|
||||||
|
key = f"{lang}|{instream_id}"
|
||||||
|
if key in seen_cc:
|
||||||
|
continue
|
||||||
|
seen_cc.add(key)
|
||||||
|
parts = [f"[CC] | {lang}"]
|
||||||
|
if name:
|
||||||
|
parts.append(name)
|
||||||
|
if instream_id:
|
||||||
|
parts.append(instream_id)
|
||||||
|
unique_cc.append(" | ".join(parts))
|
||||||
|
if unique_cc:
|
||||||
|
cc_tree = tree.add(
|
||||||
|
f"[repr.number]{len(unique_cc)}[/] Closed Caption{'s' if len(unique_cc) != 1 else ''}"
|
||||||
)
|
)
|
||||||
task = progress.add_task("", downloaded="-")
|
for cc_str in unique_cc:
|
||||||
state = {"total": 100.0}
|
cc_tree.add(cc_str, style="text2")
|
||||||
|
|
||||||
def update_track_progress(
|
|
||||||
task_id: int = task,
|
|
||||||
_state: dict[str, float] = state,
|
|
||||||
_progress: Progress = progress,
|
|
||||||
**kwargs,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
Ensure terminal status states render as a fully completed bar.
|
|
||||||
|
|
||||||
Some downloaders can report completed slightly below total
|
|
||||||
before emitting the final "Downloaded" state.
|
|
||||||
"""
|
|
||||||
if "total" in kwargs and kwargs["total"] is not None:
|
|
||||||
_state["total"] = kwargs["total"]
|
|
||||||
|
|
||||||
downloaded_state = kwargs.get("downloaded")
|
|
||||||
if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
|
|
||||||
kwargs["completed"] = _state["total"]
|
|
||||||
_progress.update(task_id=task_id, **kwargs)
|
|
||||||
|
|
||||||
progress_callables.append(update_track_progress)
|
|
||||||
track_table = Table.grid()
|
|
||||||
track_table.add_row(str(track)[6:], style="text2")
|
|
||||||
track_table.add_row(progress)
|
|
||||||
tracks_tree.add(track_table)
|
|
||||||
else:
|
|
||||||
tracks_tree.add(str(track)[6:], style="text2")
|
|
||||||
|
|
||||||
return tree, progress_callables
|
return tree, progress_callables
|
||||||
|
|
||||||
|
|||||||
@@ -200,6 +200,7 @@ class Video(Track):
|
|||||||
height: Optional[int] = None,
|
height: Optional[int] = None,
|
||||||
fps: Optional[Union[str, int, float]] = None,
|
fps: Optional[Union[str, int, float]] = None,
|
||||||
scan_type: Optional[Video.ScanType] = None,
|
scan_type: Optional[Video.ScanType] = None,
|
||||||
|
closed_captions: Optional[list[dict[str, Any]]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
@@ -264,6 +265,7 @@ class Video(Track):
|
|||||||
raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))
|
raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))
|
||||||
|
|
||||||
self.scan_type = scan_type
|
self.scan_type = scan_type
|
||||||
|
self.closed_captions: list[dict[str, Any]] = closed_captions or []
|
||||||
self.needs_duration_fix = False
|
self.needs_duration_fix = False
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
@@ -346,22 +348,27 @@ class Video(Track):
|
|||||||
if not binaries.CCExtractor:
|
if not binaries.CCExtractor:
|
||||||
raise EnvironmentError("ccextractor executable was not found.")
|
raise EnvironmentError("ccextractor executable was not found.")
|
||||||
|
|
||||||
# ccextractor often fails in weird ways unless we repack
|
|
||||||
self.repackage()
|
|
||||||
|
|
||||||
out_path = Path(out_path)
|
out_path = Path(out_path)
|
||||||
|
|
||||||
try:
|
def _run_ccextractor() -> bool:
|
||||||
subprocess.run(
|
try:
|
||||||
[binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
|
subprocess.run(
|
||||||
check=True,
|
[binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
|
||||||
stdout=subprocess.PIPE,
|
check=True,
|
||||||
stderr=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
)
|
stderr=subprocess.PIPE,
|
||||||
except subprocess.CalledProcessError as e:
|
)
|
||||||
out_path.unlink(missing_ok=True)
|
except subprocess.CalledProcessError as e:
|
||||||
if not e.returncode == 10: # No captions found
|
out_path.unlink(missing_ok=True)
|
||||||
raise
|
if e.returncode != 10: # 10 = No captions found
|
||||||
|
raise
|
||||||
|
return out_path.exists()
|
||||||
|
|
||||||
|
# Try on the original file first (preserves container-level CC data like c608 boxes),
|
||||||
|
# then fall back to repacked file (ccextractor can fail on some container formats).
|
||||||
|
if not _run_ccextractor():
|
||||||
|
self.repackage()
|
||||||
|
_run_ccextractor()
|
||||||
|
|
||||||
if out_path.exists():
|
if out_path.exists():
|
||||||
cc_track = Subtitle(
|
cc_track = Subtitle(
|
||||||
|
|||||||
Reference in New Issue
Block a user