feat(dl): extract closed captions from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
2026-06-15 21:47:24 +00:00 · 2026-03-05 15:57:29 -07:00
parent 7dd6323be5
commit 15acaea208
4 changed files with 156 additions and 103 deletions
--- a/unshackle/commands/dl.py
+++ b/unshackle/commands/dl.py
@@ -25,6 +25,7 @@ import click
 import jsonpickle
 import yaml
 from construct import ConstError
 from langcodes import Language
 from pymediainfo import MediaInfo
 from pyplayready.cdm import Cdm as PlayReadyCdm
 from pyplayready.device import Device as PlayReadyDevice
@@ -2025,49 +2026,6 @@ class dl:
                dl_time = time_elapsed_since(dl_start_time)
                console.print(Padding(f"Track downloads finished in [progress.elapsed]{dl_time}[/]", (0, 5)))
                video_track_n = 0
                while (
                    not title.tracks.subtitles
                    and not no_subs
                    and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
                    and not video_only
                    and not no_video
                    and len(title.tracks.videos) > video_track_n
                    and any(
                        x.get("codec_name", "").startswith("eia_")
                        for x in ffprobe(title.tracks.videos[video_track_n].path).get("streams", [])
                    )
                ):
                    with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
                        try:
                            # TODO: Figure out the real language, it might be different
                            #       EIA-CC tracks sadly don't carry language information :(
                            # TODO: Figure out if the CC language is original lang or not.
                            #       Will need to figure out above first to do so.
                            video_track = title.tracks.videos[video_track_n]
                            track_id = f"ccextractor-{video_track.id}"
                            cc_lang = title.language or video_track.language
                            cc = video_track.ccextractor(
                                track_id=track_id,
                                out_path=config.directories.temp
                                / config.filenames.subtitle.format(id=track_id, language=cc_lang),
                                language=cc_lang,
                                original=False,
                            )
                            if cc:
                                # will not appear in track listings as it's added after all times it lists
                                title.tracks.add(cc)
                                self.log.info(f"Extracted a Closed Caption from Video track {video_track_n + 1}")
                            else:
                                self.log.info(f"No Closed Captions were found in Video track {video_track_n + 1}")
                        except EnvironmentError:
                            self.log.error(
                                "Cannot extract Closed Captions as the ccextractor executable was not found..."
                            )
                            break
                    video_track_n += 1
                # Subtitle output mode configuration (for sidecar originals)
                subtitle_output_mode = config.subtitle.get("output_mode", "mux")
                sidecar_format = config.subtitle.get("sidecar_format", "srt")
@@ -2133,6 +2091,57 @@ class dl:
                        if has_decrypted:
                            self.log.info(f"Decrypted tracks with {decrypt_tool}")
                # Extract Closed Captions from decrypted video tracks
                if (
                    not no_subs
                    and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
                    and not video_only
                    and not no_video
                ):
                    for video_track_n, video_track in enumerate(title.tracks.videos):
                        has_manifest_cc = bool(getattr(video_track, "closed_captions", None))
                        has_eia_cc = (
                            not has_manifest_cc
                            and not title.tracks.subtitles
                            and any(
                                x.get("codec_name", "").startswith("eia_")
                                for x in ffprobe(video_track.path).get("streams", [])
                            )
                        )
                        if not has_manifest_cc and not has_eia_cc:
                            continue
                        with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
                            try:
                                cc_lang = (
                                    Language.get(video_track.closed_captions[0]["language"])
                                    if has_manifest_cc and video_track.closed_captions[0].get("language")
                                    else title.language or video_track.language
                                )
                                track_id = f"ccextractor-{video_track.id}"
                                cc = video_track.ccextractor(
                                    track_id=track_id,
                                    out_path=config.directories.temp
                                    / config.filenames.subtitle.format(id=track_id, language=cc_lang),
                                    language=cc_lang,
                                    original=False,
                                )
                                if cc:
                                    cc.cc = True
                                    title.tracks.add(cc)
                                    self.log.info(
                                        f"Extracted a Closed Caption from Video track {video_track_n + 1}"
                                    )
                                else:
                                    self.log.info(
                                        f"No Closed Captions were found in Video track {video_track_n + 1}"
                                    )
                            except EnvironmentError:
                                self.log.error(
                                    "Cannot extract Closed Captions as the ccextractor executable was not found..."
                                )
                                break
                # Now repack the decrypted tracks
                with console.status("Repackaging tracks with FFMPEG..."):
                    has_repacked = False
--- a/unshackle/core/manifests/hls.py
+++ b/unshackle/core/manifests/hls.py
@@ -112,6 +112,15 @@ class HLS:
        session_drm = HLS.get_all_drm(session_keys)
        audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
        cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
        for media in self.manifest.media:
            if media.type == "CLOSED-CAPTIONS":
                cc_by_group_id.setdefault(media.group_id, []).append({
                    "language": media.language,
                    "name": media.name,
                    "instream_id": media.instream_id,
                    "characteristics": media.characteristics,
                })
        tracks = Tracks()
        for playlist in self.manifest.playlists:
@@ -161,6 +170,9 @@ class HLS:
                            width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
                            height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
                            fps=playlist.stream_info.frame_rate,
                            closed_captions=cc_by_group_id.get(
                                (playlist.stream_info.closed_captions or "").strip('"'), []
                            ),
                        )
                        if primary_track_type is Video
                        else {}
--- a/unshackle/core/tracks/tracks.py
+++ b/unshackle/core/tracks/tracks.py
@@ -103,53 +103,78 @@ class Tracks:
        tree = Tree("", hide_root=True)
        for track_type in self.TRACK_ORDER_MAP:
            tracks = list(x for x in all_tracks if isinstance(x, track_type))
-            if not tracks:
+            if tracks:
-                continue
+                num_tracks = len(tracks)
-            num_tracks = len(tracks)
+                track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
-            track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
+                tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
-            tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
+                for track in tracks:
-            for track in tracks:
+                    if add_progress and track_type not in (Chapter, Attachment):
-                if add_progress and track_type not in (Chapter, Attachment):
+                        progress = Progress(
-                    progress = Progress(
+                            SpinnerColumn(finished_text=""),
-                        SpinnerColumn(finished_text=""),
+                            BarColumn(),
-                        BarColumn(),
+                            "•",
-                        "•",
+                            TimeRemainingColumn(compact=True, elapsed_when_finished=True),
-                        TimeRemainingColumn(compact=True, elapsed_when_finished=True),
+                            "•",
-                        "•",
+                            TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
-                        TextColumn("[progress.data.speed]{task.fields[downloaded]}"),
+                            console=console,
-                        console=console,
+                            speed_estimate_period=10,
-                        speed_estimate_period=10,
+                        )
                        task = progress.add_task("", downloaded="-")
                        state = {"total": 100.0}
                        def update_track_progress(
                            task_id: int = task,
                            _state: dict[str, float] = state,
                            _progress: Progress = progress,
                            **kwargs,
                        ) -> None:
                            """
                            Ensure terminal status states render as a fully completed bar.
                            Some downloaders can report completed slightly below total
                            before emitting the final "Downloaded" state.
                            """
                            if "total" in kwargs and kwargs["total"] is not None:
                                _state["total"] = kwargs["total"]
                            downloaded_state = kwargs.get("downloaded")
                            if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
                                kwargs["completed"] = _state["total"]
                            _progress.update(task_id=task_id, **kwargs)
                        progress_callables.append(update_track_progress)
                        track_table = Table.grid()
                        track_table.add_row(str(track)[6:], style="text2")
                        track_table.add_row(progress)
                        tracks_tree.add(track_table)
                    else:
                        tracks_tree.add(str(track)[6:], style="text2")
            # Show Closed Captions right after Subtitles (even if no subtitle tracks exist)
            if track_type is Subtitle:
                seen_cc: set[str] = set()
                unique_cc: list[str] = []
                for video in (x for x in all_tracks if isinstance(x, Video)):
                    for cc in getattr(video, "closed_captions", []):
                        lang = cc.get("language", "und")
                        name = cc.get("name", "")
                        instream_id = cc.get("instream_id", "")
                        key = f"{lang}|{instream_id}"
                        if key in seen_cc:
                            continue
                        seen_cc.add(key)
                        parts = [f"[CC] | {lang}"]
                        if name:
                            parts.append(name)
                        if instream_id:
                            parts.append(instream_id)
                        unique_cc.append(" | ".join(parts))
                if unique_cc:
                    cc_tree = tree.add(
                        f"[repr.number]{len(unique_cc)}[/] Closed Caption{'s' if len(unique_cc) != 1 else ''}"
                    )
-                    task = progress.add_task("", downloaded="-")
+                    for cc_str in unique_cc:
-                    state = {"total": 100.0}
+                        cc_tree.add(cc_str, style="text2")
                    def update_track_progress(
                        task_id: int = task,
                        _state: dict[str, float] = state,
                        _progress: Progress = progress,
                        **kwargs,
                    ) -> None:
                        """
                        Ensure terminal status states render as a fully completed bar.
                        Some downloaders can report completed slightly below total
                        before emitting the final "Downloaded" state.
                        """
                        if "total" in kwargs and kwargs["total"] is not None:
                            _state["total"] = kwargs["total"]
                        downloaded_state = kwargs.get("downloaded")
                        if downloaded_state in {"Downloaded", "Decrypted", "[yellow]SKIPPED"}:
                            kwargs["completed"] = _state["total"]
                        _progress.update(task_id=task_id, **kwargs)
                    progress_callables.append(update_track_progress)
                    track_table = Table.grid()
                    track_table.add_row(str(track)[6:], style="text2")
                    track_table.add_row(progress)
                    tracks_tree.add(track_table)
                else:
                    tracks_tree.add(str(track)[6:], style="text2")
        return tree, progress_callables
--- a/unshackle/core/tracks/video.py
+++ b/unshackle/core/tracks/video.py
@@ -200,6 +200,7 @@ class Video(Track):
        height: Optional[int] = None,
        fps: Optional[Union[str, int, float]] = None,
        scan_type: Optional[Video.ScanType] = None,
        closed_captions: Optional[list[dict[str, Any]]] = None,
        **kwargs: Any,
    ) -> None:
        """
@@ -264,6 +265,7 @@ class Video(Track):
            raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))
        self.scan_type = scan_type
        self.closed_captions: list[dict[str, Any]] = closed_captions or []
        self.needs_duration_fix = False
    def __str__(self) -> str:
@@ -346,22 +348,27 @@ class Video(Track):
        if not binaries.CCExtractor:
            raise EnvironmentError("ccextractor executable was not found.")
        # ccextractor often fails in weird ways unless we repack
        self.repackage()
        out_path = Path(out_path)
-        try:
+        def _run_ccextractor() -> bool:
-            subprocess.run(
+            try:
-                [binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
+                subprocess.run(
-                check=True,
+                    [binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
-                stdout=subprocess.PIPE,
+                    check=True,
-                stderr=subprocess.PIPE,
+                    stdout=subprocess.PIPE,
-            )
+                    stderr=subprocess.PIPE,
-        except subprocess.CalledProcessError as e:
+                )
-            out_path.unlink(missing_ok=True)
+            except subprocess.CalledProcessError as e:
-            if not e.returncode == 10:  # No captions found
+                out_path.unlink(missing_ok=True)
-                raise
+                if e.returncode != 10:  # 10 = No captions found
                    raise
            return out_path.exists()
        # Try on the original file first (preserves container-level CC data like c608 boxes),
        # then fall back to repacked file (ccextractor can fail on some container formats).
        if not _run_ccextractor():
            self.repackage()
            _run_ccextractor()
        if out_path.exists():
            cc_track = Subtitle(