feat(dl): extract closed captions from HLS manifests and improve CC extraction

- Parse CLOSED-CAPTIONS entries from HLS manifests and attach CC metadata (language, name, instream_id) to video tracks
- Move CC extraction to run after decryption instead of before, fixing extraction failures on encrypted streams
- Extract CCs even when other subtitle tracks exist, using manifest CC language info instead of guessing
- Try ccextractor on the original file before repacking to preserve container-level CC data (e.g. c608 boxes) that ffmpeg remux strips
- Display deduplicated closed captions in --list output and download progress, positioned after subtitles
- Add closed_captions field to Video track class
2026-03-12 09:29:02 +00:00 · 2026-03-05 15:57:29 -07:00
parent 7dd6323be5
commit 15acaea208
4 changed files with 156 additions and 103 deletions
--- a/unshackle/commands/dl.py
+++ b/unshackle/commands/dl.py
@@ -25,6 +25,7 @@ import click
 import jsonpickle
 import yaml
 from construct import ConstError
+from langcodes import Language
 from pymediainfo import MediaInfo
 from pyplayready.cdm import Cdm as PlayReadyCdm
 from pyplayready.device import Device as PlayReadyDevice
@@ -2025,49 +2026,6 @@ class dl:
                dl_time = time_elapsed_since(dl_start_time)
                console.print(Padding(f"Track downloads finished in [progress.elapsed]{dl_time}[/]", (0, 5)))

-                video_track_n = 0
-
-                while (
-                    not title.tracks.subtitles
-                    and not no_subs
-                    and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
-                    and not video_only
-                    and not no_video
-                    and len(title.tracks.videos) > video_track_n
-                    and any(
-                        x.get("codec_name", "").startswith("eia_")
-                        for x in ffprobe(title.tracks.videos[video_track_n].path).get("streams", [])
-                    )
-                ):
-                    with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
-                        try:
-                            # TODO: Figure out the real language, it might be different
-                            #       EIA-CC tracks sadly don't carry language information :(
-                            # TODO: Figure out if the CC language is original lang or not.
-                            #       Will need to figure out above first to do so.
-                            video_track = title.tracks.videos[video_track_n]
-                            track_id = f"ccextractor-{video_track.id}"
-                            cc_lang = title.language or video_track.language
-                            cc = video_track.ccextractor(
-                                track_id=track_id,
-                                out_path=config.directories.temp
-                                / config.filenames.subtitle.format(id=track_id, language=cc_lang),
-                                language=cc_lang,
-                                original=False,
-                            )
-                            if cc:
-                                # will not appear in track listings as it's added after all times it lists
-                                title.tracks.add(cc)
-                                self.log.info(f"Extracted a Closed Caption from Video track {video_track_n + 1}")
-                            else:
-                                self.log.info(f"No Closed Captions were found in Video track {video_track_n + 1}")
-                        except EnvironmentError:
-                            self.log.error(
-                                "Cannot extract Closed Captions as the ccextractor executable was not found..."
-                            )
-                            break
-                    video_track_n += 1
-
                # Subtitle output mode configuration (for sidecar originals)
                subtitle_output_mode = config.subtitle.get("output_mode", "mux")
                sidecar_format = config.subtitle.get("sidecar_format", "srt")
@@ -2133,6 +2091,57 @@ class dl:
                        if has_decrypted:
                            self.log.info(f"Decrypted tracks with {decrypt_tool}")

+                # Extract Closed Captions from decrypted video tracks
+                if (
+                    not no_subs
+                    and not (hasattr(service, "NO_SUBTITLES") and service.NO_SUBTITLES)
+                    and not video_only
+                    and not no_video
+                ):
+                    for video_track_n, video_track in enumerate(title.tracks.videos):
+                        has_manifest_cc = bool(getattr(video_track, "closed_captions", None))
+                        has_eia_cc = (
+                            not has_manifest_cc
+                            and not title.tracks.subtitles
+                            and any(
+                                x.get("codec_name", "").startswith("eia_")
+                                for x in ffprobe(video_track.path).get("streams", [])
+                            )
+                        )
+                        if not has_manifest_cc and not has_eia_cc:
+                            continue
+
+                        with console.status(f"Checking Video track {video_track_n + 1} for Closed Captions..."):
+                            try:
+                                cc_lang = (
+                                    Language.get(video_track.closed_captions[0]["language"])
+                                    if has_manifest_cc and video_track.closed_captions[0].get("language")
+                                    else title.language or video_track.language
+                                )
+                                track_id = f"ccextractor-{video_track.id}"
+                                cc = video_track.ccextractor(
+                                    track_id=track_id,
+                                    out_path=config.directories.temp
+                                    / config.filenames.subtitle.format(id=track_id, language=cc_lang),
+                                    language=cc_lang,
+                                    original=False,
+                                )
+                                if cc:
+                                    cc.cc = True
+                                    title.tracks.add(cc)
+                                    self.log.info(
+                                        f"Extracted a Closed Caption from Video track {video_track_n + 1}"
+                                    )
+                                else:
+                                    self.log.info(
+                                        f"No Closed Captions were found in Video track {video_track_n + 1}"
+                                    )
+                            except EnvironmentError:
+                                self.log.error(
+                                    "Cannot extract Closed Captions as the ccextractor executable was not found..."
+                                )
+                                break
+
                # Now repack the decrypted tracks
                with console.status("Repackaging tracks with FFMPEG..."):
                    has_repacked = False
--- a/unshackle/core/manifests/hls.py
+++ b/unshackle/core/manifests/hls.py
@@ -112,6 +112,15 @@ class HLS:
        session_drm = HLS.get_all_drm(session_keys)

        audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
+        cc_by_group_id: dict[str, list[dict[str, Any]]] = {}
+        for media in self.manifest.media:
+            if media.type == "CLOSED-CAPTIONS":
+                cc_by_group_id.setdefault(media.group_id, []).append({
+                    "language": media.language,
+                    "name": media.name,
+                    "instream_id": media.instream_id,
+                    "characteristics": media.characteristics,
+                })
        tracks = Tracks()

        for playlist in self.manifest.playlists:
@@ -161,6 +170,9 @@ class HLS:
                            width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
                            height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
                            fps=playlist.stream_info.frame_rate,
+                            closed_captions=cc_by_group_id.get(
+                                (playlist.stream_info.closed_captions or "").strip('"'), []
+                            ),
                        )
                        if primary_track_type is Video
                        else {}
--- a/unshackle/core/tracks/tracks.py
+++ b/unshackle/core/tracks/tracks.py
@@ -103,8 +103,7 @@ class Tracks:
        tree = Tree("", hide_root=True)
        for track_type in self.TRACK_ORDER_MAP:
            tracks = list(x for x in all_tracks if isinstance(x, track_type))
-            if not tracks:
-                continue
+            if tracks:
                num_tracks = len(tracks)
                track_type_plural = track_type.__name__ + ("s" if track_type != Audio and num_tracks != 1 else "")
                tracks_tree = tree.add(f"[repr.number]{num_tracks}[/] {track_type_plural}")
@@ -151,6 +150,32 @@ class Tracks:
                    else:
                        tracks_tree.add(str(track)[6:], style="text2")

+            # Show Closed Captions right after Subtitles (even if no subtitle tracks exist)
+            if track_type is Subtitle:
+                seen_cc: set[str] = set()
+                unique_cc: list[str] = []
+                for video in (x for x in all_tracks if isinstance(x, Video)):
+                    for cc in getattr(video, "closed_captions", []):
+                        lang = cc.get("language", "und")
+                        name = cc.get("name", "")
+                        instream_id = cc.get("instream_id", "")
+                        key = f"{lang}|{instream_id}"
+                        if key in seen_cc:
+                            continue
+                        seen_cc.add(key)
+                        parts = [f"[CC] | {lang}"]
+                        if name:
+                            parts.append(name)
+                        if instream_id:
+                            parts.append(instream_id)
+                        unique_cc.append(" | ".join(parts))
+                if unique_cc:
+                    cc_tree = tree.add(
+                        f"[repr.number]{len(unique_cc)}[/] Closed Caption{'s' if len(unique_cc) != 1 else ''}"
+                    )
+                    for cc_str in unique_cc:
+                        cc_tree.add(cc_str, style="text2")
+
        return tree, progress_callables

    def exists(self, by_id: Optional[str] = None, by_url: Optional[Union[str, list[str]]] = None) -> bool:
--- a/unshackle/core/tracks/video.py
+++ b/unshackle/core/tracks/video.py
@@ -200,6 +200,7 @@ class Video(Track):
        height: Optional[int] = None,
        fps: Optional[Union[str, int, float]] = None,
        scan_type: Optional[Video.ScanType] = None,
+        closed_captions: Optional[list[dict[str, Any]]] = None,
        **kwargs: Any,
    ) -> None:
        """
@@ -264,6 +265,7 @@ class Video(Track):
            raise ValueError("Expected fps to be a number, float, or a string as numerator/denominator form, " + str(e))

        self.scan_type = scan_type
+        self.closed_captions: list[dict[str, Any]] = closed_captions or []
        self.needs_duration_fix = False

    def __str__(self) -> str:
@@ -346,11 +348,9 @@ class Video(Track):
        if not binaries.CCExtractor:
            raise EnvironmentError("ccextractor executable was not found.")

-        # ccextractor often fails in weird ways unless we repack
-        self.repackage()
-
        out_path = Path(out_path)

+        def _run_ccextractor() -> bool:
            try:
                subprocess.run(
                    [binaries.CCExtractor, "-trim", "-nobom", "-noru", "-ru1", "-o", out_path, self.path],
@@ -360,8 +360,15 @@ class Video(Track):
                )
            except subprocess.CalledProcessError as e:
                out_path.unlink(missing_ok=True)
-            if not e.returncode == 10:  # No captions found
+                if e.returncode != 10:  # 10 = No captions found
                    raise
+            return out_path.exists()
+
+        # Try on the original file first (preserves container-level CC data like c608 boxes),
+        # then fall back to repacked file (ccextractor can fail on some container formats).
+        if not _run_ccextractor():
+            self.repackage()
+            _run_ccextractor()

        if out_path.exists():
            cc_track = Subtitle(