feat(subtitle): Add filtering for unwanted cues in WebVTT subtitles

feat(dl): Add option to include forced subtitle tracks
2026-06-15 13:37:24 +00:00 · 2025-08-03 22:10:17 +00:00 · 2025-08-03 22:00:21 +00:00
2 changed files with 28 additions and 1 deletion
--- a/unshackle/commands/dl.py
+++ b/unshackle/commands/dl.py
@@ -148,6 +148,7 @@ class dl:
        help="Language wanted for Video, you would use this if the video language doesn't match the audio.",
    )
    @click.option("-sl", "--s-lang", type=LANGUAGE_RANGE, default=["all"], help="Language wanted for Subtitles.")
+    @click.option("-fs", "--forced-subs", is_flag=True, default=False, help="Include forced subtitle tracks.")
    @click.option(
        "--proxy",
        type=str,
@@ -405,6 +406,7 @@ class dl:
        lang: list[str],
        v_lang: list[str],
        s_lang: list[str],
+        forced_subs: bool,
        sub_format: Optional[Subtitle.Codec],
        video_only: bool,
        audio_only: bool,
@@ -672,7 +674,8 @@ class dl:
                            self.log.error(f"There's no {s_lang} Subtitle Track...")
                            sys.exit(1)

-                    title.tracks.select_subtitles(lambda x: not x.forced or is_close_match(x.language, lang))
+                    if not forced_subs:
+                        title.tracks.select_subtitles(lambda x: not x.forced or is_close_match(x.language, lang))

                # filter audio tracks
                # might have no audio tracks if part of the video, e.g. transport stream hls
--- a/unshackle/core/tracks/subtitle.py
+++ b/unshackle/core/tracks/subtitle.py
@@ -233,6 +233,7 @@ class Subtitle(Track):
            try:
                caption_set = pycaption.WebVTTReader().read(text)
                Subtitle.merge_same_cues(caption_set)
+                Subtitle.filter_unwanted_cues(caption_set)
                subtitle_text = pycaption.WebVTTWriter().write(caption_set)
                self.path.write_text(subtitle_text, encoding="utf8")
            except pycaption.exceptions.CaptionReadSyntaxError:
@@ -241,6 +242,7 @@ class Subtitle(Track):
                try:
                    caption_set = pycaption.WebVTTReader().read(text)
                    Subtitle.merge_same_cues(caption_set)
+                    Subtitle.filter_unwanted_cues(caption_set)
                    subtitle_text = pycaption.WebVTTWriter().write(caption_set)
                    self.path.write_text(subtitle_text, encoding="utf8")
                except Exception:
@@ -444,6 +446,8 @@ class Subtitle(Track):

        caption_set = self.parse(self.path.read_bytes(), self.codec)
        Subtitle.merge_same_cues(caption_set)
+        if codec == Subtitle.Codec.WebVTT:
+            Subtitle.filter_unwanted_cues(caption_set)
        subtitle_text = writer().write(caption_set)

        output_path.write_text(subtitle_text, encoding="utf8")
@@ -520,6 +524,8 @@ class Subtitle(Track):

            caption_set = self.parse(self.path.read_bytes(), self.codec)
            Subtitle.merge_same_cues(caption_set)
+            if codec == Subtitle.Codec.WebVTT:
+                Subtitle.filter_unwanted_cues(caption_set)
            subtitle_text = writer().write(caption_set)

            output_path.write_text(subtitle_text, encoding="utf8")
@@ -681,6 +687,24 @@ class Subtitle(Track):
            if merged_captions:
                caption_set.set_captions(lang, merged_captions)

+    @staticmethod
+    def filter_unwanted_cues(caption_set: pycaption.CaptionSet):
+        """
+        Drop every cue whose text is empty or consists solely of whitespace and "&nbsp;" entities.
+        """
+        blank_chars = frozenset(" \t\n\r\xa0")
+
+        for language in caption_set.get_languages():
+            cues = caption_set.get_captions(language)
+            kept = pycaption.CaptionList()
+            for cue in cues:
+                # Treat a literal "&nbsp;" entity as the non-breaking space it stands for.
+                normalized = cue.get_text().strip().replace("&nbsp;", "\xa0")
+                if not blank_chars.issuperset(normalized):
+                    kept.append(cue)
+
+            caption_set.set_captions(language, kept)
+
    @staticmethod
    def merge_segmented_wvtt(data: bytes, period_start: float = 0.0) -> tuple[CaptionList, Optional[str]]:
        """
Author	SHA1	Message	Date
Andy	f8a58d966b	feat(subtitle): Add filtering for unwanted cues in WebVTT subtitles	2025-08-03 22:10:17 +00:00
Andy	8d12b735ff	feat(dl): Add option to include forced subtitle tracks	2025-08-03 22:00:21 +00:00