mirror of
https://github.com/unshackle-dl/unshackle.git
synced 2026-03-09 16:09:01 +00:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
abd8fc2eb9 | ||
|
|
e99cfddaec | ||
|
|
4e11f69a58 | ||
|
|
aec3333888 | ||
|
|
68ad76cbb0 | ||
|
|
18b0534020 | ||
|
|
d0cefa9d58 | ||
|
|
a01f335cfc | ||
|
|
b01fc3c8d1 | ||
|
|
44acfbdc89 |
33
CHANGELOG.md
33
CHANGELOG.md
@@ -5,6 +5,39 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [2.3.0] - 2026-01-18
|
||||
|
||||
### Added
|
||||
|
||||
- **Unicode Filenames Option**: New `unicode_filenames` config option to preserve native characters
|
||||
- Allows disabling ASCII transliteration in filenames
|
||||
- Preserves Korean, Japanese, Chinese, and other native language characters
|
||||
- Closes #49
|
||||
|
||||
### Fixed
|
||||
|
||||
- **WebVTT Cue Handling**: Handle WebVTT cue identifiers and overlapping multi-line cues
|
||||
- Added detection and sanitization for cue identifiers (Q0, Q1, etc.) before timing lines
|
||||
- Added merging of overlapping cues with different line positions into multi-line subtitles
|
||||
- Fixes parsing issues with pysubs2/pycaption on certain WebVTT files
|
||||
- **Widevine PSSH Filtering**: Filter Widevine PSSH by system ID instead of sorting
|
||||
- Fixes KeyError crash when unsupported DRM systems are present in init segments
|
||||
- **TTML Negative Values**: Handle negative values in multi-value TTML attributes
|
||||
- Fixes pycaption parse errors for attributes like `tts:extent="-5% 7.5%"`
|
||||
- Closes #47
|
||||
- **ASS Font Names**: Strip whitespace from ASS font names
|
||||
- Handles ASS subtitle files with spaces after commas in Style definitions
|
||||
- Fixes #57
|
||||
- **Shaka-Packager Error Messages**: Include shaka-packager binary path in error messages
|
||||
- **N_m3u8DL-RE Merge and Decryption**: Handle merge and decryption properly
|
||||
- Prevents audio corruption ("Box 'OG 2' size is too large") with DASH manifests
|
||||
- Fixes duplicate init segment writing when using N_m3u8DL-RE
|
||||
- **DASH Placeholder KIDs**: Handle placeholder KIDs and improve DRM init from segments
|
||||
- Detects and replaces placeholder/test KIDs in Widevine PSSH
|
||||
- Adds CENC namespace support for kid/default_KID attributes
|
||||
- **PlayReady PSSH Comparison**: Correct PSSH system ID comparison in PlayReady
|
||||
- Removes erroneous `.bytes` accessor from PSSH.SYSTEM_ID comparisons
|
||||
|
||||
## [2.2.0] - 2026-01-15
|
||||
|
||||
### Added
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "unshackle"
|
||||
version = "2.2.0"
|
||||
version = "2.3.0"
|
||||
description = "Modular Movie, TV, and Music Archival Software."
|
||||
authors = [{ name = "unshackle team" }]
|
||||
requires-python = ">=3.10,<3.13"
|
||||
|
||||
@@ -1567,7 +1567,7 @@ class dl:
|
||||
if subtitle.codec == Subtitle.Codec.SubStationAlphav4:
|
||||
for line in subtitle.path.read_text("utf8").splitlines():
|
||||
if line.startswith("Style: "):
|
||||
font_names.append(line.removesuffix("Style: ").split(",")[1])
|
||||
font_names.append(line.removeprefix("Style: ").split(",")[1].strip())
|
||||
|
||||
font_count, missing_fonts = self.attach_subtitle_fonts(
|
||||
font_names, title, temp_font_files
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "2.2.0"
|
||||
__version__ = "2.3.0"
|
||||
|
||||
@@ -95,6 +95,7 @@ class Config:
|
||||
self.update_check_interval: int = kwargs.get("update_check_interval", 24)
|
||||
self.scene_naming: bool = kwargs.get("scene_naming", True)
|
||||
self.series_year: bool = kwargs.get("series_year", True)
|
||||
self.unicode_filenames: bool = kwargs.get("unicode_filenames", False)
|
||||
|
||||
self.title_cache_time: int = kwargs.get("title_cache_time", 1800) # 30 minutes default
|
||||
self.title_cache_max_retention: int = kwargs.get("title_cache_max_retention", 86400) # 24 hours default
|
||||
|
||||
@@ -168,7 +168,7 @@ class PlayReady:
|
||||
pssh_boxes.extend(list(get_boxes(init_data, b"pssh")))
|
||||
tenc_boxes.extend(list(get_boxes(init_data, b"tenc")))
|
||||
|
||||
pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID.bytes), None)
|
||||
pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID), None)
|
||||
if not pssh:
|
||||
raise PlayReady.Exceptions.PSSHNotFound("PSSH was not found in track data.")
|
||||
|
||||
@@ -197,7 +197,7 @@ class PlayReady:
|
||||
if enc_key_id:
|
||||
kid = UUID(bytes=base64.b64decode(enc_key_id))
|
||||
|
||||
pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID.bytes), None)
|
||||
pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID), None)
|
||||
if not pssh:
|
||||
raise PlayReady.Exceptions.PSSHNotFound("PSSH was not found in track data.")
|
||||
|
||||
@@ -415,7 +415,7 @@ class PlayReady:
|
||||
p.wait()
|
||||
|
||||
if p.returncode != 0 or had_error:
|
||||
raise subprocess.CalledProcessError(p.returncode, arguments)
|
||||
raise subprocess.CalledProcessError(p.returncode, [binaries.ShakaPackager, *arguments])
|
||||
|
||||
path.unlink()
|
||||
if not stream_skipped:
|
||||
|
||||
@@ -100,9 +100,7 @@ class Widevine:
|
||||
pssh_boxes.extend(list(get_boxes(init_data, b"pssh")))
|
||||
tenc_boxes.extend(list(get_boxes(init_data, b"tenc")))
|
||||
|
||||
pssh_boxes.sort(key=lambda b: {PSSH.SystemId.Widevine: 0, PSSH.SystemId.PlayReady: 1}[b.system_ID])
|
||||
|
||||
pssh = next(iter(pssh_boxes), None)
|
||||
pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SystemId.Widevine), None)
|
||||
if not pssh:
|
||||
raise Widevine.Exceptions.PSSHNotFound("PSSH was not found in track data.")
|
||||
|
||||
@@ -141,9 +139,7 @@ class Widevine:
|
||||
if enc_key_id:
|
||||
kid = UUID(bytes=base64.b64decode(enc_key_id))
|
||||
|
||||
pssh_boxes.sort(key=lambda b: {PSSH.SystemId.Widevine: 0, PSSH.SystemId.PlayReady: 1}[b.system_ID])
|
||||
|
||||
pssh = next(iter(pssh_boxes), None)
|
||||
pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SystemId.Widevine), None)
|
||||
if not pssh:
|
||||
raise Widevine.Exceptions.PSSHNotFound("PSSH was not found in track data.")
|
||||
|
||||
@@ -371,7 +367,7 @@ class Widevine:
|
||||
p.wait()
|
||||
|
||||
if p.returncode != 0 or had_error:
|
||||
raise subprocess.CalledProcessError(p.returncode, arguments)
|
||||
raise subprocess.CalledProcessError(p.returncode, [binaries.ShakaPackager, *arguments])
|
||||
|
||||
path.unlink()
|
||||
if not stream_skipped:
|
||||
|
||||
@@ -5,6 +5,7 @@ import html
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from copy import copy
|
||||
from functools import partial
|
||||
@@ -466,7 +467,7 @@ class DASH:
|
||||
track.data["dash"]["timescale"] = int(segment_timescale)
|
||||
track.data["dash"]["segment_durations"] = segment_durations
|
||||
|
||||
if not track.drm and isinstance(track, (Video, Audio)):
|
||||
if init_data and isinstance(track, (Video, Audio)):
|
||||
if isinstance(cdm, PlayReadyCdm):
|
||||
try:
|
||||
track.drm = [PlayReady.from_init_data(init_data)]
|
||||
@@ -527,8 +528,16 @@ class DASH:
|
||||
max_workers=max_workers,
|
||||
)
|
||||
|
||||
skip_merge = False
|
||||
if downloader.__name__ == "n_m3u8dl_re":
|
||||
downloader_args.update({"filename": track.id, "track": track})
|
||||
skip_merge = True
|
||||
downloader_args.update(
|
||||
{
|
||||
"filename": track.id,
|
||||
"track": track,
|
||||
"content_keys": drm.content_keys if drm else None,
|
||||
}
|
||||
)
|
||||
|
||||
debug_logger = get_debug_logger()
|
||||
if debug_logger:
|
||||
@@ -543,6 +552,7 @@ class DASH:
|
||||
"downloader": downloader.__name__,
|
||||
"has_drm": bool(track.drm),
|
||||
"drm_types": [drm.__class__.__name__ for drm in (track.drm or [])],
|
||||
"skip_merge": skip_merge,
|
||||
"save_path": str(save_path),
|
||||
"has_init_data": bool(init_data),
|
||||
},
|
||||
@@ -563,42 +573,56 @@ class DASH:
|
||||
control_file.unlink()
|
||||
|
||||
segments_to_merge = [x for x in sorted(save_dir.iterdir()) if x.is_file()]
|
||||
with open(save_path, "wb") as f:
|
||||
if init_data:
|
||||
f.write(init_data)
|
||||
if len(segments_to_merge) > 1:
|
||||
progress(downloaded="Merging", completed=0, total=len(segments_to_merge))
|
||||
for segment_file in segments_to_merge:
|
||||
segment_data = segment_file.read_bytes()
|
||||
# TODO: fix encoding after decryption?
|
||||
if (
|
||||
not drm
|
||||
and isinstance(track, Subtitle)
|
||||
and track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML)
|
||||
):
|
||||
segment_data = try_ensure_utf8(segment_data)
|
||||
segment_data = (
|
||||
segment_data.decode("utf8")
|
||||
.replace("‎", html.unescape("‎"))
|
||||
.replace("‏", html.unescape("‏"))
|
||||
.encode("utf8")
|
||||
)
|
||||
f.write(segment_data)
|
||||
f.flush()
|
||||
segment_file.unlink()
|
||||
progress(advance=1)
|
||||
|
||||
if skip_merge:
|
||||
# N_m3u8DL-RE handles merging and decryption internally
|
||||
shutil.move(segments_to_merge[0], save_path)
|
||||
if drm:
|
||||
track.drm = None
|
||||
events.emit(events.Types.TRACK_DECRYPTED, track=track, drm=drm, segment=None)
|
||||
else:
|
||||
with open(save_path, "wb") as f:
|
||||
if init_data:
|
||||
f.write(init_data)
|
||||
if len(segments_to_merge) > 1:
|
||||
progress(downloaded="Merging", completed=0, total=len(segments_to_merge))
|
||||
for segment_file in segments_to_merge:
|
||||
segment_data = segment_file.read_bytes()
|
||||
# TODO: fix encoding after decryption?
|
||||
if (
|
||||
not drm
|
||||
and isinstance(track, Subtitle)
|
||||
and track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML)
|
||||
):
|
||||
segment_data = try_ensure_utf8(segment_data)
|
||||
segment_data = (
|
||||
segment_data.decode("utf8")
|
||||
.replace("‎", html.unescape("‎"))
|
||||
.replace("‏", html.unescape("‏"))
|
||||
.encode("utf8")
|
||||
)
|
||||
f.write(segment_data)
|
||||
f.flush()
|
||||
segment_file.unlink()
|
||||
progress(advance=1)
|
||||
|
||||
track.path = save_path
|
||||
events.emit(events.Types.TRACK_DOWNLOADED, track=track)
|
||||
|
||||
if drm:
|
||||
if not skip_merge and drm:
|
||||
progress(downloaded="Decrypting", completed=0, total=100)
|
||||
drm.decrypt(save_path)
|
||||
track.drm = None
|
||||
events.emit(events.Types.TRACK_DECRYPTED, track=track, drm=drm, segment=None)
|
||||
progress(downloaded="Decrypting", advance=100)
|
||||
|
||||
save_dir.rmdir()
|
||||
# Clean up empty segment directory
|
||||
if save_dir.exists() and save_dir.name.endswith("_segments"):
|
||||
try:
|
||||
save_dir.rmdir()
|
||||
except OSError:
|
||||
# Directory might not be empty, try removing recursively
|
||||
shutil.rmtree(save_dir, ignore_errors=True)
|
||||
|
||||
progress(downloaded="Downloaded")
|
||||
|
||||
@@ -766,6 +790,11 @@ class DASH:
|
||||
@staticmethod
|
||||
def get_drm(protections: list[Element]) -> list[DRM_T]:
|
||||
drm: list[DRM_T] = []
|
||||
PLACEHOLDER_KIDS = {
|
||||
UUID("00000000-0000-0000-0000-000000000000"), # All zeros (key rotation default)
|
||||
UUID("00010203-0405-0607-0809-0a0b0c0d0e0f"), # Sequential 0x00-0x0f
|
||||
UUID("00010203-0405-0607-0809-101112131415"), # Shaka Packager test pattern
|
||||
}
|
||||
|
||||
for protection in protections:
|
||||
urn = (protection.get("schemeIdUri") or "").lower()
|
||||
@@ -775,17 +804,27 @@ class DASH:
|
||||
if not pssh_text:
|
||||
continue
|
||||
pssh = PSSH(pssh_text)
|
||||
kid_attr = protection.get("kid") or protection.get("{urn:mpeg:cenc:2013}kid")
|
||||
kid = UUID(bytes=base64.b64decode(kid_attr)) if kid_attr else None
|
||||
|
||||
kid = protection.get("kid")
|
||||
if kid:
|
||||
kid = UUID(bytes=base64.b64decode(kid))
|
||||
if not kid:
|
||||
default_kid_attr = protection.get("default_KID") or protection.get(
|
||||
"{urn:mpeg:cenc:2013}default_KID"
|
||||
)
|
||||
kid = UUID(default_kid_attr) if default_kid_attr else None
|
||||
|
||||
default_kid = protection.get("default_KID")
|
||||
if default_kid:
|
||||
kid = UUID(default_kid)
|
||||
if not kid:
|
||||
kid = next(
|
||||
(
|
||||
UUID(p.get("default_KID") or p.get("{urn:mpeg:cenc:2013}default_KID"))
|
||||
for p in protections
|
||||
if p.get("default_KID") or p.get("{urn:mpeg:cenc:2013}default_KID")
|
||||
),
|
||||
None,
|
||||
)
|
||||
|
||||
if not pssh.key_ids and not kid:
|
||||
kid = next((UUID(p.get("default_KID")) for p in protections if p.get("default_KID")), None)
|
||||
if kid and (not pssh.key_ids or all(k.int == 0 or k in PLACEHOLDER_KIDS for k in pssh.key_ids)):
|
||||
pssh.set_key_ids([kid])
|
||||
|
||||
drm.append(Widevine(pssh=pssh, kid=kid))
|
||||
|
||||
|
||||
@@ -91,6 +91,12 @@ class Subtitle(Track):
|
||||
return Subtitle.Codec.TimedTextMarkupLang
|
||||
raise ValueError(f"The Content Profile '{profile}' is not a supported Subtitle Codec")
|
||||
|
||||
# WebVTT sanitization patterns (compiled once for performance)
|
||||
_CUE_ID_PATTERN = re.compile(r"^[A-Za-z]+\d+$")
|
||||
_TIMING_START_PATTERN = re.compile(r"^\d+:\d+[:\.]")
|
||||
_TIMING_LINE_PATTERN = re.compile(r"^((?:\d+:)?\d+:\d+[.,]\d+)\s*-->\s*((?:\d+:)?\d+:\d+[.,]\d+)(.*)$")
|
||||
_LINE_POS_PATTERN = re.compile(r"line:(\d+(?:\.\d+)?%?)")
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
@@ -239,6 +245,11 @@ class Subtitle(Track):
|
||||
|
||||
# Sanitize WebVTT timestamps before parsing
|
||||
text = Subtitle.sanitize_webvtt_timestamps(text)
|
||||
# Remove cue identifiers that confuse parsers like pysubs2
|
||||
text = Subtitle.sanitize_webvtt_cue_identifiers(text)
|
||||
# Merge overlapping cues with line positioning into single multi-line cues
|
||||
text = Subtitle.merge_overlapping_webvtt_cues(text)
|
||||
|
||||
preserve_formatting = config.subtitle.get("preserve_formatting", True)
|
||||
|
||||
if preserve_formatting:
|
||||
@@ -277,6 +288,240 @@ class Subtitle(Track):
|
||||
# Replace negative timestamps with 00:00:00.000
|
||||
return re.sub(r"(-\d+:\d+:\d+\.\d+)", "00:00:00.000", text)
|
||||
|
||||
@staticmethod
def has_webvtt_cue_identifiers(text: str) -> bool:
    """
    Report whether the WebVTT text contains cue identifiers that should be stripped.

    A line counts as a cue identifier when it matches `_CUE_ID_PATTERN`
    (letters followed by digits, e.g. "Q0") and the next non-blank line
    looks like a timing line (contains "-->" or starts like a timestamp).

    Parameters:
        text: The WebVTT content as string

    Returns:
        True if at least one such cue identifier is found, False otherwise
    """
    rows = text.split("\n")
    total = len(rows)

    for idx, raw in enumerate(rows):
        if not Subtitle._CUE_ID_PATTERN.match(raw.strip()):
            continue

        # First non-blank line after the candidate identifier, if there is one
        follower = next((rows[k] for k in range(idx + 1, total) if rows[k].strip()), None)
        if follower is None:
            continue

        if "-->" in follower or Subtitle._TIMING_START_PATTERN.match(follower.strip()):
            return True

    return False
|
||||
|
||||
@staticmethod
def sanitize_webvtt_cue_identifiers(text: str) -> str:
    """
    Strip WebVTT cue identifier lines that can confuse subtitle parsers.

    Some services emit identifiers such as "Q0", "Q1", ... on their own line
    ahead of the timing line. Certain parsers (like pysubs2) can mistake
    them for text belonging to the previous cue, so they are dropped here.
    A line is only removed when it matches `_CUE_ID_PATTERN` and the next
    non-blank line looks like a timing line.

    Parameters:
        text: The WebVTT content as string

    Returns:
        The WebVTT content without cue identifier lines; the input is
        returned unchanged when no identifiers are detected
    """
    if not Subtitle.has_webvtt_cue_identifiers(text):
        return text

    rows = text.split("\n")
    total = len(rows)
    kept = []

    for idx, raw in enumerate(rows):
        if Subtitle._CUE_ID_PATTERN.match(raw.strip()):
            # Skip over blank lines to find what follows the candidate identifier
            probe = idx + 1
            while probe < total and not rows[probe].strip():
                probe += 1

            if probe < total:
                follower = rows[probe]
                if "-->" in follower or Subtitle._TIMING_START_PATTERN.match(follower.strip()):
                    # Confirmed cue identifier: leave it out of the output
                    continue

        kept.append(raw)

    return "\n".join(kept)
|
||||
|
||||
@staticmethod
def _parse_vtt_time(t: str) -> int:
    """
    Convert a WebVTT timestamp ("HH:MM:SS.mmm" or "MM:SS.mmm") to milliseconds.

    A comma is accepted in place of the decimal point. The fractional part
    is right-padded or truncated to three digits, so ".5", ".50" and ".500"
    all mean 500 ms. Malformed input yields 0 rather than raising.

    Parameters:
        t: The timestamp string

    Returns:
        The timestamp in milliseconds, or 0 if it cannot be parsed
    """
    pieces = t.replace(",", ".").split(":")

    if len(pieces) == 2:
        hours = "0"
        minutes, seconds = pieces
    elif len(pieces) >= 3:
        # Anything beyond the third field is ignored
        hours, minutes, seconds = pieces[:3]
    else:
        return 0

    whole, *fraction = seconds.split(".")

    try:
        millis = int(fraction[0].ljust(3, "0")[:3]) if fraction else 0
        return (int(hours) * 60 + int(minutes)) * 60000 + int(whole) * 1000 + millis
    except ValueError:
        return 0
|
||||
|
||||
@staticmethod
def has_overlapping_webvtt_cues(text: str) -> bool:
    """
    Report whether the WebVTT text contains overlapping cues that need merging.

    Two consecutive timing lines are considered overlapping when their start
    times are at most 50ms apart and their end times are identical, which
    indicates a multi-line subtitle that was split into separate cues.

    Parameters:
        text: The WebVTT content as string

    Returns:
        True if any pair of consecutive cues overlaps, False otherwise
    """
    spans = []
    for row in text.split("\n"):
        hit = Subtitle._TIMING_LINE_PATTERN.match(row)
        if hit is None:
            continue
        spans.append((Subtitle._parse_vtt_time(hit.group(1)), Subtitle._parse_vtt_time(hit.group(2))))

    # Compare each cue with the one directly after it
    return any(
        abs(first_start - second_start) <= 50 and first_end == second_end
        for (first_start, first_end), (second_start, second_end) in zip(spans, spans[1:])
    )
|
||||
|
||||
@staticmethod
def merge_overlapping_webvtt_cues(text: str) -> str:
    """
    Merge WebVTT cues that have overlapping/near-identical times but different line positions.

    Some services use separate cues for each line of a multi-line subtitle, with
    slightly different start times (1ms apart) and different line: positions.
    This merges them into single cues with proper line ordering based on the
    line: position (lower percentage = higher on screen = first line).

    Cues are grouped with the first cue of a run when their start times are
    within 50ms of it and their end times are equal. A merged cue uses the
    earliest start time of its group and carries no cue settings. When a
    rewrite happens, only the header (everything before the first timing
    line) and the cues themselves are emitted; other lines between cues are
    not carried over.

    Parameters:
        text: The WebVTT content as string

    Returns:
        WebVTT content with overlapping cues merged; the input is returned
        unchanged when no overlapping cues are detected
    """
    if not Subtitle.has_overlapping_webvtt_cues(text):
        return text

    lines = text.split("\n")
    cues = []
    header_lines = []
    in_header = True
    i = 0

    while i < len(lines):
        line = lines[i]

        if in_header:
            if "-->" in line:
                # First timing line ends the header; fall through to parse it
                in_header = False
            else:
                header_lines.append(line)
                i += 1
                continue

        match = Subtitle._TIMING_LINE_PATTERN.match(line)
        if not match:
            i += 1
            continue

        start_str, end_str, settings = match.groups()

        line_pos = 100.0  # Default to bottom
        line_match = Subtitle._LINE_POS_PATTERN.search(settings)
        if line_match:
            line_pos = float(line_match.group(1).rstrip("%"))

        # Cue payload runs until a blank line or the next timing line
        content_lines = []
        i += 1
        while i < len(lines) and lines[i].strip() and "-->" not in lines[i]:
            content_lines.append(lines[i])
            i += 1

        cues.append(
            {
                "start_ms": Subtitle._parse_vtt_time(start_str),
                "end_ms": Subtitle._parse_vtt_time(end_str),
                "start_str": start_str,
                "end_str": end_str,
                "line_pos": line_pos,
                "content": "\n".join(content_lines),
                "settings": settings,
            }
        )

    # Merge overlapping cues (within 50ms of each other with same end time)
    merged_cues = []
    i = 0
    while i < len(cues):
        current = cues[i]

        j = i + 1
        while (
            j < len(cues)
            and abs(current["start_ms"] - cues[j]["start_ms"]) <= 50
            and current["end_ms"] == cues[j]["end_ms"]
        ):
            j += 1
        group = cues[i:j]

        if len(group) > 1:
            # Sort by line position (lower % = higher on screen = first)
            group.sort(key=lambda c: c["line_pos"])
            # Use the earliest start time from the group
            earliest = min(group, key=lambda c: c["start_ms"])
            merged_cues.append(
                {
                    "start_str": earliest["start_str"],
                    "end_str": group[0]["end_str"],
                    # Skip empty payloads: a blank line inside a cue would
                    # terminate it and orphan the remaining text
                    "content": "\n".join(c["content"] for c in group if c["content"]),
                    "settings": "",
                }
            )
        else:
            merged_cues.append(
                {
                    "start_str": current["start_str"],
                    "end_str": current["end_str"],
                    "content": current["content"],
                    "settings": current["settings"],
                }
            )

        # j is always at least i + 1, so this also advances past single cues
        i = j

    result_lines = header_lines[:]
    if result_lines and result_lines[-1].strip():
        # Keep a blank separator between the header and the first cue
        result_lines.append("")

    for cue in merged_cues:
        result_lines.append(f"{cue['start_str']} --> {cue['end_str']}{cue['settings']}")
        result_lines.append(cue["content"])
        result_lines.append("")

    return "\n".join(result_lines)
|
||||
|
||||
@staticmethod
|
||||
def sanitize_webvtt(text: str) -> str:
|
||||
"""
|
||||
@@ -631,7 +876,7 @@ class Subtitle(Track):
|
||||
text = try_ensure_utf8(data).decode("utf8")
|
||||
text = text.replace("tt:", "")
|
||||
# negative size values aren't allowed in TTML/DFXP spec, replace with 0
|
||||
text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text)
|
||||
text = re.sub(r"-(\d+(?:\.\d+)?)(px|em|%|c|pt)", r"0\2", text)
|
||||
caption_set = pycaption.DFXPReader().read(text)
|
||||
elif codec == Subtitle.Codec.fVTT:
|
||||
caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList)
|
||||
|
||||
@@ -120,9 +120,14 @@ def sanitize_filename(filename: str, spacer: str = ".") -> str:
|
||||
|
||||
The spacer is safer to be a '.' for older DDL and p2p sharing spaces.
|
||||
This includes web-served content via direct links and such.
|
||||
|
||||
Set `unicode_filenames: true` in config to preserve native language
|
||||
characters (Korean, Japanese, Chinese, etc.) instead of transliterating
|
||||
them to ASCII equivalents.
|
||||
"""
|
||||
# replace all non-ASCII characters with ASCII equivalents
|
||||
filename = unidecode(filename)
|
||||
# optionally replace non-ASCII characters with ASCII equivalents
|
||||
if not config.unicode_filenames:
|
||||
filename = unidecode(filename)
|
||||
|
||||
# remove or replace further characters as needed
|
||||
filename = "".join(c for c in filename if unicodedata.category(c) != "Mn") # hidden characters
|
||||
|
||||
Reference in New Issue
Block a user