diff --git a/CHANGELOG.md b/CHANGELOG.md index 9625999..343b09e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,64 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.3.0] - 2026-01-18 + +### Added + +- **Unicode Filenames Option**: New `unicode_filenames` config option to preserve native characters + - Allows disabling ASCII transliteration in filenames + - Preserves Korean, Japanese, Chinese, and other native language characters + - Closes #49 + +### Fixed + +- **WebVTT Cue Handling**: Handle WebVTT cue identifiers and overlapping multi-line cues + - Added detection and sanitization for cue identifiers (Q0, Q1, etc.) before timing lines + - Added merging of overlapping cues with different line positions into multi-line subtitles + - Fixes parsing issues with pysubs2/pycaption on certain WebVTT files +- **Widevine PSSH Filtering**: Filter Widevine PSSH by system ID instead of sorting + - Fixes KeyError crash when unsupported DRM systems are present in init segments +- **TTML Negative Values**: Handle negative values in multi-value TTML attributes + - Fixes pycaption parse errors for attributes like `tts:extent="-5% 7.5%"` + - Closes #47 +- **ASS Font Names**: Strip whitespace from ASS font names + - Handles ASS subtitle files with spaces after commas in Style definitions + - Fixes #57 +- **Shaka-Packager Error Messages**: Include shaka-packager binary path in error messages +- **N_m3u8DL-RE Merge and Decryption**: Handle merge and decryption properly + - Prevents audio corruption ("Box 'OG 2' size is too large") with DASH manifests + - Fixes duplicate init segment writing when using N_m3u8DL-RE +- **DASH Placeholder KIDs**: Handle placeholder KIDs and improve DRM init from segments + - Detects and replaces placeholder/test KIDs in Widevine PSSH + - Adds CENC namespace support for kid/default_KID attributes +- **PlayReady PSSH Comparison**: Correct PSSH system ID comparison in PlayReady + - Removes erroneous `.bytes` accessor from PSSH.SYSTEM_ID comparisons + +## [2.2.0] - 2026-01-15 + +### Added + +- **CDM-Aware PlayReady Fallback Detection**: Intelligent DRM fallback based on selected CDM + - Adds PlayReady PSSH/KID extraction from track and init data with CDM-aware ordering + - When PlayReady CDM is selected, tries PlayReady first then falls back to Widevine + - When Widevine CDM is selected (default), tries Widevine first then falls back to PlayReady +- **Comprehensive Debug Logging**: Enhanced debug logging for downloaders and muxing + - Added detailed debug logging to aria2c, curl_impersonate, n_m3u8dl_re, and requests downloaders + - Enhanced manifest parsers (DASH, HLS, ISM) with debug logging + - Added debug logging to track muxing operations + +### Fixed + +- **Hybrid DV+HDR10 Filename Detection**: Fixed HDR10 detection in hybrid Dolby Vision filenames + - Hybrid DV+HDR10 files were incorrectly named "DV.H.265" instead of "DV.HDR.H.265" + - Now checks both `hdr_format_full` and `hdr_format_commercial` fields for HDR10 indicators +- **Vault Adaptive Batch Sizing**: Improved bulk key operations with adaptive batch sizing + - Prevents query limit issues when retrieving large numbers of keys from vaults + - Dynamically adjusts batch sizes based on vault response characteristics +- **Test Command Improvements**: Enhanced test command error detection and sorting + - Improved error detection in test command output + - Added natural sorting for test results + ## [2.1.0] - 2025-11-27 ### Added diff --git a/pyproject.toml b/pyproject.toml index 5c91c9b..80d989d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "unshackle" -version = "2.1.0" +version = "2.3.0" description = "Modular Movie, TV, and Music Archival Software." authors = [{ name = "unshackle team" }] requires-python = ">=3.10,<3.13" diff --git a/unshackle/commands/dl.py b/unshackle/commands/dl.py index cfd09ec..71549a3 100644 --- a/unshackle/commands/dl.py +++ b/unshackle/commands/dl.py @@ -1567,7 +1567,7 @@ class dl: if subtitle.codec == Subtitle.Codec.SubStationAlphav4: for line in subtitle.path.read_text("utf8").splitlines(): if line.startswith("Style: "): - font_names.append(line.removesuffix("Style: ").split(",")[1]) + font_names.append(line.removeprefix("Style: ").split(",")[1].strip()) font_count, missing_fonts = self.attach_subtitle_fonts( font_names, title, temp_font_files diff --git a/unshackle/commands/util.py b/unshackle/commands/util.py index b9c6b84..612ac2b 100644 --- a/unshackle/commands/util.py +++ b/unshackle/commands/util.py @@ -1,3 +1,4 @@ +import re import subprocess from pathlib import Path @@ -8,6 +9,11 @@ from unshackle.core import binaries from unshackle.core.constants import context_settings +def _natural_sort_key(path: Path) -> list: + """Sort key for natural sorting (S01E01 before S01E10).""" + return [int(part) if part.isdigit() else part.lower() for part in re.split(r"(\d+)", path.name)] + + @click.group(short_help="Various helper scripts and programs.", context_settings=context_settings) def util() -> None: """Various helper scripts and programs.""" @@ -49,7 +55,7 @@ def crop(path: Path, aspect: str, letter: bool, offset: int, preview: bool) -> N raise click.ClickException('FFmpeg executable "ffmpeg" not found but is required.') if path.is_dir(): - paths = list(path.glob("*.mkv")) + list(path.glob("*.mp4")) + paths = sorted(list(path.glob("*.mkv")) + list(path.glob("*.mp4")), key=_natural_sort_key) else: paths = [path] for video_path in paths: @@ -140,7 +146,7 @@ def range_(path: Path, full: bool, preview: bool) -> None: raise click.ClickException('FFmpeg executable "ffmpeg" not found but is required.') if path.is_dir(): - paths = list(path.glob("*.mkv")) + list(path.glob("*.mp4")) + paths = sorted(list(path.glob("*.mkv")) + list(path.glob("*.mp4")), key=_natural_sort_key) else: paths = [path] for video_path in paths: @@ -225,16 +231,18 @@ def test(path: Path, map_: str) -> None: raise click.ClickException('FFmpeg executable "ffmpeg" not found but is required.') if path.is_dir(): - paths = list(path.glob("*.mkv")) + list(path.glob("*.mp4")) + paths = sorted(list(path.glob("*.mkv")) + list(path.glob("*.mp4")), key=_natural_sort_key) else: paths = [path] for video_path in paths: - print("Starting...") + print(f"Testing: {video_path.name}") p = subprocess.Popen( [ binaries.FFMPEG, "-hide_banner", "-benchmark", + "-err_detect", + "+crccheck+bitstream+buffer+careful+compliant+aggressive", "-i", str(video_path), "-map", @@ -255,13 +263,13 @@ def test(path: Path, map_: str) -> None: reached_output = True if not reached_output: continue - if line.startswith("["): # error of some kind + if line.startswith("[") and not line.startswith("[out#"): errors += 1 stream, error = line.split("] ", maxsplit=1) stream = stream.split(" @ ")[0] line = f"{stream} ERROR: {error}" print(line) p.stderr.close() - print(f"Finished with {errors} Errors, Cleaning up...") + print(f"Finished with {errors} error(s)") p.terminate() p.wait() diff --git a/unshackle/core/__init__.py b/unshackle/core/__init__.py index 9aa3f90..55e4709 100644 --- a/unshackle/core/__init__.py +++ b/unshackle/core/__init__.py @@ -1 +1 @@ -__version__ = "2.1.0" +__version__ = "2.3.0" diff --git a/unshackle/core/config.py b/unshackle/core/config.py index 6eb7b26..1c50d62 100644 --- a/unshackle/core/config.py +++ b/unshackle/core/config.py @@ -95,6 +95,7 @@ class Config: self.update_check_interval: int = kwargs.get("update_check_interval", 24) self.scene_naming: bool = kwargs.get("scene_naming", True) self.series_year: bool = kwargs.get("series_year", True) + self.unicode_filenames: bool = kwargs.get("unicode_filenames", False) self.title_cache_time: int = kwargs.get("title_cache_time", 1800) # 30 minutes default self.title_cache_max_retention: int = kwargs.get("title_cache_max_retention", 86400) # 24 hours default diff --git a/unshackle/core/downloaders/aria2c.py b/unshackle/core/downloaders/aria2c.py index bc43460..6f5b5d0 100644 --- a/unshackle/core/downloaders/aria2c.py +++ b/unshackle/core/downloaders/aria2c.py @@ -19,7 +19,7 @@ from unshackle.core import binaries from unshackle.core.config import config from unshackle.core.console import console from unshackle.core.constants import DOWNLOAD_CANCELLED -from unshackle.core.utilities import get_extension, get_free_port +from unshackle.core.utilities import get_debug_logger, get_extension, get_free_port def rpc(caller: Callable, secret: str, method: str, params: Optional[list[Any]] = None) -> Any: @@ -58,6 +58,8 @@ def download( proxy: Optional[str] = None, max_workers: Optional[int] = None, ) -> Generator[dict[str, Any], None, None]: + debug_logger = get_debug_logger() + if not urls: raise ValueError("urls must be provided and not empty") elif not isinstance(urls, (str, dict, list)): @@ -91,6 +93,13 @@ def download( urls = [urls] if not binaries.Aria2: + if debug_logger: + debug_logger.log( + level="ERROR", + operation="downloader_aria2c_binary_missing", + message="Aria2c executable not found in PATH or local binaries directory", + context={"searched_names": ["aria2c", "aria2"]}, + ) raise EnvironmentError("Aria2c executable not found...") if proxy and not proxy.lower().startswith("http://"): @@ -180,6 +189,28 @@ def download( continue arguments.extend(["--header", f"{header}: {value}"]) + if debug_logger: + first_url = urls[0] if isinstance(urls[0], str) else urls[0].get("url", "") + url_display = first_url[:200] + "..." if len(first_url) > 200 else first_url + debug_logger.log( + level="DEBUG", + operation="downloader_aria2c_start", + message="Starting Aria2c download", + context={ + "binary_path": str(binaries.Aria2), + "url_count": len(urls), + "first_url": url_display, + "output_dir": str(output_dir), + "filename": filename, + "max_concurrent_downloads": max_concurrent_downloads, + "max_connection_per_server": max_connection_per_server, + "split": split, + "file_allocation": file_allocation, + "has_proxy": bool(proxy), + "rpc_port": rpc_port, + }, + ) + yield dict(total=len(urls)) try: @@ -226,6 +257,20 @@ def download( textwrap.wrap(error, width=console.width - 20, initial_indent="") ) console.log(Text.from_ansi("\n[Aria2c]: " + error_pretty)) + if debug_logger: + debug_logger.log( + level="ERROR", + operation="downloader_aria2c_download_error", + message=f"Aria2c download failed: {dl['errorMessage']}", + context={ + "gid": dl["gid"], + "error_code": dl["errorCode"], + "error_message": dl["errorMessage"], + "used_uri": used_uri[:200] + "..." if len(used_uri) > 200 else used_uri, + "completed_length": dl.get("completedLength"), + "total_length": dl.get("totalLength"), + }, + ) raise ValueError(error) if number_stopped == len(urls): @@ -237,7 +282,31 @@ def download( p.wait() if p.returncode != 0: + if debug_logger: + debug_logger.log( + level="ERROR", + operation="downloader_aria2c_failed", + message=f"Aria2c exited with code {p.returncode}", + context={ + "returncode": p.returncode, + "url_count": len(urls), + "output_dir": str(output_dir), + }, + ) raise subprocess.CalledProcessError(p.returncode, arguments) + + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="downloader_aria2c_complete", + message="Aria2c download completed successfully", + context={ + "url_count": len(urls), + "output_dir": str(output_dir), + "filename": filename, + }, + ) + except ConnectionResetError: # interrupted while passing URI to download raise KeyboardInterrupt() @@ -251,9 +320,20 @@ def download( DOWNLOAD_CANCELLED.set() # skip pending track downloads yield dict(downloaded="[yellow]CANCELLED") raise - except Exception: + except Exception as e: DOWNLOAD_CANCELLED.set() # skip pending track downloads yield dict(downloaded="[red]FAILED") + if debug_logger and not isinstance(e, (subprocess.CalledProcessError, ValueError)): + debug_logger.log( + level="ERROR", + operation="downloader_aria2c_exception", + message=f"Unexpected error during Aria2c download: {e}", + error=e, + context={ + "url_count": len(urls), + "output_dir": str(output_dir), + }, + ) raise finally: rpc(caller=partial(rpc_session.post, url=rpc_uri), secret=rpc_secret, method="aria2.shutdown") diff --git a/unshackle/core/downloaders/curl_impersonate.py b/unshackle/core/downloaders/curl_impersonate.py index 52dab7a..d278e91 100644 --- a/unshackle/core/downloaders/curl_impersonate.py +++ b/unshackle/core/downloaders/curl_impersonate.py @@ -11,7 +11,7 @@ from rich import filesize from unshackle.core.config import config from unshackle.core.constants import DOWNLOAD_CANCELLED -from unshackle.core.utilities import get_extension +from unshackle.core.utilities import get_debug_logger, get_extension MAX_ATTEMPTS = 5 RETRY_WAIT = 2 @@ -189,6 +189,8 @@ def curl_impersonate( if not isinstance(max_workers, (int, type(None))): raise TypeError(f"Expected max_workers to be {int}, not {type(max_workers)}") + debug_logger = get_debug_logger() + if not isinstance(urls, list): urls = [urls] @@ -209,6 +211,24 @@ def curl_impersonate( if proxy: session.proxies.update({"all": proxy}) + if debug_logger: + first_url = urls[0].get("url", "") if urls else "" + url_display = first_url[:200] + "..." if len(first_url) > 200 else first_url + debug_logger.log( + level="DEBUG", + operation="downloader_curl_impersonate_start", + message="Starting curl_impersonate download", + context={ + "url_count": len(urls), + "first_url": url_display, + "output_dir": str(output_dir), + "filename": filename, + "max_workers": max_workers, + "browser": BROWSER, + "has_proxy": bool(proxy), + }, + ) + yield dict(total=len(urls)) download_sizes = [] @@ -235,11 +255,23 @@ def curl_impersonate( # tell dl that it was cancelled # the pool is already shut down, so exiting loop is fine raise - except Exception: + except Exception as e: DOWNLOAD_CANCELLED.set() # skip pending track downloads yield dict(downloaded="[red]FAILING") pool.shutdown(wait=True, cancel_futures=True) yield dict(downloaded="[red]FAILED") + if debug_logger: + debug_logger.log( + level="ERROR", + operation="downloader_curl_impersonate_failed", + message=f"curl_impersonate download failed: {e}", + error=e, + context={ + "url_count": len(urls), + "output_dir": str(output_dir), + "browser": BROWSER, + }, + ) # tell dl that it failed # the pool is already shut down, so exiting loop is fine raise @@ -260,5 +292,17 @@ def curl_impersonate( last_speed_refresh = now download_sizes.clear() + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="downloader_curl_impersonate_complete", + message="curl_impersonate download completed successfully", + context={ + "url_count": len(urls), + "output_dir": str(output_dir), + "filename": filename, + }, + ) + __all__ = ("curl_impersonate",) diff --git a/unshackle/core/downloaders/n_m3u8dl_re.py b/unshackle/core/downloaders/n_m3u8dl_re.py index a929e05..ddb8138 100644 --- a/unshackle/core/downloaders/n_m3u8dl_re.py +++ b/unshackle/core/downloaders/n_m3u8dl_re.py @@ -10,9 +10,11 @@ import requests from requests.cookies import cookiejar_from_dict, get_cookie_header from unshackle.core import binaries +from unshackle.core.binaries import FFMPEG, ShakaPackager, Mp4decrypt from unshackle.core.config import config from unshackle.core.console import console from unshackle.core.constants import DOWNLOAD_CANCELLED +from unshackle.core.utilities import get_debug_logger PERCENT_RE = re.compile(r"(\d+\.\d+%)") SPEED_RE = re.compile(r"(\d+\.\d+(?:MB|KB)ps)") @@ -66,12 +68,17 @@ def get_track_selection_args(track: Any) -> list[str]: parts = [] if track_type == "Audio": - if track_id := representation.get("id") or adaptation_set.get("audioTrackId"): - parts.append(rf"id={track_id}") + track_id = representation.get("id") or adaptation_set.get("audioTrackId") + lang = representation.get("lang") or adaptation_set.get("lang") + + if track_id: + parts.append(rf'"id=\b{track_id}\b"') + if lang: + parts.append(f"lang={lang}") else: if codecs := representation.get("codecs"): parts.append(f"codecs={codecs}") - if lang := representation.get("lang") or adaptation_set.get("lang"): + if lang: parts.append(f"lang={lang}") if bw := representation.get("bandwidth"): bitrate = int(bw) // 1000 @@ -178,15 +185,32 @@ def build_download_args( "--write-meta-json": False, "--no-log": True, } + if FFMPEG: + args["--ffmpeg-binary-path"] = str(FFMPEG) if proxy: args["--custom-proxy"] = proxy if skip_merge: args["--skip-merge"] = skip_merge if ad_keyword: args["--ad-keyword"] = ad_keyword + if content_keys: args["--key"] = next((f"{kid.hex}:{key.lower()}" for kid, key in content_keys.items()), None) - args["--decryption-engine"] = DECRYPTION_ENGINE.get(config.decryption.lower()) or "SHAKA_PACKAGER" + + decryption_config = config.decryption.lower() + engine_name = DECRYPTION_ENGINE.get(decryption_config) or "SHAKA_PACKAGER" + args["--decryption-engine"] = engine_name + + binary_path = None + if engine_name == "SHAKA_PACKAGER": + if ShakaPackager: + binary_path = str(ShakaPackager) + elif engine_name == "MP4DECRYPT": + if Mp4decrypt: + binary_path = str(Mp4decrypt) + if binary_path: + args["--decryption-binary-path"] = binary_path + if custom_args: args.update(custom_args) @@ -224,6 +248,8 @@ def download( content_keys: dict[str, Any] | None, skip_merge: bool | None = False, ) -> Generator[dict[str, Any], None, None]: + debug_logger = get_debug_logger() + if not urls: raise ValueError("urls must be provided and not empty") if not isinstance(urls, (str, dict, list)): @@ -250,6 +276,18 @@ def download( if not binaries.N_m3u8DL_RE: raise EnvironmentError("N_m3u8DL-RE executable not found...") + + decryption_engine = config.decryption.lower() + binary_path = None + + if content_keys: + if decryption_engine == "shaka": + binary_path = binaries.ShakaPackager + elif decryption_engine == "mp4decrypt": + binary_path = binaries.Mp4decrypt + + if binary_path: + binary_path = Path(binary_path) effective_max_workers = max_workers or min(32, (os.cpu_count() or 1) + 4) @@ -275,11 +313,49 @@ def download( skip_merge=skip_merge, ad_keyword=ad_keyword, ) - arguments.extend(get_track_selection_args(track)) + selection_args = get_track_selection_args(track) + arguments.extend(selection_args) + + log_file_path: Path | None = None + if debug_logger: + log_file_path = output_dir / f".n_m3u8dl_re_{filename}.log" + arguments.extend(["--log-file-path", str(log_file_path)]) + + track_url_display = track.url[:200] + "..." if len(track.url) > 200 else track.url + debug_logger.log( + level="DEBUG", + operation="downloader_n_m3u8dl_re_start", + message="Starting N_m3u8DL-RE download", + context={ + "binary_path": str(binaries.N_m3u8DL_RE), + "track_id": getattr(track, "id", None), + "track_type": track.__class__.__name__, + "track_url": track_url_display, + "output_dir": str(output_dir), + "filename": filename, + "thread_count": thread_count, + "retry_count": retry_count, + "has_content_keys": bool(content_keys), + "content_key_count": len(content_keys) if content_keys else 0, + "has_proxy": bool(proxy), + "skip_merge": skip_merge, + "has_custom_args": bool(track.downloader_args), + "selection_args": selection_args, + "descriptor": track.descriptor.name if hasattr(track, "descriptor") else None, + }, + ) + else: + arguments.extend(["--no-log", "true"]) yield {"total": 100} yield {"downloaded": "Parsing streams..."} + env = os.environ.copy() + + if binary_path and binary_path.exists(): + binary_dir = str(binary_path.parent) + env["PATH"] = binary_dir + os.pathsep + env["PATH"] + try: with subprocess.Popen( [binaries.N_m3u8DL_RE, *arguments], @@ -287,6 +363,7 @@ def download( stderr=subprocess.STDOUT, text=True, encoding="utf-8", + env=env, # Assign to virtual environment variables ) as process: last_line = "" track_type = track.__class__.__name__ @@ -297,12 +374,16 @@ def download( continue last_line = output + if ERROR_RE.search(output): + console.log(f"[N_m3u8DL-RE]: {output}") + if warn_match := WARN_RE.search(output): console.log(f"{track_type} {warn_match.group(1)}") continue if speed_match := SPEED_RE.search(output): - size = size_match.group(1) if (size_match := SIZE_RE.search(output)) else "" + size_match = SIZE_RE.search(output) + size = size_match.group(1) if size_match else "" yield {"downloaded": f"{speed_match.group(1)} {size}"} if percent_match := PERCENT_RE.search(output): @@ -310,11 +391,45 @@ def download( yield {"completed": progress} if progress < 100 else {"downloaded": "Merging"} process.wait() + if process.returncode != 0: + if debug_logger and log_file_path: + log_contents = "" + if log_file_path.exists(): + try: + log_contents = log_file_path.read_text(encoding="utf-8", errors="replace") + except Exception: + log_contents = "" + + debug_logger.log( + level="ERROR", + operation="downloader_n_m3u8dl_re_failed", + message=f"N_m3u8DL-RE exited with code {process.returncode}", + context={ + "returncode": process.returncode, + "track_id": getattr(track, "id", None), + "track_type": track.__class__.__name__, + "last_line": last_line, + "log_file_contents": log_contents, + }, + ) if error_match := ERROR_RE.search(last_line): raise ValueError(f"[N_m3u8DL-RE]: {error_match.group(1)}") raise subprocess.CalledProcessError(process.returncode, arguments) + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="downloader_n_m3u8dl_re_complete", + message="N_m3u8DL-RE download completed successfully", + context={ + "track_id": getattr(track, "id", None), + "track_type": track.__class__.__name__, + "output_dir": str(output_dir), + "filename": filename, + }, + ) + except ConnectionResetError: # interrupted while passing URI to download raise KeyboardInterrupt() @@ -322,10 +437,35 @@ def download( DOWNLOAD_CANCELLED.set() # skip pending track downloads yield {"downloaded": "[yellow]CANCELLED"} raise - except Exception: + except Exception as e: DOWNLOAD_CANCELLED.set() # skip pending track downloads yield {"downloaded": "[red]FAILED"} + if debug_logger and log_file_path and not isinstance(e, (subprocess.CalledProcessError, ValueError)): + log_contents = "" + if log_file_path.exists(): + try: + log_contents = log_file_path.read_text(encoding="utf-8", errors="replace") + except Exception: + log_contents = "" + + debug_logger.log( + level="ERROR", + operation="downloader_n_m3u8dl_re_exception", + message=f"Unexpected error during N_m3u8DL-RE download: {e}", + error=e, + context={ + "track_id": getattr(track, "id", None), + "track_type": track.__class__.__name__, + "log_file_contents": log_contents, + }, + ) raise + finally: + if log_file_path and log_file_path.exists(): + try: + log_file_path.unlink() + except Exception: + pass def n_m3u8dl_re( @@ -382,4 +522,4 @@ def n_m3u8dl_re( ) -__all__ = ("n_m3u8dl_re",) +__all__ = ("n_m3u8dl_re",) \ No newline at end of file diff --git a/unshackle/core/downloaders/requests.py b/unshackle/core/downloaders/requests.py index 49c1759..06cab3d 100644 --- a/unshackle/core/downloaders/requests.py +++ b/unshackle/core/downloaders/requests.py @@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter from rich import filesize from unshackle.core.constants import DOWNLOAD_CANCELLED -from unshackle.core.utilities import get_extension +from unshackle.core.utilities import get_debug_logger, get_extension MAX_ATTEMPTS = 5 RETRY_WAIT = 2 @@ -215,6 +215,8 @@ def requests( if not isinstance(max_workers, (int, type(None))): raise TypeError(f"Expected max_workers to be {int}, not {type(max_workers)}") + debug_logger = get_debug_logger() + if not isinstance(urls, list): urls = [urls] @@ -241,6 +243,23 @@ def requests( if proxy: session.proxies.update({"all": proxy}) + if debug_logger: + first_url = urls[0].get("url", "") if urls else "" + url_display = first_url[:200] + "..." if len(first_url) > 200 else first_url + debug_logger.log( + level="DEBUG", + operation="downloader_requests_start", + message="Starting requests download", + context={ + "url_count": len(urls), + "first_url": url_display, + "output_dir": str(output_dir), + "filename": filename, + "max_workers": max_workers, + "has_proxy": bool(proxy), + }, + ) + yield dict(total=len(urls)) try: @@ -256,14 +275,37 @@ def requests( # tell dl that it was cancelled # the pool is already shut down, so exiting loop is fine raise - except Exception: + except Exception as e: DOWNLOAD_CANCELLED.set() # skip pending track downloads yield dict(downloaded="[red]FAILING") pool.shutdown(wait=True, cancel_futures=True) yield dict(downloaded="[red]FAILED") + if debug_logger: + debug_logger.log( + level="ERROR", + operation="downloader_requests_failed", + message=f"Requests download failed: {e}", + error=e, + context={ + "url_count": len(urls), + "output_dir": str(output_dir), + }, + ) # tell dl that it failed # the pool is already shut down, so exiting loop is fine raise + + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="downloader_requests_complete", + message="Requests download completed successfully", + context={ + "url_count": len(urls), + "output_dir": str(output_dir), + "filename": filename, + }, + ) finally: DOWNLOAD_SIZES.clear() diff --git a/unshackle/core/drm/playready.py b/unshackle/core/drm/playready.py index b1fcea0..9091833 100644 --- a/unshackle/core/drm/playready.py +++ b/unshackle/core/drm/playready.py @@ -168,7 +168,7 @@ class PlayReady: pssh_boxes.extend(list(get_boxes(init_data, b"pssh"))) tenc_boxes.extend(list(get_boxes(init_data, b"tenc"))) - pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID.bytes), None) + pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID), None) if not pssh: raise PlayReady.Exceptions.PSSHNotFound("PSSH was not found in track data.") @@ -197,7 +197,7 @@ class PlayReady: if enc_key_id: kid = UUID(bytes=base64.b64decode(enc_key_id)) - pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID.bytes), None) + pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SYSTEM_ID), None) if not pssh: raise PlayReady.Exceptions.PSSHNotFound("PSSH was not found in track data.") @@ -415,7 +415,7 @@ class PlayReady: p.wait() if p.returncode != 0 or had_error: - raise subprocess.CalledProcessError(p.returncode, arguments) + raise subprocess.CalledProcessError(p.returncode, [binaries.ShakaPackager, *arguments]) path.unlink() if not stream_skipped: diff --git a/unshackle/core/drm/widevine.py b/unshackle/core/drm/widevine.py index 7fee1c9..6ca4fb5 100644 --- a/unshackle/core/drm/widevine.py +++ b/unshackle/core/drm/widevine.py @@ -100,9 +100,7 @@ class Widevine: pssh_boxes.extend(list(get_boxes(init_data, b"pssh"))) tenc_boxes.extend(list(get_boxes(init_data, b"tenc"))) - pssh_boxes.sort(key=lambda b: {PSSH.SystemId.Widevine: 0, PSSH.SystemId.PlayReady: 1}[b.system_ID]) - - pssh = next(iter(pssh_boxes), None) + pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SystemId.Widevine), None) if not pssh: raise Widevine.Exceptions.PSSHNotFound("PSSH was not found in track data.") @@ -141,9 +139,7 @@ class Widevine: if enc_key_id: kid = UUID(bytes=base64.b64decode(enc_key_id)) - pssh_boxes.sort(key=lambda b: {PSSH.SystemId.Widevine: 0, PSSH.SystemId.PlayReady: 1}[b.system_ID]) - - pssh = next(iter(pssh_boxes), None) + pssh = next((b for b in pssh_boxes if b.system_ID == PSSH.SystemId.Widevine), None) if not pssh: raise Widevine.Exceptions.PSSHNotFound("PSSH was not found in track data.") @@ -371,7 +367,7 @@ class Widevine: p.wait() if p.returncode != 0 or had_error: - raise subprocess.CalledProcessError(p.returncode, arguments) + raise subprocess.CalledProcessError(p.returncode, [binaries.ShakaPackager, *arguments]) path.unlink() if not stream_skipped: diff --git a/unshackle/core/manifests/dash.py b/unshackle/core/manifests/dash.py index 442ac96..ce0d2a7 100644 --- a/unshackle/core/manifests/dash.py +++ b/unshackle/core/manifests/dash.py @@ -5,6 +5,7 @@ import html import logging import math import re +import shutil import sys from copy import copy from functools import partial @@ -18,6 +19,7 @@ import requests from curl_cffi.requests import Session as CurlSession from langcodes import Language, tag_is_valid from lxml.etree import Element, ElementTree +from pyplayready.cdm import Cdm as PlayReadyCdm from pyplayready.system.pssh import PSSH as PR_PSSH from pywidevine.cdm import Cdm as WidevineCdm from pywidevine.pssh import PSSH @@ -28,7 +30,7 @@ from unshackle.core.downloaders import requests as requests_downloader from unshackle.core.drm import DRM_T, PlayReady, Widevine from unshackle.core.events import events from unshackle.core.tracks import Audio, Subtitle, Tracks, Video -from unshackle.core.utilities import is_close_match, try_ensure_utf8 +from unshackle.core.utilities import get_debug_logger, is_close_match, try_ensure_utf8 from unshackle.core.utils.xml import load_xml @@ -465,12 +467,23 @@ class DASH: track.data["dash"]["timescale"] = int(segment_timescale) track.data["dash"]["segment_durations"] = segment_durations - if not track.drm and isinstance(track, (Video, Audio)): - try: - track.drm = [Widevine.from_init_data(init_data)] - except Widevine.Exceptions.PSSHNotFound: - # it might not have Widevine DRM, or might not have found the PSSH - log.warning("No Widevine PSSH was found for this track, is it DRM free?") + if init_data and isinstance(track, (Video, Audio)): + if isinstance(cdm, PlayReadyCdm): + try: + track.drm = [PlayReady.from_init_data(init_data)] + except PlayReady.Exceptions.PSSHNotFound: + try: + track.drm = [Widevine.from_init_data(init_data)] + except Widevine.Exceptions.PSSHNotFound: + log.warning("No PlayReady or Widevine PSSH was found for this track, is it DRM free?") + else: + try: + track.drm = [Widevine.from_init_data(init_data)] + except Widevine.Exceptions.PSSHNotFound: + try: + track.drm = [PlayReady.from_init_data(init_data)] + except PlayReady.Exceptions.PSSHNotFound: + log.warning("No Widevine or PlayReady PSSH was found for this track, is it DRM free?") if track.drm: track_kid = track_kid or track.get_key_id(url=segments[0][0], session=session) @@ -515,8 +528,35 @@ class DASH: max_workers=max_workers, ) + skip_merge = False if downloader.__name__ == "n_m3u8dl_re": - downloader_args.update({"filename": track.id, "track": track}) + skip_merge = True + downloader_args.update( + { + "filename": track.id, + "track": track, + "content_keys": drm.content_keys if drm else None, + } + ) + + debug_logger = get_debug_logger() + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="manifest_dash_download_start", + message="Starting DASH manifest download", + context={ + "track_id": getattr(track, "id", None), + "track_type": track.__class__.__name__, + "total_segments": len(segments), + "downloader": downloader.__name__, + "has_drm": bool(track.drm), + "drm_types": [drm.__class__.__name__ for drm in (track.drm or [])], + "skip_merge": skip_merge, + "save_path": str(save_path), + "has_init_data": bool(init_data), + }, + ) for status_update in downloader(**downloader_args): file_downloaded = status_update.get("file_downloaded") @@ -533,42 +573,56 @@ class DASH: control_file.unlink() segments_to_merge = [x for x in sorted(save_dir.iterdir()) if x.is_file()] - with open(save_path, "wb") as f: - if init_data: - f.write(init_data) - if len(segments_to_merge) > 1: - progress(downloaded="Merging", completed=0, total=len(segments_to_merge)) - for segment_file in segments_to_merge: - segment_data = segment_file.read_bytes() - # TODO: fix encoding after decryption? - if ( - not drm - and isinstance(track, Subtitle) - and track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML) - ): - segment_data = try_ensure_utf8(segment_data) - segment_data = ( - segment_data.decode("utf8") - .replace("‎", html.unescape("‎")) - .replace("‏", html.unescape("‏")) - .encode("utf8") - ) - f.write(segment_data) - f.flush() - segment_file.unlink() - progress(advance=1) + + if skip_merge: + # N_m3u8DL-RE handles merging and decryption internally + shutil.move(segments_to_merge[0], save_path) + if drm: + track.drm = None + events.emit(events.Types.TRACK_DECRYPTED, track=track, drm=drm, segment=None) + else: + with open(save_path, "wb") as f: + if init_data: + f.write(init_data) + if len(segments_to_merge) > 1: + progress(downloaded="Merging", completed=0, total=len(segments_to_merge)) + for segment_file in segments_to_merge: + segment_data = segment_file.read_bytes() + # TODO: fix encoding after decryption? + if ( + not drm + and isinstance(track, Subtitle) + and track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML) + ): + segment_data = try_ensure_utf8(segment_data) + segment_data = ( + segment_data.decode("utf8") + .replace("‎", html.unescape("‎")) + .replace("‏", html.unescape("‏")) + .encode("utf8") + ) + f.write(segment_data) + f.flush() + segment_file.unlink() + progress(advance=1) track.path = save_path events.emit(events.Types.TRACK_DOWNLOADED, track=track) - if drm: + if not skip_merge and drm: progress(downloaded="Decrypting", completed=0, total=100) drm.decrypt(save_path) track.drm = None events.emit(events.Types.TRACK_DECRYPTED, track=track, drm=drm, segment=None) progress(downloaded="Decrypting", advance=100) - save_dir.rmdir() + # Clean up empty segment directory + if save_dir.exists() and save_dir.name.endswith("_segments"): + try: + save_dir.rmdir() + except OSError: + # Directory might not be empty, try removing recursively + shutil.rmtree(save_dir, ignore_errors=True) progress(downloaded="Downloaded") @@ -736,6 +790,11 @@ class DASH: @staticmethod def get_drm(protections: list[Element]) -> list[DRM_T]: drm: list[DRM_T] = [] + PLACEHOLDER_KIDS = { + UUID("00000000-0000-0000-0000-000000000000"), # All zeros (key rotation default) + UUID("00010203-0405-0607-0809-0a0b0c0d0e0f"), # Sequential 0x00-0x0f + UUID("00010203-0405-0607-0809-101112131415"), # Shaka Packager test pattern + } for protection in protections: urn = (protection.get("schemeIdUri") or "").lower() @@ -745,17 +804,27 @@ class DASH: if not pssh_text: continue pssh = PSSH(pssh_text) + kid_attr = protection.get("kid") or protection.get("{urn:mpeg:cenc:2013}kid") + kid = UUID(bytes=base64.b64decode(kid_attr)) if kid_attr else None - kid = protection.get("kid") - if kid: - kid = UUID(bytes=base64.b64decode(kid)) + if not kid: + default_kid_attr = protection.get("default_KID") or protection.get( + "{urn:mpeg:cenc:2013}default_KID" + ) + kid = UUID(default_kid_attr) if default_kid_attr else None - default_kid = protection.get("default_KID") - if default_kid: - kid = UUID(default_kid) + if not kid: + kid = next( + ( + UUID(p.get("default_KID") or p.get("{urn:mpeg:cenc:2013}default_KID")) + for p in protections + if p.get("default_KID") or p.get("{urn:mpeg:cenc:2013}default_KID") + ), + None, + ) - if not pssh.key_ids and not kid: - kid = next((UUID(p.get("default_KID")) for p in protections if p.get("default_KID")), None) + if kid and (not pssh.key_ids or all(k.int == 0 or k in PLACEHOLDER_KIDS for k in pssh.key_ids)): + pssh.set_key_ids([kid]) drm.append(Widevine(pssh=pssh, kid=kid)) diff --git a/unshackle/core/manifests/hls.py b/unshackle/core/manifests/hls.py index 6f49c6a..fb0320b 100644 --- a/unshackle/core/manifests/hls.py +++ b/unshackle/core/manifests/hls.py @@ -32,7 +32,7 @@ from unshackle.core.downloaders import requests as requests_downloader from unshackle.core.drm import DRM_T, ClearKey, PlayReady, Widevine from unshackle.core.events import events from unshackle.core.tracks import Audio, Subtitle, Tracks, Video -from unshackle.core.utilities import get_extension, is_close_match, try_ensure_utf8 +from unshackle.core.utilities import get_debug_logger, get_extension, is_close_match, try_ensure_utf8 class HLS: @@ -350,6 +350,24 @@ class HLS: } ) + debug_logger = get_debug_logger() + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="manifest_hls_download_start", + message="Starting HLS manifest download", + context={ + "track_id": getattr(track, "id", None), + "track_type": track.__class__.__name__, + "total_segments": total_segments, + "downloader": downloader.__name__, + "has_drm": bool(session_drm), + "drm_type": session_drm.__class__.__name__ if session_drm else None, + "skip_merge": skip_merge, + "save_path": str(save_path), + }, + ) + for status_update in downloader(**downloader_args): file_downloaded = status_update.get("file_downloaded") if file_downloaded: diff --git a/unshackle/core/manifests/ism.py b/unshackle/core/manifests/ism.py index 346c9e6..8cb6a3b 100644 --- a/unshackle/core/manifests/ism.py +++ b/unshackle/core/manifests/ism.py @@ -21,7 +21,7 @@ from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, from unshackle.core.drm import DRM_T, PlayReady, Widevine from unshackle.core.events import events from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video -from unshackle.core.utilities import try_ensure_utf8 +from unshackle.core.utilities import get_debug_logger, try_ensure_utf8 from unshackle.core.utils.xml import load_xml @@ -283,6 +283,24 @@ class ISM: } ) + debug_logger = get_debug_logger() + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="manifest_ism_download_start", + message="Starting ISM manifest download", + context={ + "track_id": getattr(track, "id", None), + "track_type": track.__class__.__name__, + "total_segments": len(segments), + "downloader": downloader.__name__, + "has_drm": bool(session_drm), + "drm_type": session_drm.__class__.__name__ if session_drm else None, + "skip_merge": skip_merge, + "save_path": str(save_path), + }, + ) + for status_update in downloader(**downloader_args): file_downloaded = status_update.get("file_downloaded") if file_downloaded: diff --git a/unshackle/core/titles/episode.py b/unshackle/core/titles/episode.py index 6592b60..b260ce9 100644 --- a/unshackle/core/titles/episode.py +++ b/unshackle/core/titles/episode.py @@ -185,7 +185,10 @@ class Episode(Title): if hdr_format: if hdr_format_full.startswith("Dolby Vision"): name += " DV" - if any(indicator in hdr_format_full for indicator in ["HDR10", "SMPTE ST 2086"]): + if any( + indicator in (hdr_format_full + " " + hdr_format) + for indicator in ["HDR10", "SMPTE ST 2086"] + ): name += " HDR" else: name += f" {DYNAMIC_RANGE_MAP.get(hdr_format)} " diff --git a/unshackle/core/titles/movie.py b/unshackle/core/titles/movie.py index 1545b18..bda68df 100644 --- a/unshackle/core/titles/movie.py +++ b/unshackle/core/titles/movie.py @@ -136,7 +136,10 @@ class Movie(Title): if hdr_format: if hdr_format_full.startswith("Dolby Vision"): name += " DV" - if any(indicator in hdr_format_full for indicator in ["HDR10", "SMPTE ST 2086"]): + if any( + indicator in (hdr_format_full + " " + hdr_format) + for indicator in ["HDR10", "SMPTE ST 2086"] + ): name += " HDR" else: name += f" {DYNAMIC_RANGE_MAP.get(hdr_format)} " diff --git a/unshackle/core/tracks/subtitle.py b/unshackle/core/tracks/subtitle.py index e807bff..b11181c 100644 --- a/unshackle/core/tracks/subtitle.py +++ b/unshackle/core/tracks/subtitle.py @@ -91,6 +91,12 @@ class Subtitle(Track): return Subtitle.Codec.TimedTextMarkupLang raise ValueError(f"The Content Profile '{profile}' is not a supported Subtitle Codec") + # WebVTT sanitization patterns (compiled once for performance) + _CUE_ID_PATTERN = re.compile(r"^[A-Za-z]+\d+$") + _TIMING_START_PATTERN = re.compile(r"^\d+:\d+[:\.]") + _TIMING_LINE_PATTERN = re.compile(r"^((?:\d+:)?\d+:\d+[.,]\d+)\s*-->\s*((?:\d+:)?\d+:\d+[.,]\d+)(.*)$") + _LINE_POS_PATTERN = re.compile(r"line:(\d+(?:\.\d+)?%?)") + def __init__( self, *args: Any, @@ -239,6 +245,11 @@ class Subtitle(Track): # Sanitize WebVTT timestamps before parsing text = Subtitle.sanitize_webvtt_timestamps(text) + # Remove cue identifiers that confuse parsers like pysubs2 + text = Subtitle.sanitize_webvtt_cue_identifiers(text) + # Merge overlapping cues with line positioning into single multi-line cues + text = Subtitle.merge_overlapping_webvtt_cues(text) + preserve_formatting = config.subtitle.get("preserve_formatting", True) if preserve_formatting: @@ -277,6 +288,240 @@ class Subtitle(Track): # Replace negative timestamps with 00:00:00.000 return re.sub(r"(-\d+:\d+:\d+\.\d+)", "00:00:00.000", text) + @staticmethod + def has_webvtt_cue_identifiers(text: str) -> bool: + """ + Check if WebVTT content has cue identifiers that need removal. + + Parameters: + text: The WebVTT content as string + + Returns: + True if cue identifiers are detected, False otherwise + """ + lines = text.split("\n") + + for i, line in enumerate(lines): + line = line.strip() + if Subtitle._CUE_ID_PATTERN.match(line): + # Look ahead to see if next non-empty line is a timing line + j = i + 1 + while j < len(lines) and not lines[j].strip(): + j += 1 + if j < len(lines) and ("-->" in lines[j] or Subtitle._TIMING_START_PATTERN.match(lines[j].strip())): + return True + return False + + @staticmethod + def sanitize_webvtt_cue_identifiers(text: str) -> str: + """ + Remove WebVTT cue identifiers that can confuse subtitle parsers. + + Some services use cue identifiers like "Q0", "Q1", etc. + that appear on their own line before the timing line. These can be + incorrectly parsed as part of the previous cue's text content by + some parsers (like pysubs2). + + Parameters: + text: The WebVTT content as string + + Returns: + Sanitized WebVTT content with cue identifiers removed + """ + if not Subtitle.has_webvtt_cue_identifiers(text): + return text + + lines = text.split("\n") + sanitized_lines = [] + + i = 0 + while i < len(lines): + line = lines[i].strip() + + # Check if this line is a cue identifier followed by a timing line + if Subtitle._CUE_ID_PATTERN.match(line): + # Look ahead to see if next non-empty line is a timing line + j = i + 1 + while j < len(lines) and not lines[j].strip(): + j += 1 + if j < len(lines) and ("-->" in lines[j] or Subtitle._TIMING_START_PATTERN.match(lines[j].strip())): + # This is a cue identifier, skip it + i += 1 + continue + + sanitized_lines.append(lines[i]) + i += 1 + + return "\n".join(sanitized_lines) + + @staticmethod + def _parse_vtt_time(t: str) -> int: + """Parse WebVTT timestamp to milliseconds. Returns 0 for malformed input.""" + try: + t = t.replace(",", ".") + parts = t.split(":") + if len(parts) == 2: + m, s = parts + h = "0" + elif len(parts) >= 3: + h, m, s = parts[:3] + else: + return 0 + sec_parts = s.split(".") + secs = int(sec_parts[0]) + # Handle variable millisecond digits (e.g., .5 = 500ms, .50 = 500ms, .500 = 500ms) + ms = int(sec_parts[1].ljust(3, "0")[:3]) if len(sec_parts) > 1 else 0 + return int(h) * 3600000 + int(m) * 60000 + secs * 1000 + ms + except (ValueError, IndexError): + return 0 + + @staticmethod + def has_overlapping_webvtt_cues(text: str) -> bool: + """ + Check if WebVTT content has overlapping cues that need merging. + + Detects cues with start times within 50ms of each other and the same end time, + which indicates multi-line subtitles split into separate cues. + + Parameters: + text: The WebVTT content as string + + Returns: + True if overlapping cues are detected, False otherwise + """ + timings = [] + for line in text.split("\n"): + match = Subtitle._TIMING_LINE_PATTERN.match(line) + if match: + start_str, end_str = match.group(1), match.group(2) + timings.append((Subtitle._parse_vtt_time(start_str), Subtitle._parse_vtt_time(end_str))) + + # Check for overlapping cues (within 50ms start, same end) + for i in range(len(timings) - 1): + curr_start, curr_end = timings[i] + next_start, next_end = timings[i + 1] + if abs(curr_start - next_start) <= 50 and curr_end == next_end: + return True + + return False + + @staticmethod + def merge_overlapping_webvtt_cues(text: str) -> str: + """ + Merge WebVTT cues that have overlapping/near-identical times but different line positions. + + Some services use separate cues for each line of a multi-line subtitle, with + slightly different start times (1ms apart) and different line: positions. + This merges them into single cues with proper line ordering based on the + line: position (lower percentage = higher on screen = first line). + + Parameters: + text: The WebVTT content as string + + Returns: + WebVTT content with overlapping cues merged + """ + if not Subtitle.has_overlapping_webvtt_cues(text): + return text + + lines = text.split("\n") + cues = [] + header_lines = [] + in_header = True + i = 0 + + while i < len(lines): + line = lines[i] + + if in_header: + if "-->" in line: + in_header = False + else: + header_lines.append(line) + i += 1 + continue + + match = Subtitle._TIMING_LINE_PATTERN.match(line) + if match: + start_str, end_str, settings = match.groups() + line_pos = 100.0 # Default to bottom + line_match = Subtitle._LINE_POS_PATTERN.search(settings) + if line_match: + pos_str = line_match.group(1).rstrip("%") + line_pos = float(pos_str) + + content_lines = [] + i += 1 + while i < len(lines) and lines[i].strip() and "-->" not in lines[i]: + content_lines.append(lines[i]) + i += 1 + + cues.append( + { + "start_ms": Subtitle._parse_vtt_time(start_str), + "end_ms": Subtitle._parse_vtt_time(end_str), + "start_str": start_str, + "end_str": end_str, + "line_pos": line_pos, + "content": "\n".join(content_lines), + "settings": settings, + } + ) + else: + i += 1 + + # Merge overlapping cues (within 50ms of each other with same end time) + merged_cues = [] + i = 0 + while i < len(cues): + current = cues[i] + group = [current] + + j = i + 1 + while j < len(cues): + other = cues[j] + if abs(current["start_ms"] - other["start_ms"]) <= 50 and current["end_ms"] == other["end_ms"]: + group.append(other) + j += 1 + else: + break + + if len(group) > 1: + # Sort by line position (lower % = higher on screen = first) + group.sort(key=lambda x: x["line_pos"]) + # Use the earliest start time from the group + earliest = min(group, key=lambda x: x["start_ms"]) + merged_cues.append( + { + "start_str": earliest["start_str"], + "end_str": group[0]["end_str"], + "content": "\n".join(c["content"] for c in group), + "settings": "", + } + ) + else: + merged_cues.append( + { + "start_str": current["start_str"], + "end_str": current["end_str"], + "content": current["content"], + "settings": current["settings"], + } + ) + + i = j if len(group) > 1 else i + 1 + + result_lines = header_lines[:] + if result_lines and result_lines[-1].strip(): + result_lines.append("") + + for cue in merged_cues: + result_lines.append(f"{cue['start_str']} --> {cue['end_str']}{cue['settings']}") + result_lines.append(cue["content"]) + result_lines.append("") + + return "\n".join(result_lines) + @staticmethod def sanitize_webvtt(text: str) -> str: """ @@ -565,13 +810,18 @@ class Subtitle(Track): if binaries.SubtitleEdit and self.codec not in (Subtitle.Codec.fTTML, Subtitle.Codec.fVTT): sub_edit_format = { - Subtitle.Codec.SubStationAlphav4: "AdvancedSubStationAlpha", - Subtitle.Codec.TimedTextMarkupLang: "TimedText1.0", - }.get(codec, codec.name) + Subtitle.Codec.SubRip: "subrip", + Subtitle.Codec.SubStationAlpha: "substationalpha", + Subtitle.Codec.SubStationAlphav4: "advancedsubstationalpha", + Subtitle.Codec.TimedTextMarkupLang: "timedtext1.0", + Subtitle.Codec.WebVTT: "webvtt", + Subtitle.Codec.SAMI: "sami", + Subtitle.Codec.MicroDVD: "microdvd", + }.get(codec, codec.name.lower()) sub_edit_args = [ - binaries.SubtitleEdit, - "/Convert", - self.path, + str(binaries.SubtitleEdit), + "/convert", + str(self.path), sub_edit_format, f"/outputfilename:{output_path.name}", "/encoding:utf8", @@ -631,7 +881,7 @@ class Subtitle(Track): text = try_ensure_utf8(data).decode("utf8") text = text.replace("tt:", "") # negative size values aren't allowed in TTML/DFXP spec, replace with 0 - text = re.sub(r'"(-\d+(\.\d+)?(px|em|%|c|pt))"', '"0"', text) + text = re.sub(r"-(\d+(?:\.\d+)?)(px|em|%|c|pt)", r"0\2", text) caption_set = pycaption.DFXPReader().read(text) elif codec == Subtitle.Codec.fVTT: caption_lists: dict[str, pycaption.CaptionList] = defaultdict(pycaption.CaptionList) @@ -962,18 +1212,26 @@ class Subtitle(Track): except Exception: pass # Fall through to other methods - if binaries.SubtitleEdit: - if self.codec == Subtitle.Codec.SubStationAlphav4: - output_format = "AdvancedSubStationAlpha" - elif self.codec == Subtitle.Codec.TimedTextMarkupLang: - output_format = "TimedText1.0" - else: - output_format = self.codec.name + conversion_method = config.subtitle.get("conversion_method", "auto") + use_subtitleedit = sdh_method == "subtitleedit" or ( + sdh_method == "auto" and conversion_method in ("auto", "subtitleedit") + ) + + if binaries.SubtitleEdit and use_subtitleedit: + output_format = { + Subtitle.Codec.SubRip: "subrip", + Subtitle.Codec.SubStationAlpha: "substationalpha", + Subtitle.Codec.SubStationAlphav4: "advancedsubstationalpha", + Subtitle.Codec.TimedTextMarkupLang: "timedtext1.0", + Subtitle.Codec.WebVTT: "webvtt", + Subtitle.Codec.SAMI: "sami", + Subtitle.Codec.MicroDVD: "microdvd", + }.get(self.codec, self.codec.name.lower()) subprocess.run( [ - binaries.SubtitleEdit, - "/Convert", - self.path, + str(binaries.SubtitleEdit), + "/convert", + str(self.path), output_format, "/encoding:utf8", "/overwrite", @@ -981,6 +1239,7 @@ class Subtitle(Track): ], check=True, stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) else: if config.subtitle.get("convert_before_strip", True) and self.codec != Subtitle.Codec.SubRip: @@ -1022,18 +1281,21 @@ class Subtitle(Track): if not binaries.SubtitleEdit: raise EnvironmentError("SubtitleEdit executable not found...") - if self.codec == Subtitle.Codec.SubStationAlphav4: - output_format = "AdvancedSubStationAlpha" - elif self.codec == Subtitle.Codec.TimedTextMarkupLang: - output_format = "TimedText1.0" - else: - output_format = self.codec.name + output_format = { + Subtitle.Codec.SubRip: "subrip", + Subtitle.Codec.SubStationAlpha: "substationalpha", + Subtitle.Codec.SubStationAlphav4: "advancedsubstationalpha", + Subtitle.Codec.TimedTextMarkupLang: "timedtext1.0", + Subtitle.Codec.WebVTT: "webvtt", + Subtitle.Codec.SAMI: "sami", + Subtitle.Codec.MicroDVD: "microdvd", + }.get(self.codec, self.codec.name.lower()) subprocess.run( [ - binaries.SubtitleEdit, - "/Convert", - self.path, + str(binaries.SubtitleEdit), + "/convert", + str(self.path), output_format, "/ReverseRtlStartEnd", "/encoding:utf8", @@ -1041,6 +1303,7 @@ class Subtitle(Track): ], check=True, stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) diff --git a/unshackle/core/tracks/track.py b/unshackle/core/tracks/track.py index 0b1a38f..3e0d7df 100644 --- a/unshackle/core/tracks/track.py +++ b/unshackle/core/tracks/track.py @@ -295,12 +295,23 @@ class Track: try: if not self.drm and track_type in ("Video", "Audio"): # the service might not have explicitly defined the `drm` property - # try find widevine DRM information from the init data of URL - try: - self.drm = [Widevine.from_track(self, session)] - except Widevine.Exceptions.PSSHNotFound: - # it might not have Widevine DRM, or might not have found the PSSH - log.warning("No Widevine PSSH was found for this track, is it DRM free?") + # try find DRM information from the init data of URL based on CDM type + if isinstance(cdm, PlayReadyCdm): + try: + self.drm = [PlayReady.from_track(self, session)] + except PlayReady.Exceptions.PSSHNotFound: + try: + self.drm = [Widevine.from_track(self, session)] + except Widevine.Exceptions.PSSHNotFound: + log.warning("No PlayReady or Widevine PSSH was found for this track, is it DRM free?") + else: + try: + self.drm = [Widevine.from_track(self, session)] + except Widevine.Exceptions.PSSHNotFound: + try: + self.drm = [PlayReady.from_track(self, session)] + except PlayReady.Exceptions.PSSHNotFound: + log.warning("No Widevine or PlayReady PSSH was found for this track, is it DRM free?") if self.drm: track_kid = self.get_key_id(session=session) diff --git a/unshackle/core/tracks/tracks.py b/unshackle/core/tracks/tracks.py index eeacd47..fd9d78d 100644 --- a/unshackle/core/tracks/tracks.py +++ b/unshackle/core/tracks/tracks.py @@ -22,7 +22,7 @@ from unshackle.core.tracks.chapters import Chapter, Chapters from unshackle.core.tracks.subtitle import Subtitle from unshackle.core.tracks.track import Track from unshackle.core.tracks.video import Video -from unshackle.core.utilities import is_close_match, sanitize_filename +from unshackle.core.utilities import get_debug_logger, is_close_match, sanitize_filename from unshackle.core.utils.collections import as_list, flatten @@ -507,6 +507,35 @@ class Tracks: if not output_path: raise ValueError("No tracks provided, at least one track must be provided.") + debug_logger = get_debug_logger() + if debug_logger: + debug_logger.log( + level="DEBUG", + operation="mux_start", + message="Starting mkvmerge muxing", + context={ + "title": title, + "output_path": str(output_path), + "video_count": len(self.videos), + "audio_count": len(self.audio), + "subtitle_count": len(self.subtitles), + "attachment_count": len(self.attachments), + "has_chapters": bool(self.chapters), + "video_tracks": [ + {"id": v.id, "codec": getattr(v, "codec", None), "language": str(v.language)} + for v in self.videos + ], + "audio_tracks": [ + {"id": a.id, "codec": getattr(a, "codec", None), "language": str(a.language)} + for a in self.audio + ], + "subtitle_tracks": [ + {"id": s.id, "codec": getattr(s, "codec", None), "language": str(s.language)} + for s in self.subtitles + ], + }, + ) + # let potential failures go to caller, caller should handle try: errors = [] @@ -516,7 +545,33 @@ class Tracks: errors.append(line) if "progress" in line: progress(total=100, completed=int(line.strip()[14:-1])) - return output_path, p.wait(), errors + + returncode = p.wait() + + if debug_logger: + if returncode != 0 or errors: + debug_logger.log( + level="ERROR", + operation="mux_failed", + message=f"mkvmerge exited with code {returncode}", + context={ + "returncode": returncode, + "output_path": str(output_path), + "errors": errors, + }, + ) + else: + debug_logger.log( + level="DEBUG", + operation="mux_complete", + message="mkvmerge muxing completed successfully", + context={ + "output_path": str(output_path), + "output_exists": output_path.exists() if output_path else False, + }, + ) + + return output_path, returncode, errors finally: if chapters_path: chapters_path.unlink() diff --git a/unshackle/core/utilities.py b/unshackle/core/utilities.py index 5aaf6f0..7a78535 100644 --- a/unshackle/core/utilities.py +++ b/unshackle/core/utilities.py @@ -120,9 +120,14 @@ def sanitize_filename(filename: str, spacer: str = ".") -> str: The spacer is safer to be a '.' for older DDL and p2p sharing spaces. This includes web-served content via direct links and such. + + Set `unicode_filenames: true` in config to preserve native language + characters (Korean, Japanese, Chinese, etc.) instead of transliterating + them to ASCII equivalents. """ - # replace all non-ASCII characters with ASCII equivalents - filename = unidecode(filename) + # optionally replace non-ASCII characters with ASCII equivalents + if not config.unicode_filenames: + filename = unidecode(filename) # remove or replace further characters as needed filename = "".join(c for c in filename if unicodedata.category(c) != "Mn") # hidden characters diff --git a/unshackle/vaults/API.py b/unshackle/vaults/API.py index 0cc52fe..dad9607 100644 --- a/unshackle/vaults/API.py +++ b/unshackle/vaults/API.py @@ -114,32 +114,71 @@ class API(Vault): return added or updated def add_keys(self, service: str, kid_keys: dict[Union[UUID, str], str]) -> int: - data = self.session.post( - url=f"{self.uri}/{service.lower()}", - json={"content_keys": {str(kid).replace("-", ""): key for kid, key in kid_keys.items()}}, - headers={"Accept": "application/json"}, - ).json() + # Normalize keys + normalized_keys = {str(kid).replace("-", ""): key for kid, key in kid_keys.items()} + kid_list = list(normalized_keys.keys()) - code = int(data.get("code", 0)) - message = data.get("message") - error = { - 0: None, - 1: Exceptions.AuthRejected, - 2: Exceptions.TooManyRequests, - 3: Exceptions.ServiceTagInvalid, - 4: Exceptions.KeyIdInvalid, - 5: Exceptions.ContentKeyInvalid, - }.get(code, ValueError) + if not kid_list: + return 0 - if error: - raise error(f"{message} ({code})") + # Try batches starting at 500, stepping down by 100 on failure, fallback to 1 + batch_size = 500 + total_added = 0 + i = 0 - # each kid:key that was new to the vault (optional) - added = int(data.get("added")) - # each key for a kid that was changed/updated (optional) - updated = int(data.get("updated")) + while i < len(kid_list): + batch_kids = kid_list[i : i + batch_size] + batch_keys = {kid: normalized_keys[kid] for kid in batch_kids} - return added + updated + try: + response = self.session.post( + url=f"{self.uri}/{service.lower()}", + json={"content_keys": batch_keys}, + headers={"Accept": "application/json"}, + ) + + # Check for HTTP errors that suggest batch is too large + if response.status_code in (413, 414, 400) and batch_size > 1: + if batch_size > 100: + batch_size -= 100 + else: + batch_size = 1 + continue + + data = response.json() + except Exception: + # JSON decode error or connection issue - try smaller batch + if batch_size > 1: + if batch_size > 100: + batch_size -= 100 + else: + batch_size = 1 + continue + raise + + code = int(data.get("code", 0)) + message = data.get("message") + error = { + 0: None, + 1: Exceptions.AuthRejected, + 2: Exceptions.TooManyRequests, + 3: Exceptions.ServiceTagInvalid, + 4: Exceptions.KeyIdInvalid, + 5: Exceptions.ContentKeyInvalid, + }.get(code, ValueError) + + if error: + raise error(f"{message} ({code})") + + # each kid:key that was new to the vault (optional) + added = int(data.get("added", 0)) + # each key for a kid that was changed/updated (optional) + updated = int(data.get("updated", 0)) + + total_added += added + updated + i += batch_size + + return total_added def get_services(self) -> Iterator[str]: data = self.session.post(url=self.uri, headers={"Accept": "application/json"}).json() diff --git a/unshackle/vaults/SQLite.py b/unshackle/vaults/SQLite.py index f1922d7..a3f6447 100644 --- a/unshackle/vaults/SQLite.py +++ b/unshackle/vaults/SQLite.py @@ -119,9 +119,25 @@ class SQLite(Vault): cursor = conn.cursor() try: - placeholders = ",".join(["?"] * len(kid_keys)) - cursor.execute(f"SELECT kid FROM `{service}` WHERE kid IN ({placeholders})", list(kid_keys.keys())) - existing_kids = {row[0] for row in cursor.fetchall()} + # Query existing KIDs in batches to avoid SQLite variable limit + # Try larger batch first (newer SQLite supports 32766), fall back to 500 if needed + existing_kids: set[str] = set() + kid_list = list(kid_keys.keys()) + batch_size = 32000 + + i = 0 + while i < len(kid_list): + batch = kid_list[i : i + batch_size] + placeholders = ",".join(["?"] * len(batch)) + try: + cursor.execute(f"SELECT kid FROM `{service}` WHERE kid IN ({placeholders})", batch) + existing_kids.update(row[0] for row in cursor.fetchall()) + i += batch_size + except sqlite3.OperationalError as e: + if "too many SQL variables" in str(e) and batch_size > 500: + batch_size = 500 + continue + raise new_keys = {kid: key for kid, key in kid_keys.items() if kid not in existing_kids} diff --git a/uv.lock b/uv.lock index f2ad4bb..ee38efe 100644 --- a/uv.lock +++ b/uv.lock @@ -1565,7 +1565,7 @@ wheels = [ [[package]] name = "unshackle" -version = "2.1.0" +version = "2.3.0" source = { editable = "." } dependencies = [ { name = "aiohttp-swagger3" },