fix(subtitle): decompress gzip/zlib responses for subtitle downloads

The requests downloader set decode_content=False on raw socket reads, which skipped HTTP Content-Encoding decompression. Subtitle files served with Content-Encoding: gzip were therefore saved as raw compressed bytes and then mangled when try_ensure_utf8 fell back to CP1252 decoding. Remove decode_content=False from the raw read path — the speed gain comes from raw.read() itself, not from skipping decompression. Also add gzip/zlib magic-byte detection to try_ensure_utf8 as a safety net for any edge case where compressed data still reaches encoding detection.
2026-07-15 12:27:24 +00:00 · 2026-03-24 17:44:23 -06:00
parent 99be88dc08
commit c930abc6fd
2 changed files with 19 additions and 1 deletions
--- a/unshackle/core/downloaders/requests.py
+++ b/unshackle/core/downloaders/requests.py
@@ -138,7 +138,6 @@ def download(
                        chunks = stream.stream()
                    elif use_raw:
                        # requests.Session: raw socket read — 30-35% faster than iter_content
-                        stream.raw.decode_content = False
                        _read = stream.raw.read

                        def _chunks() -> Generator[bytes, None, None]:
--- a/unshackle/core/utilities.py
+++ b/unshackle/core/utilities.py
@@ -1,5 +1,6 @@
 import ast
 import contextlib
+import gzip
 import importlib.util
 import json
 import logging
@@ -10,6 +11,7 @@ import sys
 import time
 import traceback
 import unicodedata
+import zlib
 from collections import defaultdict
 from datetime import datetime, timezone
 from pathlib import Path
@@ -478,12 +480,29 @@ def try_ensure_utf8(data: bytes) -> bytes:
    """
    Try to ensure that the given data is encoded in UTF-8.

+    Automatically decompresses gzip or zlib (HTTP "deflate") data before encoding detection.
+    This handles HTTP response bodies that were saved still compressed per their
+    Content-Encoding (e.g., when decode_content=False is used for performance).
+
    Parameters:
        data: Input data that may or may not yet be UTF-8 or another encoding.

    Returns the input data encoded in UTF-8 if successful. If unable to detect the
    encoding of the input data, then the original data is returned as-received.
    """
+    # Decompress gzip data (magic bytes: 1f 8b)
+    if data[:2] == b"\x1f\x8b":
+        try:
+            data = gzip.decompress(data)
+        except Exception:
+            pass
+    # Decompress zlib-wrapped deflate data (common zlib headers: 78 01, 78 5e, 78 9c, 78 da)
+    elif data[:1] == b"\x78" and len(data) > 1 and data[1:2] in (b"\x01", b"\x5e", b"\x9c", b"\xda"):
+        try:
+            data = zlib.decompress(data)
+        except Exception:
+            pass
+
    try:
        data.decode("utf8")
        return data