fix(subtitle): decompress gzip/zlib responses for subtitle downloads

The requests downloader set decode_content=False for raw socket reads, which bypassed HTTP Content-Encoding decompression. As a result, subtitle files served with Content-Encoding: gzip were saved as raw compressed bytes and then mangled when try_ensure_utf8 fell back to CP1252 decoding.

Remove decode_content=False from the raw read path — the speed gain comes from raw.read() itself, not from skipping decompression. Also add gzip/zlib magic-byte detection to try_ensure_utf8 as a safety net for edge cases where compressed data still reaches encoding detection.
This commit is contained in:
Andy
2026-03-24 17:44:23 -06:00
parent 99be88dc08
commit c930abc6fd
2 changed files with 19 additions and 1 deletions

View File

@@ -138,7 +138,6 @@ def download(
chunks = stream.stream() chunks = stream.stream()
elif use_raw: elif use_raw:
# requests.Session: raw socket read — 30-35% faster than iter_content # requests.Session: raw socket read — 30-35% faster than iter_content
stream.raw.decode_content = False
_read = stream.raw.read _read = stream.raw.read
def _chunks() -> Generator[bytes, None, None]: def _chunks() -> Generator[bytes, None, None]:

View File

@@ -1,5 +1,6 @@
import ast import ast
import contextlib import contextlib
import gzip
import importlib.util import importlib.util
import json import json
import logging import logging
@@ -10,6 +11,7 @@ import sys
import time import time
import traceback import traceback
import unicodedata import unicodedata
import zlib
from collections import defaultdict from collections import defaultdict
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@@ -478,12 +480,29 @@ def try_ensure_utf8(data: bytes) -> bytes:
""" """
Try to ensure that the given data is encoded in UTF-8. Try to ensure that the given data is encoded in UTF-8.
Automatically decompresses gzip/deflate/zlib data before encoding detection.
This handles cases where HTTP responses are saved with raw Content-Encoding
(e.g., when decode_content=False is used for performance).
Parameters: Parameters:
data: Input data that may or may not yet be UTF-8 or another encoding. data: Input data that may or may not yet be UTF-8 or another encoding.
Returns the input data encoded in UTF-8 if successful. If unable to detect the Returns the input data encoded in UTF-8 if successful. If unable to detect the
encoding of the input data, then the original data is returned as-received. encoding of the input data, then the original data is returned as-received.
""" """
# Decompress gzip data (magic bytes: 1f 8b)
if data[:2] == b"\x1f\x8b":
try:
data = gzip.decompress(data)
except Exception:
pass
# Decompress raw deflate/zlib data (common zlib headers: 78 01, 78 5e, 78 9c, 78 da)
elif data[:1] == b"\x78" and len(data) > 1 and data[1:2] in (b"\x01", b"\x5e", b"\x9c", b"\xda"):
try:
data = zlib.decompress(data)
except Exception:
pass
try: try:
data.decode("utf8") data.decode("utf8")
return data return data