mirror of
https://github.com/unshackle-dl/unshackle.git
synced 2026-05-16 21:59:26 +00:00
fix(subtitle): decompress gzip/zlib responses for subtitle downloads
The requests downloader used decode_content=False on raw socket reads, which skipped HTTP content-encoding decompression. Subtitle files served with Content-Encoding: gzip were saved as raw compressed bytes, then mangled by try_ensure_utf8 falling back to CP1252 decoding. Remove decode_content=False from the raw read path — the speed gain comes from raw.read() itself, not from skipping decompression. Also add gzip/zlib magic byte detection in try_ensure_utf8 as a safety net for any edge cases where compressed data reaches encoding detection.
This commit is contained in:
@@ -138,7 +138,6 @@ def download(
|
|||||||
chunks = stream.stream()
|
chunks = stream.stream()
|
||||||
elif use_raw:
|
elif use_raw:
|
||||||
# requests.Session: raw socket read — 30-35% faster than iter_content
|
# requests.Session: raw socket read — 30-35% faster than iter_content
|
||||||
stream.raw.decode_content = False
|
|
||||||
_read = stream.raw.read
|
_read = stream.raw.read
|
||||||
|
|
||||||
def _chunks() -> Generator[bytes, None, None]:
|
def _chunks() -> Generator[bytes, None, None]:
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import ast
|
import ast
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import gzip
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
@@ -10,6 +11,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
import zlib
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -478,12 +480,29 @@ def try_ensure_utf8(data: bytes) -> bytes:
|
|||||||
"""
|
"""
|
||||||
Try to ensure that the given data is encoded in UTF-8.
|
Try to ensure that the given data is encoded in UTF-8.
|
||||||
|
|
||||||
|
Automatically decompresses gzip- or zlib-wrapped data before encoding detection.
|
||||||
|
This handles cases where HTTP response bodies were saved still compressed, with their Content-Encoding left undecoded
|
||||||
|
(e.g., when decode_content=False is used for performance).
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
data: Input data that may or may not yet be UTF-8 or another encoding.
|
data: Input data that may or may not yet be UTF-8 or another encoding.
|
||||||
|
|
||||||
Returns the input data encoded in UTF-8 if successful. If unable to detect the
|
Returns the input data encoded in UTF-8 if successful. If unable to detect the
|
||||||
encoding of the input data, then the original data is returned as-received.
|
encoding of the input data, then the original data is returned as-received.
|
||||||
"""
|
"""
|
||||||
|
# Decompress gzip data (magic bytes: 1f 8b)
|
||||||
|
if data[:2] == b"\x1f\x8b":
|
||||||
|
try:
|
||||||
|
data = gzip.decompress(data)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Decompress zlib-wrapped deflate data (common zlib headers: 78 01, 78 5e, 78 9c, 78 da)
|
||||||
|
elif data[:1] == b"\x78" and len(data) > 1 and data[1:2] in (b"\x01", b"\x5e", b"\x9c", b"\xda"):
|
||||||
|
try:
|
||||||
|
data = zlib.decompress(data)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
data.decode("utf8")
|
data.decode("utf8")
|
||||||
return data
|
return data
|
||||||
|
|||||||
Reference in New Issue
Block a user