diff --git a/unshackle/core/downloaders/requests.py b/unshackle/core/downloaders/requests.py index fd23a41..65a61e8 100644 --- a/unshackle/core/downloaders/requests.py +++ b/unshackle/core/downloaders/requests.py @@ -138,7 +138,6 @@ def download( chunks = stream.stream() elif use_raw: # requests.Session: raw socket read — 30-35% faster than iter_content - stream.raw.decode_content = False _read = stream.raw.read def _chunks() -> Generator[bytes, None, None]: diff --git a/unshackle/core/utilities.py b/unshackle/core/utilities.py index b57ca14..727c41c 100644 --- a/unshackle/core/utilities.py +++ b/unshackle/core/utilities.py @@ -1,5 +1,6 @@ import ast import contextlib +import gzip import importlib.util import json import logging @@ -10,6 +11,7 @@ import sys import time import traceback import unicodedata +import zlib from collections import defaultdict from datetime import datetime, timezone from pathlib import Path @@ -478,12 +480,29 @@ def try_ensure_utf8(data: bytes) -> bytes: """ Try to ensure that the given data is encoded in UTF-8. + Automatically decompresses gzip/deflate/zlib data before encoding detection. + This handles cases where HTTP responses are saved with raw Content-Encoding + (e.g., when decode_content=False is used for performance). + Parameters: data: Input data that may or may not yet be UTF-8 or another encoding. Returns the input data encoded in UTF-8 if successful. If unable to detect the encoding of the input data, then the original data is returned as-received. """ + # Decompress gzip data (magic bytes: 1f 8b) + if data[:2] == b"\x1f\x8b": + try: + data = gzip.decompress(data) + except Exception: + pass + # Decompress raw deflate/zlib data (common zlib headers: 78 01, 78 5e, 78 9c, 78 da) + elif data[:1] == b"\x78" and len(data) > 1 and data[1:2] in (b"\x01", b"\x5e", b"\x9c", b"\xda"): + try: + data = zlib.decompress(data) + except Exception: + pass + try: data.decode("utf8") return data