Mirror of https://github.com/unshackle-dl/unshackle.git
perf(downloader): optimize hot loop and threading efficiency
- replace list.pop(0) with deque.popleft() for O(1) speed-tracker eviction
- skip urllib3's decode chain with decode_content=False on raw reads
- use a running total instead of sum() for progress reporting
- add an explicit stream.close() on the CurlSession path
- replace the busy-poll loop with concurrent.futures.wait(FIRST_COMPLETED)
- skip ThreadPoolExecutor for single-URL downloads
- DRY up the duplicated raw/iter_content progress logic into a unified chunk iterator
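For the pop(0) to popleft() swap, a minimal standalone sketch (names and window length here are illustrative, not the project's exact code): evicting stale samples from the front of a list shifts every remaining element, O(n) per eviction, while deque.popleft() is O(1).

    import time
    from collections import deque

    SPEED_ROLLING_WINDOW = 5.0  # seconds; illustrative value

    # Rolling window of (timestamp, total_bytes_written) samples.
    sizes: deque[tuple[float, int]] = deque()

    def record(written: int) -> float:
        """Append a sample, return bytes/sec over the window (hypothetical helper)."""
        now = time.time()
        sizes.append((now, written))
        cutoff = now - SPEED_ROLLING_WINDOW
        while sizes and sizes[0][0] < cutoff:
            sizes.popleft()  # O(1); list.pop(0) here would shift every element
        elapsed = (now - sizes[0][0]) or 1.0
        return (sizes[-1][1] - sizes[0][1]) / elapsed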
@@ -1,6 +1,8 @@
 import math
 import os
 import time
+from collections import deque
+from concurrent.futures import FIRST_COMPLETED, wait
 from concurrent.futures.thread import ThreadPoolExecutor
 from http.cookiejar import CookieJar
 from pathlib import Path
@@ -74,7 +76,7 @@ def download(
     session = session or Session()

     if _speed_tracker is None:
-        _speed_tracker = {"sizes": [], "last_refresh": time.time()}
+        _speed_tracker = {"sizes": deque(), "last_refresh": time.time()}

     save_dir = save_path.parent
     control_file = save_path.with_name(f"{save_path.name}.!dev")
@@ -95,7 +97,6 @@ def download(
     try:
         while True:
             written = 0
-            download_sizes: list[int] = []
            last_speed_refresh = _time()

             try:
@@ -127,33 +128,30 @@ def download(
                 # Cache f.write for hot loop
                 _write = f.write

+                # Build chunk iterator — raw reads for requests.Session, iter_content for CurlSession
                 if use_raw:
-                    # Raw socket read — 30-35% faster than iter_content (benchmarked)
-                    # Safe in worker threads with Queue-based event dispatch
+                    stream.raw.decode_content = False
                     _read = stream.raw.read

+                    def _chunks() -> Generator[bytes, None, None]:
                         while True:
                             chunk = _read(chunk_size)
                             if not chunk:
                                 break
-                            _write(chunk)
-                            download_size = len(chunk)
-                            written += download_size
-
-                            if not segmented:
-                                yield dict(advance=1)
-                            now = _time()
-                            time_since = now - last_speed_refresh
-                            download_sizes.append(download_size)
-                            if time_since > PROGRESS_WINDOW or download_size < chunk_size:
-                                data_size = sum(download_sizes)
-                                download_speed = math.ceil(data_size / (time_since or 1))
-                                yield dict(downloaded=f"{filesize.decimal(download_speed)}/s")
-                                last_speed_refresh = now
-                                download_sizes.clear()
+                            yield chunk
                         stream.close()

+                    chunks = _chunks()
                 else:
                     # CurlSession: use iter_content (raw not available)
-                    for chunk in stream.iter_content(chunk_size=chunk_size):
+                    def _chunks_iter() -> Generator[bytes, None, None]:
+                        yield from stream.iter_content(chunk_size=chunk_size)
+                        stream.close()
+
+                    chunks = _chunks_iter()
+
+                # Unified write + progress loop
+                _data_accumulated = 0
+                for chunk in chunks:
                     _write(chunk)
                     download_size = len(chunk)
                     written += download_size
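The shape of the hunk above, reduced to a standalone sketch (function names here are illustrative): both transports become generators that only yield bytes, so the write and progress bookkeeping lives in one loop instead of being duplicated per transport.

    from typing import Callable, Generator, Iterable

    def raw_chunks(read: Callable[[int], bytes], chunk_size: int) -> Generator[bytes, None, None]:
        # requests path: read straight off the raw socket buffer
        while True:
            chunk = read(chunk_size)
            if not chunk:
                break
            yield chunk

    def consume(chunks: Iterable[bytes], write: Callable[[bytes], int]) -> int:
        # The single loop both paths now feed into
        written = 0
        for chunk in chunks:
            write(chunk)
            written += len(chunk)
        return written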
@@ -162,13 +160,12 @@ def download(
                         yield dict(advance=1)
                     now = _time()
                     time_since = now - last_speed_refresh
-                    download_sizes.append(download_size)
+                    _data_accumulated += download_size
                     if time_since > PROGRESS_WINDOW or download_size < chunk_size:
-                        data_size = sum(download_sizes)
-                        download_speed = math.ceil(data_size / (time_since or 1))
+                        download_speed = math.ceil(_data_accumulated / (time_since or 1))
                         yield dict(downloaded=f"{filesize.decimal(download_speed)}/s")
                         last_speed_refresh = now
-                        download_sizes.clear()
+                        _data_accumulated = 0

             # Truncate to actual written size in case pre-allocation overshot
             if content_length > 0 and written != content_length:
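Why the running total matters, as a toy comparison (stand-in data): the old code re-summed the per-chunk size list at every refresh, O(n) in the chunks seen since the last refresh, while an integer accumulator costs O(1) per chunk and allocates nothing.

    chunks = [b"x" * 1024] * 8  # stand-in for downloaded chunks

    # Before: grow a list, then sum() it at each refresh
    sizes = [len(chunk) for chunk in chunks]
    total_before = sum(sizes)

    # After: one integer updated per chunk (no list, no re-summing)
    total_after = 0
    for chunk in chunks:
        total_after += len(chunk)

    assert total_before == total_after == 8 * 1024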
@@ -187,7 +184,7 @@ def download(
         sizes.append((now, written))
         cutoff = now - SPEED_ROLLING_WINDOW
         while sizes and sizes[0][0] < cutoff:
-            sizes.pop(0)
+            sizes.popleft()
         time_since = now - _speed_tracker["last_refresh"]
         if sizes and time_since > PROGRESS_WINDOW:
             window_start = sizes[0][0]
@@ -338,9 +335,18 @@ def requests(
     yield dict(total=len(urls))

     # Per-call speed tracker — shared across threads within this call only
-    speed_tracker: dict[str, Any] = {"sizes": [], "last_refresh": time.time()}
+    speed_tracker: dict[str, Any] = {"sizes": deque(), "last_refresh": time.time()}

     try:
+        # Fast path: single URL — no thread pool overhead
+        if len(urls) == 1:
+            yield from download(
+                session=session,
+                segmented=segmented_batch,
+                _speed_tracker=speed_tracker,
+                **urls[0],
+            )
+        else:
             with ThreadPoolExecutor(max_workers=max_workers) as pool:
                 event_queue: Queue[dict[str, Any]] = Queue()

@@ -364,9 +370,9 @@ def requests(
                     except Empty:
                         break

-                    done = {future for future in pending if future.done()}
-                    for future in done:
-                        pending.remove(future)
+                    # Wait efficiently for next future completion (OS condition variable)
+                    completed, pending = wait(pending, timeout=0.1, return_when=FIRST_COMPLETED)
+                    for future in completed:
                         exc = future.exception()
                         if isinstance(exc, KeyboardInterrupt):
                             DOWNLOAD_CANCELLED.set()
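The scheduling change above, as a runnable sketch with a hypothetical job function: concurrent.futures.wait blocks on a condition variable until at least one future finishes (or the timeout lapses), instead of re-scanning future.done() over the whole pending set on every pass.

    import time
    from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

    def job(n: int) -> int:  # hypothetical stand-in for one download task
        time.sleep(n * 0.1)
        return n

    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = {pool.submit(job, n) for n in range(4)}
        while pending:
            # Sleeps until a future completes; the timeout keeps the loop
            # responsive to other work (here, the progress-event queue).
            completed, pending = wait(pending, timeout=0.1, return_when=FIRST_COMPLETED)
            for future in completed:
                print("finished:", future.result())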
@@ -392,12 +398,6 @@ def requests(
                                 )
                             raise exc

-                    if pending:
-                        try:
-                            yield event_queue.get(timeout=0.1)
-                        except Empty:
-                            pass
-
                 # Drain any remaining events from workers that just finished
                 while True:
                     try: