mirror of
https://github.com/unshackle-dl/unshackle.git
synced 2026-05-16 21:59:26 +00:00
fix(dash): add SIDX parsing for SegmentBase manifests and deduplicate multi-period segments
Multi-period DASH manifests using SegmentBase with shared BaseURLs were downloading the entire file once per period, causing massive file size inflation. Parse the SIDX box to extract proper per-segment byte ranges and deduplicate identical segments across periods.
This commit is contained in:
@@ -6,6 +6,7 @@ import logging
|
||||
import math
|
||||
import re
|
||||
import shutil
|
||||
import struct
|
||||
import sys
|
||||
from copy import copy
|
||||
from functools import partial
|
||||
@@ -330,7 +331,7 @@ class DASH:
|
||||
period_count = len(content_periods)
|
||||
|
||||
if period_count > 1:
|
||||
log.info(f"Multi-period manifest detected with {period_count} content periods")
|
||||
log.debug(f"Multi-period manifest detected with {period_count} content periods")
|
||||
|
||||
for period_idx, content_period in enumerate(content_periods):
|
||||
# Find the matching representation in this period
|
||||
@@ -371,7 +372,9 @@ class DASH:
|
||||
if p_kid and track_kid and p_kid != track_kid:
|
||||
log.debug(f"Period {content_period.get('id', period_idx)} has different KID: {p_kid}")
|
||||
|
||||
segments.extend(p_segments)
|
||||
for seg in p_segments:
|
||||
if seg not in segments:
|
||||
segments.append(seg)
|
||||
segment_durations.extend(p_durations)
|
||||
|
||||
if not segments:
|
||||
@@ -568,6 +571,49 @@ class DASH:
|
||||
|
||||
progress(downloaded="Downloaded")
|
||||
|
||||
@staticmethod
def _parse_sidx(data: bytes, index_range: str) -> list[str]:
    """Parse a SIDX box to extract per-segment byte ranges.

    Walks the top-level boxes in ``data`` until a ``sidx`` box is found, then
    converts each reference's size into an inclusive ``"start-end"`` byte
    range. The first range starts at ``end_of_index_range + 1 + first_offset``
    and every following range starts right after the previous one.

    Parameters:
        data: Raw bytes expected to contain a ``sidx`` box (typically the
            response body of a ranged request for ``index_range``).
        index_range: Byte range of the index in ``"start-end"`` form. Only the
            end value is used, as the anchor for the first segment.

    Returns:
        A list of ``"start-end"`` strings, one per SIDX reference. An empty
        list is returned when no ``sidx`` box is found or when the box is
        truncated, so the caller can fall back to a single whole-file range
        instead of crashing on ``struct.error``.

    Raises:
        IndexError: If ``index_range`` contains no ``-`` separator.
        ValueError: If the end of ``index_range`` is not an integer.
    """
    total = len(data)
    offset = 0

    while offset < total - 8:
        box_size, box_type = struct.unpack_from(">I4s", data, offset)
        if box_size < 8 or box_type != b"sidx":
            # Not the box we want; always advance by at least one box header
            # so that a zero/invalid size cannot stall the scan.
            offset += max(box_size, 8)
            continue

        version = data[offset + 8]
        # Skip box header (8), version + flags (4), reference_ID (4), timescale (4).
        pos = offset + 8 + 4 + 4 + 4

        # earliest_presentation_time and first_offset are 32-bit in version 0
        # and 64-bit otherwise; they are followed by reserved (2 bytes) and
        # reference_count (2 bytes).
        fields_fmt = ">IIHH" if version == 0 else ">QQHH"
        fields_len = struct.calcsize(fields_fmt)
        if pos + fields_len > total:
            # Truncated header: report "no segments" so the caller falls back.
            return []
        _earliest, first_offset, _reserved, reference_count = struct.unpack_from(fields_fmt, data, pos)
        pos += fields_len

        # NOTE(review): the anchor is taken as the byte after index_range, which
        # assumes index_range ends exactly where the sidx box ends - confirm.
        idx_end = int(index_range.split("-")[1])
        current_offset = idx_end + 1 + first_offset
        segments = []

        for _ in range(reference_count):
            if pos + 4 > total:
                # A partial list would silently drop media, so discard it.
                return []
            (ref_info,) = struct.unpack_from(">I", data, pos)
            # Top bit is the reference_type flag; the low 31 bits are the size.
            ref_size = ref_info & 0x7FFFFFFF
            pos += 12  # ref_info + subseg_duration + SAP fields
            seg_end = current_offset + ref_size - 1
            segments.append(f"{current_offset}-{seg_end}")
            current_offset = seg_end + 1

        return segments

    return []
|
||||
|
||||
@staticmethod
|
||||
def _is_content_period(period: Element, filtered_period_ids: list[str]) -> bool:
|
||||
"""Check if a period is a valid content period (not an ad, not filtered, not trick mode)."""
|
||||
@@ -768,7 +814,19 @@ class DASH:
|
||||
if total_size:
|
||||
media_range = f"{len(init_data)}-{total_size}"
|
||||
|
||||
segments.append((rep_base_url, media_range))
|
||||
# Parse SIDX box from indexRange to get per-segment byte ranges
|
||||
index_range = segment_base.get("indexRange")
|
||||
if index_range:
|
||||
sidx_res = session.get(url=rep_base_url, headers={"Range": f"bytes={index_range}"})
|
||||
sidx_res.raise_for_status()
|
||||
sidx_segments = DASH._parse_sidx(sidx_res.content, index_range)
|
||||
if sidx_segments:
|
||||
for seg_range in sidx_segments:
|
||||
segments.append((rep_base_url, seg_range))
|
||||
else:
|
||||
segments.append((rep_base_url, media_range))
|
||||
else:
|
||||
segments.append((rep_base_url, media_range))
|
||||
elif rep_base_url:
|
||||
segments.append((rep_base_url, None))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user