fix(dash): add SIDX parsing for SegmentBase manifests and deduplicate multi-period segments

Multi-period DASH manifests using SegmentBase with shared BaseURLs were downloading the entire file once per period, causing massive file size inflation. Parse the SIDX box to extract proper per-segment byte ranges and deduplicate identical segments across periods.
This commit is contained in:
imSp4rky
2026-04-06 02:45:17 +00:00
parent fef68202e9
commit c5aa57c9db

View File

@@ -6,6 +6,7 @@ import logging
import math import math
import re import re
import shutil import shutil
import struct
import sys import sys
from copy import copy from copy import copy
from functools import partial from functools import partial
@@ -330,7 +331,7 @@ class DASH:
period_count = len(content_periods) period_count = len(content_periods)
if period_count > 1: if period_count > 1:
log.info(f"Multi-period manifest detected with {period_count} content periods") log.debug(f"Multi-period manifest detected with {period_count} content periods")
for period_idx, content_period in enumerate(content_periods): for period_idx, content_period in enumerate(content_periods):
# Find the matching representation in this period # Find the matching representation in this period
@@ -371,7 +372,9 @@ class DASH:
if p_kid and track_kid and p_kid != track_kid: if p_kid and track_kid and p_kid != track_kid:
log.debug(f"Period {content_period.get('id', period_idx)} has different KID: {p_kid}") log.debug(f"Period {content_period.get('id', period_idx)} has different KID: {p_kid}")
segments.extend(p_segments) for seg in p_segments:
if seg not in segments:
segments.append(seg)
segment_durations.extend(p_durations) segment_durations.extend(p_durations)
if not segments: if not segments:
@@ -568,6 +571,49 @@ class DASH:
progress(downloaded="Downloaded") progress(downloaded="Downloaded")
@staticmethod
def _parse_sidx(data: bytes, index_range: str) -> list[str]:
    """Parse a SIDX box to extract per-segment byte ranges.

    Args:
        data: Bytes fetched for ``index_range``; scanned box by box for the
            first ``sidx`` box.
        index_range: The ``"start-end"`` byte range the data was fetched
            with. Only the end value is used: the first referenced byte is
            taken to be ``end + 1 + first_offset``, i.e. this assumes the
            sidx box finishes on the last byte of the index range.

    Returns:
        Inclusive ``"first-last"`` byte-range strings, one per non-empty
        reference, in file order. An empty list is returned when no sidx box
        is found, when ``index_range`` cannot be parsed, or when the box is
        truncated, so the caller can fall back to a whole-range download
        instead of failing on a short or malformed response.
    """
    try:
        index_end = int(index_range.split("-")[1])
    except (IndexError, ValueError):
        return []

    total = len(data)
    offset = 0
    while offset < total - 8:
        box_size = struct.unpack_from(">I", data, offset)[0]
        if box_size < 8 or data[offset + 4:offset + 8] != b"sidx":
            # Not a usable sidx box: step over it. Sizes below 8 (including
            # the special values 0 and 1) are not interpreted; the scan just
            # advances past the 8-byte header.
            offset += max(box_size, 8)
            continue

        version = data[offset + 8]
        # Fixed part of the box: 8 (header) + 4 (version/flags)
        # + 4 (reference_ID) + 4 (timescale) + time/offset pair
        # + 2 (reserved) + 2 (reference_count).
        header_len = 32 if version == 0 else 40
        if offset + header_len > total:
            return []

        pos = offset + 20
        if version == 0:
            _, first_offset = struct.unpack_from(">II", data, pos)
            pos += 8
        else:
            _, first_offset = struct.unpack_from(">QQ", data, pos)
            pos += 16
        _, reference_count = struct.unpack_from(">HH", data, pos)
        pos += 4

        # Every reference entry is 12 bytes; refuse a table that runs past
        # the data rather than returning a partial (incomplete) segment list.
        refs_end = pos + 12 * reference_count
        if refs_end > total:
            return []

        current_offset = index_end + 1 + first_offset
        segments = []
        for ref_pos in range(pos, refs_end, 12):
            # The top bit is reference_type; the low 31 bits are the size.
            ref_size = struct.unpack_from(">I", data, ref_pos)[0] & 0x7FFFFFFF
            if ref_size == 0:
                # An empty reference would yield an inverted range
                # ("N-(N-1)"), which is not a valid HTTP byte range.
                continue
            seg_end = current_offset + ref_size - 1
            segments.append(f"{current_offset}-{seg_end}")
            current_offset = seg_end + 1
        return segments

    return []
@staticmethod @staticmethod
def _is_content_period(period: Element, filtered_period_ids: list[str]) -> bool: def _is_content_period(period: Element, filtered_period_ids: list[str]) -> bool:
"""Check if a period is a valid content period (not an ad, not filtered, not trick mode).""" """Check if a period is a valid content period (not an ad, not filtered, not trick mode)."""
@@ -768,6 +814,18 @@ class DASH:
if total_size: if total_size:
media_range = f"{len(init_data)}-{total_size}" media_range = f"{len(init_data)}-{total_size}"
# Parse SIDX box from indexRange to get per-segment byte ranges
index_range = segment_base.get("indexRange")
if index_range:
sidx_res = session.get(url=rep_base_url, headers={"Range": f"bytes={index_range}"})
sidx_res.raise_for_status()
sidx_segments = DASH._parse_sidx(sidx_res.content, index_range)
if sidx_segments:
for seg_range in sidx_segments:
segments.append((rep_base_url, seg_range))
else:
segments.append((rep_base_url, media_range))
else:
segments.append((rep_base_url, media_range)) segments.append((rep_base_url, media_range))
elif rep_base_url: elif rep_base_url:
segments.append((rep_base_url, None)) segments.append((rep_base_url, None))