mirror of
https://github.com/unshackle-dl/unshackle.git
synced 2026-05-17 06:09:29 +00:00
fix(dash): add SIDX parsing for SegmentBase manifests and deduplicate multi-period segments
Multi-period DASH manifests using SegmentBase with shared BaseURLs were downloading the entire file once per period, causing massive file size inflation. Parse the SIDX box to extract proper per-segment byte ranges and deduplicate identical segments across periods.
This commit is contained in:
@@ -6,6 +6,7 @@ import logging
|
|||||||
import math
|
import math
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
import struct
|
||||||
import sys
|
import sys
|
||||||
from copy import copy
|
from copy import copy
|
||||||
from functools import partial
|
from functools import partial
|
||||||
@@ -330,7 +331,7 @@ class DASH:
|
|||||||
period_count = len(content_periods)
|
period_count = len(content_periods)
|
||||||
|
|
||||||
if period_count > 1:
|
if period_count > 1:
|
||||||
log.info(f"Multi-period manifest detected with {period_count} content periods")
|
log.debug(f"Multi-period manifest detected with {period_count} content periods")
|
||||||
|
|
||||||
for period_idx, content_period in enumerate(content_periods):
|
for period_idx, content_period in enumerate(content_periods):
|
||||||
# Find the matching representation in this period
|
# Find the matching representation in this period
|
||||||
@@ -371,7 +372,9 @@ class DASH:
|
|||||||
if p_kid and track_kid and p_kid != track_kid:
|
if p_kid and track_kid and p_kid != track_kid:
|
||||||
log.debug(f"Period {content_period.get('id', period_idx)} has different KID: {p_kid}")
|
log.debug(f"Period {content_period.get('id', period_idx)} has different KID: {p_kid}")
|
||||||
|
|
||||||
segments.extend(p_segments)
|
for seg in p_segments:
|
||||||
|
if seg not in segments:
|
||||||
|
segments.append(seg)
|
||||||
segment_durations.extend(p_durations)
|
segment_durations.extend(p_durations)
|
||||||
|
|
||||||
if not segments:
|
if not segments:
|
||||||
@@ -568,6 +571,49 @@ class DASH:
|
|||||||
|
|
||||||
progress(downloaded="Downloaded")
|
progress(downloaded="Downloaded")
|
||||||
|
|
||||||
|
@staticmethod
def _parse_sidx(data: bytes, index_range: str) -> list[str]:
    """Parse a SIDX box to extract per-segment byte ranges.

    Walks the top-level boxes in ``data`` until a ``sidx`` box is found, then
    turns each of its references into an inclusive ``"start-end"`` byte range.

    Parameters:
        data: Raw bytes expected to contain a ``sidx`` box.
        index_range: The ``"start-end"`` byte range string the index was
            requested with. The first segment is placed at
            ``end + 1 + first_offset``.

    Returns:
        One ``"start-end"`` string per reference, in index order. An empty
        list is returned when no ``sidx`` box is found, when ``index_range``
        cannot be parsed, or when the box is truncated, so the caller can fall
        back to a single whole-file range instead of crashing.
    """
    try:
        idx_end = int(index_range.split("-")[1])
    except (AttributeError, IndexError, ValueError):
        return []

    total = len(data)
    offset = 0
    while offset + 8 <= total:
        box_size, box_type = struct.unpack_from(">I4s", data, offset)
        if box_size < 8 or box_type != b"sidx":
            # Always advance by at least one box header so a zero or bogus
            # size field can never stall the scan.
            offset += max(box_size, 8)
            continue

        pos = offset + 8
        try:
            version = data[pos]
            pos += 12  # version + flags, reference_ID, timescale

            # earliest_presentation_time and first_offset are 32-bit fields
            # in version 0 and 64-bit fields otherwise; only first_offset is
            # needed here.
            if version == 0:
                _, first_offset = struct.unpack_from(">II", data, pos)
                pos += 8
            else:
                _, first_offset = struct.unpack_from(">QQ", data, pos)
                pos += 16

            pos += 2  # reserved
            reference_count = struct.unpack_from(">H", data, pos)[0]
            pos += 2

            current_offset = idx_end + 1 + first_offset
            segments = []
            for _ in range(reference_count):
                # The top bit of the first word is masked off; the remaining
                # 31 bits are used as the referenced size.
                ref_size = struct.unpack_from(">I", data, pos)[0] & 0x7FFFFFFF
                pos += 12  # ref_info + subseg_duration + SAP fields
                seg_end = current_offset + ref_size - 1
                segments.append(f"{current_offset}-{seg_end}")
                current_offset = seg_end + 1
        except (IndexError, struct.error):
            # Truncated box: a partial segment list would silently drop
            # media, so report "no usable index" instead.
            return []

        return segments

    return []
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _is_content_period(period: Element, filtered_period_ids: list[str]) -> bool:
|
def _is_content_period(period: Element, filtered_period_ids: list[str]) -> bool:
|
||||||
"""Check if a period is a valid content period (not an ad, not filtered, not trick mode)."""
|
"""Check if a period is a valid content period (not an ad, not filtered, not trick mode)."""
|
||||||
@@ -768,6 +814,18 @@ class DASH:
|
|||||||
if total_size:
|
if total_size:
|
||||||
media_range = f"{len(init_data)}-{total_size}"
|
media_range = f"{len(init_data)}-{total_size}"
|
||||||
|
|
||||||
|
# Parse SIDX box from indexRange to get per-segment byte ranges
|
||||||
|
index_range = segment_base.get("indexRange")
|
||||||
|
if index_range:
|
||||||
|
sidx_res = session.get(url=rep_base_url, headers={"Range": f"bytes={index_range}"})
|
||||||
|
sidx_res.raise_for_status()
|
||||||
|
sidx_segments = DASH._parse_sidx(sidx_res.content, index_range)
|
||||||
|
if sidx_segments:
|
||||||
|
for seg_range in sidx_segments:
|
||||||
|
segments.append((rep_base_url, seg_range))
|
||||||
|
else:
|
||||||
|
segments.append((rep_base_url, media_range))
|
||||||
|
else:
|
||||||
segments.append((rep_base_url, media_range))
|
segments.append((rep_base_url, media_range))
|
||||||
elif rep_base_url:
|
elif rep_base_url:
|
||||||
segments.append((rep_base_url, None))
|
segments.append((rep_base_url, None))
|
||||||
|
|||||||
Reference in New Issue
Block a user