mirror of
https://github.com/unshackle-dl/unshackle.git
synced 2026-06-22 17:07:23 +00:00
fix(ism): rebuild moov init segment for Smooth Streaming decrypt
ISM (Smooth Streaming) tracks are raw-concatenated moof+mdat fragments with no ftyp/moov, so shaka-packager/mp4decrypt fail with PARSER_FAILURE (exit 2) on decrypt. The init box was previously built by n_m3u8dl_re, which was removed in the downloader consolidation.

Add ism_init.py, a dependency-free byte-level MP4 init-segment synthesizer that rebuilds ftyp+moov from the manifest CodecPrivateData, ported from yt-dlp's write_piff_header and N_m3u8DL-RE's MSSMoovProcessor with full codec parity:

- AVC (H264/AVC1/DAVC), with SPS/PPS picked by NAL type rather than position and NALUnitLengthField honored
- HEVC (HVC1/HEV1) with chroma format and bit depths parsed from the de-emulated SPS via exp-Golomb so 10-bit/HDR signals correctly, and profile/tier/level lifted from the SPS PTL
- Dolby Vision (DVHE/DVH1) as hvcC with a dvh1 sample entry
- AAC (AACL/AACH) with the AudioSpecificConfig synthesized from SamplingRate/Channels when the manifest omits CodecPrivateData
- EC-3 with a real dec3 box extracted from the WAVEFORMATEXTENSIBLE CodecPrivateData (Dolby GUID located by search, not fixed offset)
- TTML subtitles as stpp/sthd/subt, wired for fragmented-TTML tracks

CENC wrapping (encv/enca + sinf/tenc with default_KID) covers encrypted tracks: the per-sample IV size is derived from the fragment senc/saiz (PIFF override flag, payload arithmetic, saiz fallback) instead of assuming 8, and the constant-IV tenc form is supported. Read the track_ID from the first fragment's tfhd so the moov matches and the muxer does not drop samples.

Wire ISM.download_track to prepend the synthesized init before merging; unsupported codecs soft-fail to raw concatenation with a warning.

Harden against real-world inputs: 2-letter/uppercase manifest language tags normalize to ISO-639-2 (und fallback), >65535 Hz sample rates no longer overflow the 16.16 field, truncated tfhd returns None, struct.error joins the soft-fail handler, and the emulation-prevention scan no longer over-strips consecutive escapes.
Add regression tests (37) covering box structure, every supported FourCC, 10-bit SPS parsing, ASC synthesis, dec3 extraction, IV-size derivation and the crash fixes. Validated structurally per codec with ffmpeg-minted fragments: shaka-packager parses synth-init+fragments with exit 0 and ffprobe reports the expected codec, including a live run against a public Smooth Streaming server.
This commit is contained in:
410
tests/core/test_ism_init.py
Normal file
410
tests/core/test_ism_init.py
Normal file
@@ -0,0 +1,410 @@
|
||||
"""Regression tests for ISM init-segment synthesis (ftyp + moov).
|
||||
|
||||
Smooth Streaming fragments carry no moov; the init box must be rebuilt from the
|
||||
manifest CodecPrivateData before shaka/mp4decrypt can parse the stream. These
|
||||
guard the byte-level box structure so a future downloader refactor cannot
|
||||
silently drop it again (the c323db9 regression).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import struct
|
||||
|
||||
import pytest
|
||||
|
||||
from unshackle.core.manifests.ism_init import (NAL_START_CODE, PIFF_SENC_UUID, box, build_avcc, build_dec3,
|
||||
build_hvcc, build_init_segment, full_box, parse_hevc_sps_format,
|
||||
read_per_sample_iv_size, read_track_id, remove_emulation_prevention,
|
||||
split_nal_units, synthesize_aac_codec_private_data)
|
||||
|
||||
# Real CodecPrivateData taken from a Smooth Streaming manifest.
# Hex string of start-code-delimited HEVC NAL units (VPS, SPS, PPS in that order,
# as asserted by test_split_nal_units_drops_start_codes).
VIDEO_HEVC_CPD = (
    "0000000140010C01FFFF01600000030090000003000003009695980900000001420101016000000300900000"
    "030000030096A001E020064165959A4930BC05A80808082000007D20000BB801000000014401C172B66240"
)
# H.264 SPS+PPS (start-code delimited) for the AVC path.
VIDEO_AVC_CPD = "00000001674d401e9a6602800b76020000003e90000bb800f18311200000000168ebccb22c"
# 10-bit (Main 10) HEVC VPS+SPS+PPS minted with x265; ffprobe reads the
# synthesized init as "Main 10 / yuv420p10le".
VIDEO_HEVC10_CPD = (
    "0000000140010c01ffff02200000030090000003000003003c959809000000000142010102200000030090"
    "000003000003003ca00a080b9f6d96566924caf0168080000003008000000c8400000000014401c172b4624000"
)
# Two-byte AAC AudioSpecificConfig (hex); the synthesis test expects the same
# value for AAC-LC, 48 kHz, 2 channels.
AAC_LC_CPD = "1190"
# Real Smooth EC-3 CodecPrivateData: WAVEFORMATEXTENSIBLE extension (samples
# per block + channel mask + DD+ GUID) followed by the 5-byte dec3 payload.
EC3_CPD = "00063F000000AF87FBA7022DFB42A4D405CD93843BDD0600200F00"
# 16-byte key ID passed as ``kid=`` in the encrypted-track tests, which then
# look for these exact bytes inside the generated init segment.
KID = bytes.fromhex("09fd2bd778bb544785ed2322dc6a7d87")
|
||||
|
||||
|
||||
def top_level_boxes(data: bytes) -> list[tuple[str, int]]:
    """Return ``(type, size)`` for every top-level ISO-BMFF box in *data*.

    Handles the two special size encodings: ``size == 1`` (a 64-bit largesize
    follows the type) and ``size == 0`` (the box extends to the end of the data).
    Trailing bytes too short to hold a box header are ignored.

    Raises:
        ValueError: if a largesize header is truncated, or a box declares a
            size smaller than its own header. Failing loudly keeps a test
            from passing on a misaligned walk.
    """
    boxes: list[tuple[str, int]] = []
    total = len(data)
    offset = 0
    while offset + 8 <= total:
        size, raw_type = struct.unpack_from(">I4s", data, offset)
        header = 8
        if size == 1:
            if offset + 16 > total:
                raise ValueError(f"truncated largesize box header at offset {offset}")
            (size,) = struct.unpack_from(">Q", data, offset + 8)
            header = 16
        elif size == 0:
            size = total - offset
        if size < header:
            # Advancing by less than a header would re-read header bytes as a box.
            raise ValueError(f"box at offset {offset} declares impossible size {size}")
        boxes.append((raw_type.decode("latin1"), size))
        offset += size
    return boxes
|
||||
|
||||
|
||||
def test_split_nal_units_drops_start_codes():
    """The HEVC CodecPrivateData must split into exactly VPS, SPS and PPS."""
    units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    # HEVC NAL type = (first_byte >> 1) & 0x3F; 32 = VPS, 33 = SPS, 34 = PPS.
    nal_types = [(unit[0] >> 1) & 0x3F for unit in units]
    assert nal_types == [32, 33, 34]
|
||||
|
||||
|
||||
def test_hevc_init_structure():
    """An unencrypted HEVC init is exactly ftyp + moov with an hvc1/hvcC entry."""
    params = {
        "stream_type": "video",
        "fourcc": "HVC1",
        "codec_private_data": VIDEO_HEVC_CPD,
        "timescale": 10000000,
        "width": 3840,
        "height": 1600,
    }
    init = build_init_segment(**params)
    boxes = top_level_boxes(init)
    assert [name for name, _ in boxes] == ["ftyp", "moov"]
    # The two boxes must account for every byte of the output.
    assert boxes[0][1] + boxes[1][1] == len(init)
    assert b"hvcC" in init
    assert b"hvc1" in init
    # Unencrypted: no protection scheme boxes.
    assert b"encv" not in init
    assert b"sinf" not in init
|
||||
|
||||
|
||||
def test_avc_init_structure():
    """An H264 track yields an init that starts with ftyp and carries avc1/avcC."""
    params = {
        "stream_type": "video",
        "fourcc": "H264",
        "codec_private_data": VIDEO_AVC_CPD,
        "timescale": 10000000,
        "width": 1280,
        "height": 720,
    }
    init = build_init_segment(**params)
    # Bytes 4..8 of the output are the first box's type.
    assert init[4:8] == b"ftyp"
    assert b"avcC" in init
    assert b"avc1" in init
|
||||
|
||||
|
||||
def test_aac_audio_init_structure():
    """An AACL track yields an mp4a sample entry with esds and a sound header."""
    params = {
        "stream_type": "audio",
        "fourcc": "AACL",
        "codec_private_data": AAC_LC_CPD,
        "timescale": 10000000,
        "channels": 2,
        "sampling_rate": 48000,
    }
    init = build_init_segment(**params)
    assert b"mp4a" in init
    assert b"esds" in init
    assert b"smhd" in init  # sound media header, not video
|
||||
|
||||
|
||||
def test_encrypted_init_has_cenc_boxes():
    """Passing a KID wraps the video entry in encv with sinf/frma/tenc (cenc)."""
    params = {
        "stream_type": "video",
        "fourcc": "HVC1",
        "codec_private_data": VIDEO_HEVC_CPD,
        "timescale": 10000000,
        "width": 3840,
        "height": 1600,
        "kid": KID,
    }
    init = build_init_segment(**params)
    # Encrypted sample entry is wrapped: encv -> sinf(frma+schm+schi(tenc)).
    for marker in (b"encv", b"sinf", b"frma", b"tenc", b"cenc"):
        assert marker in init
    # The 16-byte default_KID must be embedded verbatim for shaka to map the key.
    assert KID in init
    # Original codec preserved inside frma for the muxer.
    assert b"hvc1" in init
|
||||
|
||||
|
||||
def test_unsupported_codec_raises():
    """An unknown FourCC raises NotImplementedError for the caller to soft-fail on."""
    # Unknown FourCC (e.g. VC-1); caller soft-fails to raw concat.
    params = {
        "stream_type": "video",
        "fourcc": "WVC1",
        "codec_private_data": "00063F00",
        "timescale": 10000000,
    }
    with pytest.raises(NotImplementedError):
        build_init_segment(**params)
|
||||
|
||||
|
||||
def test_ec3_init_embeds_dec3_from_codec_private_data():
    """The EC-3 entry carries a dec3 box built from the CodecPrivateData tail."""
    params = {
        "stream_type": "audio",
        "fourcc": "EC-3",
        "codec_private_data": EC3_CPD,
        "timescale": 10000000,
        "channels": 6,
        "sampling_rate": 48000,
    }
    init = build_init_segment(**params)
    assert b"ec-3" in init
    # dec3 payload = CodecPrivateData past the 22-byte WAVEFORMATEXTENSIBLE header.
    expected_dec3 = box(b"dec3", bytes.fromhex(EC3_CPD)[22:])
    assert expected_dec3 in init
    assert b"esds" not in init  # no MPEG-4 descriptor inside an ec-3 entry
|
||||
|
||||
|
||||
def test_ec3_encrypted_wraps_enca_with_frma():
    """Encrypted EC-3 is wrapped in enca and keeps 'ec-3' as the frma format."""
    params = {
        "stream_type": "audio",
        "fourcc": "EC-3",
        "codec_private_data": EC3_CPD,
        "timescale": 10000000,
        "channels": 6,
        "kid": KID,
    }
    init = build_init_segment(**params)
    for marker in (b"enca", b"sinf", b"tenc"):
        assert marker in init
    assert box(b"frma", b"ec-3") in init
    assert KID in init
|
||||
|
||||
|
||||
def test_ec3_dec3_found_in_full_waveformatextensible():
    """build_dec3 still finds the payload when 18 extra header bytes precede it."""
    # Some services ship the full WAVEFORMATEX header (18 bytes) before the
    # extension; the dec3 payload still follows the DD+ GUID.
    extension = bytes.fromhex(EC3_CPD)
    waveformatex_prefix = b"\xfe\xff" + b"\x00" * 16
    expected = box(b"dec3", extension[22:])
    assert build_dec3(waveformatex_prefix + extension) == expected
|
||||
|
||||
|
||||
def test_ec3_without_dolby_guid_builds_bare_entry():
    """Without the Dolby GUID there is no dec3, but the ec-3 entry is still built."""
    assert build_dec3(b"\x00\x06\x3f\x00") is None
    params = {
        "stream_type": "audio",
        "fourcc": "EC-3",
        "codec_private_data": "",
        "timescale": 10000000,
        "channels": 6,
    }
    init = build_init_segment(**params)
    assert b"ec-3" in init
    assert b"dec3" not in init
|
||||
|
||||
|
||||
def test_aac_codec_private_data_synthesis_matches_real_manifest():
    """Synthesized AAC-LC config for 48 kHz stereo equals the manifest value."""
    # 48 kHz stereo AAC-LC must produce 0x1190 — the exact ASC real manifests carry.
    synthesized = synthesize_aac_codec_private_data("AACL", 48000, 2)
    assert synthesized.hex() == "1190"
|
||||
|
||||
|
||||
def test_aach_synthesis_signals_sbr():
    """HE-AAC synthesis must signal AOT 5 and an extension rate of twice the core.

    AudioSpecificConfig bit layout for explicit SBR signalling:
    audioObjectType(5) | samplingFrequencyIndex(4) | channelConfiguration(4) |
    extensionSamplingFrequencyIndex(4) | ...
    The 4-bit extension index therefore occupies the low three bits of byte 1
    plus the top bit of byte 2.
    """
    asc = synthesize_aac_codec_private_data("AACH", 24000, 2)
    assert len(asc) == 4
    assert asc[0] >> 3 == 0x05  # AOT 5 = SBR (HE-AAC)
    # Extension sampling frequency = core * 2 = 48 kHz (index 3).
    # Mask all three bits from byte 1: a 0x01 mask checks only half of the
    # field and would accept a wrong index such as 0xB.
    extension_index = ((asc[1] & 0x07) << 1) | (asc[2] >> 7)
    assert extension_index == 0x03
|
||||
|
||||
|
||||
def test_aac_init_without_codec_private_data_synthesizes_asc():
    """With empty CodecPrivateData the init still embeds the expected AAC config."""
    params = {
        "stream_type": "audio",
        "fourcc": "AACL",
        "codec_private_data": "",
        "timescale": 10000000,
        "channels": 2,
        "sampling_rate": 48000,
    }
    init = build_init_segment(**params)
    assert b"mp4a" in init
    assert b"esds" in init
    # The synthesized config bytes must appear in the output.
    assert bytes.fromhex(AAC_LC_CPD) in init
|
||||
|
||||
|
||||
def test_dolby_vision_uses_dvh1_sample_entry():
    """A DVH1 track is written with a dvh1 entry and hvcC, never hvc1."""
    params = {
        "stream_type": "video",
        "fourcc": "DVH1",
        "codec_private_data": VIDEO_HEVC_CPD,
        "timescale": 10000000,
        "width": 3840,
        "height": 1600,
    }
    init = build_init_segment(**params)
    assert b"dvh1" in init
    assert b"hvcC" in init
    assert b"hvc1" not in init
|
||||
|
||||
|
||||
def test_davc_maps_to_avc1():
    """The DAVC FourCC is written as an avc1 sample entry with avcC."""
    params = {
        "stream_type": "video",
        "fourcc": "DAVC",
        "codec_private_data": VIDEO_AVC_CPD,
        "timescale": 10000000,
    }
    init = build_init_segment(**params)
    assert b"avc1" in init
    assert b"avcC" in init
|
||||
|
||||
|
||||
def test_lowercase_fourcc_normalized():
    """A lowercase FourCC is accepted and still produces an hvcC box."""
    # Real manifests ship FourCC="hvc1" in lowercase.
    params = {
        "stream_type": "video",
        "fourcc": "hvc1",
        "codec_private_data": VIDEO_HEVC_CPD,
        "timescale": 10000000,
    }
    init = build_init_segment(**params)
    assert b"hvcC" in init
|
||||
|
||||
|
||||
def test_avcc_selects_sps_pps_by_nal_type_not_position():
    """avcC profile/compat/level come from the SPS even when the PPS is first."""
    sps, pps = split_nal_units(bytes.fromhex(VIDEO_AVC_CPD))[:2]
    # Re-serialize with the PPS ahead of the SPS.
    reordered = b"".join((NAL_START_CODE, pps, NAL_START_CODE, sps))
    avcc = build_avcc(reordered)
    # Profile/compat/level must still come from the SPS body.
    assert avcc[9:12] == sps[1:4]
|
||||
|
||||
|
||||
def test_nal_length_field_respected():
    """nal_length_size=2 is encoded as lengthSizeMinusOne == 1 in avcC."""
    avcc = build_avcc(bytes.fromhex(VIDEO_AVC_CPD), nal_length_size=2)
    # avcC payload byte 4 low 2 bits = lengthSizeMinusOne (index 12 including
    # the 8-byte box header).
    length_size_minus_one = avcc[12] & 0x03
    assert length_size_minus_one == 1
|
||||
|
||||
|
||||
def test_parse_hevc_sps_format_8bit():
    """The 8-bit test vector parses as chroma 4:2:0 with zero extra bit depth."""
    units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    sps_rbsp = remove_emulation_prevention(units[1])  # second unit is the SPS
    assert parse_hevc_sps_format(sps_rbsp) == (1, 0, 0)  # 4:2:0, 8-bit
|
||||
|
||||
|
||||
def test_hvcc_signals_10bit_from_sps():
    """The Main 10 vector parses as 10-bit and hvcC carries the same values."""
    raw = bytes.fromhex(VIDEO_HEVC10_CPD)
    # Pick the SPS by NAL type (33) rather than by position.
    sps = next(unit for unit in split_nal_units(raw) if (unit[0] >> 1) & 0x3F == 33)
    assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 2, 2)  # 4:2:0, 10-bit
    payload = build_hvcc(raw)[8:]  # strip box header
    chroma_byte, luma_byte, chroma_depth_byte = payload[16], payload[17], payload[18]
    assert chroma_byte == 0xFC | 0x01  # chromaFormat 4:2:0
    assert luma_byte == 0xF8 | 0x02  # bitDepthLumaMinus8 = 2
    assert chroma_depth_byte == 0xF8 | 0x02  # bitDepthChromaMinus8 = 2
|
||||
|
||||
|
||||
def test_ttml_init_structure():
    """A TTML text track yields stpp/sthd/subt boxes and the TTML namespace."""
    params = {
        "stream_type": "text",
        "fourcc": "TTML",
        "codec_private_data": "",
        "timescale": 10000000,
        "language": "eng",
    }
    init = build_init_segment(**params)
    assert b"stpp" in init
    assert b"sthd" in init  # subtitle media header
    assert b"subt" in init
    assert b"SubtitleHandler\0" in init
    assert b"http://www.w3.org/ns/ttml\0" in init
|
||||
|
||||
|
||||
def test_constant_iv_tenc_form():
    """With constant_iv, tenc has per-sample IV size 0 and the IV after the KID."""
    constant_iv = bytes(range(16))
    params = {
        "stream_type": "video",
        "fourcc": "HVC1",
        "codec_private_data": VIDEO_HEVC_CPD,
        "timescale": 10000000,
        "kid": KID,
        "constant_iv": constant_iv,
    }
    init = build_init_segment(**params)
    # Constant-IV form: default_Per_Sample_IV_Size = 0, then size + IV after the KID.
    kid_then_iv = KID + bytes([len(constant_iv)]) + constant_iv
    assert kid_then_iv in init
    tenc_at = init.index(b"tenc")
    # Skip the 4-byte type, 4-byte version/flags and 3 leading payload bytes.
    iv_size_offset = tenc_at + 4 + 4 + 3
    assert init[iv_size_offset] == 0  # default_Per_Sample_IV_Size
|
||||
|
||||
|
||||
def make_fragment(senc: bytes = b"", saiz: bytes = b"") -> bytes:
    """Build a minimal moof(traf(tfhd [+ senc] [+ saiz])) + mdat test fragment.

    The tfhd carries track_ID 1 followed by four zero bytes; *senc* and *saiz*
    are appended verbatim after it inside the traf.
    """
    tfhd_payload = struct.pack(">I", 1) + b"\x00" * 4
    traf_children = full_box(b"tfhd", 0, 0, tfhd_payload) + senc + saiz
    moof = box(b"moof", box(b"traf", traf_children))
    mdat = box(b"mdat", b"\x00" * 4)
    return moof + mdat
|
||||
|
||||
|
||||
def test_iv_size_from_piff_senc_override_flag():
    """A PIFF senc uuid box with flag bit 1 reports its explicit IV size (16)."""
    # PIFF senc uuid with flags&1: AlgorithmID(3) + IV_size(1) + KID(16) override.
    version_and_flags = b"\x00\x00\x00\x01"
    algorithm_id = b"\x00\x00\x01"
    sample_count = struct.pack(">I", 0)
    payload = version_and_flags + algorithm_id + bytes([16]) + KID + sample_count
    piff_senc = box(b"uuid", PIFF_SENC_UUID + payload)
    assert read_per_sample_iv_size(make_fragment(senc=piff_senc)) == 16
|
||||
|
||||
|
||||
def test_iv_size_from_senc_payload_length():
    """Without subsamples the IV size follows from payload length / sample count."""
    # Standard senc, no subsamples: 3 samples x 8-byte IVs.
    sample_count = struct.pack(">I", 3)
    ivs = b"\x11" * 24
    senc = full_box(b"senc", 0, 0, sample_count + ivs)
    assert read_per_sample_iv_size(make_fragment(senc=senc)) == 8
|
||||
|
||||
|
||||
def test_iv_size_from_senc_with_subsamples():
    """With the subsample flag set, an 8-byte IV is still recovered."""
    # senc flags&2: per sample IV(8) + entry_count(2) + 6 bytes per entry.
    iv = b"\x22" * 8
    one_subsample = struct.pack(">H", 1) + b"\x00" * 6
    per_sample = iv + one_subsample
    senc = full_box(b"senc", 0, 2, struct.pack(">I", 2) + per_sample * 2)
    assert read_per_sample_iv_size(make_fragment(senc=senc)) == 8
|
||||
|
||||
|
||||
def test_iv_size_from_saiz_fallback():
    """With no senc present, the saiz default sample info size (16) is used."""
    # saiz payload: one-byte default size of 16, then a 32-bit count of 5.
    saiz_payload = bytes([16]) + struct.pack(">I", 5)
    saiz = full_box(b"saiz", 0, 0, saiz_payload)
    assert read_per_sample_iv_size(make_fragment(saiz=saiz)) == 16
|
||||
|
||||
|
||||
def test_iv_size_undetermined_returns_none():
    """A fragment with neither senc nor saiz gives no IV size."""
    bare_fragment = make_fragment()
    assert read_per_sample_iv_size(bare_fragment) is None
|
||||
|
||||
|
||||
def test_hvcc_embeds_vps_sps_pps():
    """Every NAL unit from the CodecPrivateData appears verbatim inside hvcC."""
    raw = bytes.fromhex(VIDEO_HEVC_CPD)
    hvcc = build_hvcc(raw)
    # Each original NAL unit (VPS/SPS/PPS) is embedded verbatim in the arrays.
    for unit in split_nal_units(raw):
        assert unit in hvcc
|
||||
|
||||
|
||||
def test_avcc_requires_sps_and_pps():
    """build_avcc rejects CodecPrivateData that holds an SPS but no PPS."""
    sps_only = b"\x00\x00\x00\x01\x67only_sps"
    with pytest.raises(ValueError):
        build_avcc(sps_only)
|
||||
|
||||
|
||||
def test_read_track_id_from_fragment():
    """read_track_id returns the track_ID stored in moof/traf/tfhd."""
    # Minimal moof/traf/tfhd carrying track_ID = 7.
    tfhd_payload = struct.pack(">I", 7) + b"\x00" * 4
    tfhd = full_box("tfhd".encode(), 0, 0, tfhd_payload)
    moof = box(b"moof", box(b"traf", tfhd))
    fragment = moof + box(b"mdat", b"\x00\x00")
    assert read_track_id(fragment) == 7
|
||||
|
||||
|
||||
def test_read_track_id_missing_returns_none():
    """Data without a moof box has no track_ID to report."""
    mdat_only = box(b"mdat", b"\x00\x00")
    assert read_track_id(mdat_only) is None
|
||||
|
||||
|
||||
def test_remove_emulation_prevention():
    """Each 00 00 03 escape loses its 0x03; the following byte is kept as data."""
    cases = [
        # 00 00 03 XX -> the 0x03 emulation byte is dropped.
        (b"\x00\x00\x03\x01", b"\x00\x00\x01"),
        (b"\x00\x00\x03\x00\x00\x03\x96", b"\x00\x00\x00\x00\x96"),
        # The byte after a consumed escape is data, even another 0x03.
        (b"\x00\x00\x03\x03", b"\x00\x00\x03"),
        (b"\x00\x00\x03\x03\x00\x00\x03\x01", b"\x00\x00\x03\x00\x00\x01"),
    ]
    for escaped, expected in cases:
        assert remove_emulation_prevention(escaped) == expected
|
||||
|
||||
|
||||
def test_two_letter_or_uppercase_language_falls_back_to_und():
    """Language tags that are not three a-z letters must not crash the builder."""
    # mdhd packs three a-z letters; "en"/"ENG" must not crash struct.pack.
    awkward_tags = ("en", "ENG", "", "e1x")
    for tag in awkward_tags:
        params = {
            "stream_type": "audio",
            "fourcc": "AACL",
            "codec_private_data": AAC_LC_CPD,
            "timescale": 10000000,
            "language": tag,
        }
        init = build_init_segment(**params)
        assert init[4:8] == b"ftyp"
|
||||
|
||||
|
||||
def test_high_sampling_rate_does_not_overflow():
    """A 96 kHz sampling rate still produces an mp4a entry without raising."""
    # 96 kHz exceeds the 16.16 integer field; written as 0 like ffmpeg does.
    params = {
        "stream_type": "audio",
        "fourcc": "AACL",
        "codec_private_data": "",
        "timescale": 10000000,
        "sampling_rate": 96000,
    }
    init = build_init_segment(**params)
    assert b"mp4a" in init
|
||||
|
||||
|
||||
def test_read_track_id_truncated_tfhd_returns_none():
    """A tfhd whose payload is too short for a track_ID yields None."""
    short_tfhd = full_box(b"tfhd", 0, 0, b"\x00\x00")  # too short for a track_ID
    traf = box(b"traf", short_tfhd)
    fragment = box(b"moof", traf)
    assert read_track_id(fragment) is None
|
||||
|
||||
|
||||
def test_hvcc_profile_tier_level_is_nonzero():
    """hvcC built from the real vector carries a non-zero profile and level."""
    # De-emulated PTL must yield real profile/level, not the off-by-one garbage.
    payload = build_hvcc(bytes.fromhex(VIDEO_HEVC_CPD))[8:]  # strip box header
    profile_idc = payload[1] & 0x1F  # low five bits of payload byte 1
    level_idc = payload[12]
    assert profile_idc != 0
    assert level_idc != 0
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import base64
|
||||
import hashlib
|
||||
import html
|
||||
import struct
|
||||
import urllib.parse
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
@@ -18,6 +19,7 @@ from requests import Session
|
||||
from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
|
||||
from unshackle.core.drm import DRM_T, PlayReady, Widevine
|
||||
from unshackle.core.events import events
|
||||
from unshackle.core.manifests.ism_init import build_init_segment, read_per_sample_iv_size, read_track_id
|
||||
from unshackle.core.session import RnetSession
|
||||
from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video
|
||||
from unshackle.core.utilities import log_event, try_ensure_utf8
|
||||
@@ -85,6 +87,104 @@ class ISM:
|
||||
drm.append(PlayReady(pssh=pr_pssh, pssh_b64=data))
|
||||
return drm
|
||||
|
||||
@staticmethod
def _init_segment(
    track: AnyTrack, session_drm: Optional[DRM_T], first_segment: Optional[bytes] = None
) -> Optional[bytes]:
    """Synthesize the ftyp+moov init segment for an ISM track, or return None.

    Smooth fragments are moof+mdat only; the init box is rebuilt from the
    manifest CodecPrivateData (and KID, when encrypted) so the merged file is
    a valid MP4 that shaka/mp4decrypt can parse.

    Args:
        track: Track whose ``data["ism"]`` dict holds ``stream_index``,
            ``quality_level`` and optionally ``manifest``.
        session_drm: DRM object; the first entry of its ``kids`` (if any) is
            used as the default KID.
        first_segment: Bytes of the first downloaded fragment, probed for the
            track_ID and the per-sample IV size.

    Returns:
        The init segment bytes, or None when the track has no ISM metadata,
        is a non-fTTML subtitle, or synthesis failed. A failure is logged as
        a warning so the caller can fall back to raw concatenation.
    """
    ism = track.data.get("ism") if isinstance(getattr(track, "data", None), dict) else None
    if not ism:
        return None
    stream_index = ism.get("stream_index")
    quality_level = ism.get("quality_level")
    manifest = ism.get("manifest")
    if stream_index is None or quality_level is None:
        return None
    # CodecPrivateData may legitimately be empty (AAC config is synthesized,
    # EC-3 decoders sync from the frames); the builder handles each case.
    cpd = quality_level.get("CodecPrivateData") or ""
    fourcc = quality_level.get("FourCC") or ""

    try:
        # Everything that parses manifest attributes or fragment bytes lives
        # inside the soft-fail handler: a non-numeric attribute or a truncated
        # first fragment must degrade to raw concatenation, not abort.
        root_timescale = manifest.get("TimeScale") if manifest is not None else None
        timescale = int(stream_index.get("TimeScale") or root_timescale or 10000000)
        duration = int((manifest.get("Duration") if manifest is not None else 0) or 0)

        # mdhd needs a 3-letter ISO-639-2 code; manifests often carry 2-letter tags.
        lang_attr = (stream_index.get("Language") or "").strip()
        language = "und"
        if lang_attr and tag_is_valid(lang_attr):
            try:
                language = Language.get(lang_attr).to_alpha3()
            except LookupError:
                language = "und"

        kid: Optional[bytes] = None
        if session_drm is not None:
            kid_uuid = next(iter(getattr(session_drm, "kids", None) or []), None)
            if kid_uuid is not None:
                kid = bytes.fromhex(kid_uuid.hex)

        # Match the moov track_ID to the fragment's tfhd, else the muxer drops samples.
        track_id = (read_track_id(first_segment) if first_segment else None) or 1
        # NALUnitLengthField: bytes per NAL length prefix, default 4.
        nal_length_size = int(
            quality_level.get("NALUnitLengthField") or stream_index.get("NALUnitLengthField") or 4
        )
        # Per-sample IV size derived from the fragment senc/saiz (PIFF default 8).
        iv_size = (read_per_sample_iv_size(first_segment) if first_segment and kid else None) or 8

        if isinstance(track, Subtitle):
            if track.codec != Subtitle.Codec.fTTML:
                return None  # plain-text subtitle formats concatenate fine
            return build_init_segment(
                stream_type="text",
                fourcc="TTML",
                codec_private_data="",
                timescale=timescale,
                duration=duration,
                language=language,
                track_id=track_id,
            )
        if isinstance(track, Video):
            return build_init_segment(
                stream_type="video",
                fourcc=fourcc,
                codec_private_data=cpd,
                timescale=timescale,
                duration=duration,
                language=language,
                width=int(quality_level.get("MaxWidth") or stream_index.get("MaxWidth") or 0),
                height=int(quality_level.get("MaxHeight") or stream_index.get("MaxHeight") or 0),
                track_id=track_id,
                nal_length_size=nal_length_size,
                kid=kid,
                iv_size=iv_size,
            )
        return build_init_segment(
            stream_type="audio",
            fourcc=fourcc,
            codec_private_data=cpd,
            timescale=timescale,
            duration=duration,
            language=language,
            channels=int(quality_level.get("Channels") or 2),
            bits_per_sample=int(quality_level.get("BitsPerSample") or 16),
            sampling_rate=int(quality_level.get("SamplingRate") or 48000),
            track_id=track_id,
            kid=kid,
            iv_size=iv_size,
        )
    except (NotImplementedError, ValueError, IndexError, struct.error) as e:
        # Unsupported codec, malformed CodecPrivateData or out-of-range field —
        # fall back to raw concatenation rather than aborting the download.
        # IndexError covers reads past the end of a truncated parameter set
        # or fragment buffer.
        log_event(
            "manifest_ism_init_unsupported",
            level="WARNING",
            message=f"Could not synthesize ISM init segment ({fourcc}): {e}",
            context={"track_id": getattr(track, "id", None), "fourcc": fourcc},
        )
        return None
|
||||
|
||||
def to_tracks(self, language: Optional[Union[str, Language]] = None) -> Tracks:
|
||||
tracks = Tracks()
|
||||
base_url = self.url
|
||||
@@ -383,8 +483,13 @@ class ISM:
|
||||
raise FileNotFoundError(error_msg)
|
||||
|
||||
with open(save_path, "wb") as f:
|
||||
for segment_file in segments_to_merge:
|
||||
segment_data = segment_file.read_bytes()
|
||||
first_segment = segments_to_merge[0].read_bytes() if segments_to_merge else None
|
||||
init_segment = ISM._init_segment(track, session_drm, first_segment)
|
||||
if init_segment:
|
||||
f.write(init_segment)
|
||||
for index, segment_file in enumerate(segments_to_merge):
|
||||
# First segment was already read for the init synthesis — reuse it.
|
||||
segment_data = first_segment if index == 0 and first_segment else segment_file.read_bytes()
|
||||
if (
|
||||
not session_drm
|
||||
and isinstance(track, Subtitle)
|
||||
|
||||
622
unshackle/core/manifests/ism_init.py
Normal file
622
unshackle/core/manifests/ism_init.py
Normal file
@@ -0,0 +1,622 @@
|
||||
"""
|
||||
Synthesize an ISO-BMFF initialization segment (ftyp + moov) for ISM / Smooth
|
||||
Streaming tracks.
|
||||
|
||||
Smooth Streaming fragments are bare ``moof`` + ``mdat`` pairs; the server never
|
||||
sends a ``moov``. The init box must be reconstructed from the manifest's
|
||||
``CodecPrivateData`` (and, for protected content, the track KID) before a muxer
|
||||
or decryptor such as shaka-packager can parse the stream. Ported from yt-dlp's
|
||||
``write_piff_header`` and N_m3u8DL-RE's ``MSSMoovProcessor`` with HEVC, Dolby
|
||||
Vision, EC-3, TTML and CENC (PIFF) support.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import binascii
|
||||
import struct
|
||||
from typing import Iterator, Optional
|
||||
|
||||
# Big-endian field packers (named for the bit widths they encode).
u8 = struct.Struct(">B")
u16 = struct.Struct(">H")
u32 = struct.Struct(">I")
u64 = struct.Struct(">Q")
s16 = struct.Struct(">h")
s88 = struct.Struct(">bx")  # 8.8 fixed-point (integer part + zero fraction byte)
s1616 = struct.Struct(">hxx")  # 16.16 fixed-point (integer part + zero fraction)
u1616 = struct.Struct(">Hxx")
s32 = struct.Struct(">i")

# 3x3 transformation matrix (identity), as stored in tkhd/mvhd: exactly nine
# 32-bit values, row by row — {0x10000, 0, 0}, {0, 0x10000, 0}, {0, 0, 0x40000000}.
# It must be 36 bytes; any extra entry shifts every field written after the
# matrix (tkhd width/height, mvhd next_track_ID).
UNITY_MATRIX = (
    s32.pack(0x10000) + s32.pack(0) * 2
    + s32.pack(0) + s32.pack(0x10000) + s32.pack(0)
    + s32.pack(0) * 2 + s32.pack(0x40000000)
)
|
||||
|
||||
# Bit-flag values (the names suggest track-header and data-reference flags;
# their use sites are outside this view).
TRACK_ENABLED, TRACK_IN_MOVIE, TRACK_IN_PREVIEW = 0x1, 0x2, 0x4
SELF_CONTAINED = 0x1

# Fixed creation/modification time — deterministic output (no wall clock).
EPOCH = 0

# Four-byte start code that delimits NAL units in CodecPrivateData.
NAL_START_CODE = b"\x00\x00\x00\x01"

# WAVEFORMATEXTENSIBLE SubFormat GUID for Dolby Digital Plus, as serialized
# (little-endian) inside Smooth EC-3 CodecPrivateData.
DOLBY_DIGITAL_PLUS_GUID = bytes.fromhex("AF87FBA7022DFB42A4D405CD93843BDD")

# PIFF SampleEncryptionBox usertype (the pre-CENC 'senc' carried as a uuid box).
PIFF_SENC_UUID = bytes.fromhex("A2394F525A9B4F14A2446C427C648DF4")

# NUL-terminated TTML namespace URI.
TTML_NAMESPACE = b"http://www.w3.org/ns/ttml\0"

# ISO/IEC 14496-3 samplingFrequencyIndex table for AudioSpecificConfig.
# The rates are listed in index order, so each position is the index value.
AAC_SAMPLING_FREQUENCY_INDEX = {
    rate: index
    for index, rate in enumerate(
        (96000, 88200, 64000, 48000, 44100, 32000, 24000, 22050, 16000, 12000, 11025, 8000, 7350)
    )
}
|
||||
|
||||
|
||||
def box(box_type: bytes, payload: bytes) -> bytes:
    """Serialize a plain ISO-BMFF box: 32-bit size, four-byte type, payload.

    The size field counts the 8-byte header plus the payload.
    """
    total_size = len(payload) + 8
    header = u32.pack(total_size) + box_type
    return header + payload
|
||||
|
||||
|
||||
def full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
    """Serialize a FullBox: a box whose payload starts with version and flags.

    The version takes one byte; the flags are the low three bytes of the
    32-bit big-endian encoding of *flags*.
    """
    version_byte = u8.pack(version)
    flag_bytes = u32.pack(flags)[1:]  # drop the most significant byte
    return box(box_type, version_byte + flag_bytes + payload)
|
||||
|
||||
|
||||
def split_nal_units(codec_private_data: bytes) -> list[bytes]:
    """Return the NAL units found between four-byte start codes.

    Empty pieces (for example the one before a leading start code) are dropped.
    """
    pieces = codec_private_data.split(NAL_START_CODE)
    return list(filter(None, pieces))
|
||||
|
||||
|
||||
def remove_emulation_prevention(data: bytes) -> bytes:
    """Drop the 0x03 from every ``00 00 03`` escape sequence in *data*.

    Scanning resumes after the removed byte, so whatever follows an escape is
    kept as data, even another 0x03. Re-examining it would over-strip
    consecutive escapes and shift every later bit position.
    """
    escape = b"\x00\x00\x03"
    kept = []
    cursor = 0
    while True:
        hit = data.find(escape, cursor)
        if hit < 0:
            # No further escapes: keep the remainder untouched.
            kept.append(data[cursor:])
            break
        # Keep everything up to and including the two zero bytes, skip the 0x03.
        kept.append(data[cursor : hit + 2])
        cursor = hit + 3
    return b"".join(kept)
|
||||
|
||||
|
||||
class _BitstreamExhausted(ValueError, IndexError):
    """Raised when a read runs past the end of the bit buffer.

    It is a ValueError so "malformed input" handlers catch it, and it stays an
    IndexError so code written against the previous behaviour keeps working.
    """


class BitReader:
    """MSB-first bit reader with the exp-Golomb decode H.26x headers need.

    Attributes:
        data: The buffer being read.
        pos: Current position, counted in bits from the start of ``data``.
    """

    def __init__(self, data: bytes) -> None:
        self.data = data
        self.pos = 0

    def read_bits(self, count: int) -> int:
        """Read *count* bits as an unsigned big-endian integer.

        A non-positive *count* reads nothing and returns 0.

        Raises:
            ValueError: (also an IndexError) if fewer than *count* bits
                remain. The position is left unchanged in that case.
        """
        if count <= 0:
            return 0
        end = self.pos + count
        if end > len(self.data) * 8:
            raise _BitstreamExhausted(
                f"Bitstream exhausted: need {count} bits at bit {self.pos}, "
                f"only {len(self.data) * 8 - self.pos} left"
            )
        # Load only the bytes the read touches, then shift off the unused tail bits.
        first_byte = self.pos >> 3
        last_byte = (end + 7) >> 3
        window = int.from_bytes(self.data[first_byte:last_byte], "big")
        unused_tail_bits = (last_byte << 3) - end
        self.pos = end
        return (window >> unused_tail_bits) & ((1 << count) - 1)

    def read_ue(self) -> int:
        """Decode one unsigned exp-Golomb value (``ue(v)``).

        Raises:
            ValueError: if the code has more than 32 leading zero bits or the
                buffer ends mid-code.
        """
        zeros = 0
        while self.read_bits(1) == 0:
            zeros += 1
            if zeros > 32:
                raise ValueError("Invalid exp-Golomb code")
        return (1 << zeros) - 1 + (self.read_bits(zeros) if zeros else 0)
|
||||
|
||||
|
||||
def parse_hevc_sps_format(sps_rbsp: bytes) -> tuple[int, int, int]:
    """Extract chroma format and bit depths from a de-emulated HEVC SPS.

    Args:
        sps_rbsp: SPS with emulation-prevention bytes already removed, still
            including its 2-byte NAL header.

    Returns:
        ``(chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8)``.
    """
    bits = BitReader(sps_rbsp)
    bits.read_bits(16)  # NAL unit header
    bits.read_bits(4)  # sps_video_parameter_set_id
    sub_layer_count = bits.read_bits(3)  # sps_max_sub_layers_minus1
    bits.read_bits(1)  # sps_temporal_id_nesting_flag
    bits.read_bits(96)  # general profile_tier_level (12 bytes)

    # One (profile_present, level_present) flag pair per sub-layer, in that order.
    presence = [(bits.read_bits(1), bits.read_bits(1)) for _ in range(sub_layer_count)]
    if sub_layer_count > 0:
        bits.read_bits((8 - sub_layer_count) * 2)  # reserved_zero_2bits
    for has_profile, has_level in presence:
        if has_profile:
            bits.read_bits(88)  # sub_layer profile_tier
        if has_level:
            bits.read_bits(8)  # sub_layer_level_idc

    bits.read_ue()  # sps_seq_parameter_set_id
    chroma_format_idc = bits.read_ue()
    if chroma_format_idc == 3:
        bits.read_bits(1)  # separate_colour_plane_flag
    bits.read_ue()  # pic_width_in_luma_samples
    bits.read_ue()  # pic_height_in_luma_samples
    conformance_window_flag = bits.read_bits(1)
    if conformance_window_flag:
        for _ in range(4):  # the four conformance-window offsets
            bits.read_ue()
    luma_minus8 = bits.read_ue()
    chroma_minus8 = bits.read_ue()
    return chroma_format_idc, luma_minus8, chroma_minus8
|
||||
|
||||
|
||||
def iter_boxes(data: bytes, start: int, end: int) -> Iterator[tuple[bytes, Optional[bytes], int, int]]:
    """Yield ``(type, uuid_usertype, payload_start, box_end)`` for each child box.

    Walks the boxes laid out in ``data[start:end]``. ``uuid_usertype`` is the
    16-byte extended type for ``uuid`` boxes and None otherwise.
    ``payload_start`` points just past the box header (and past the usertype
    for ``uuid`` boxes).

    The walk never raises on malformed input. It stops at a truncated or
    impossible header, and a box that claims to extend past the parent is
    reported with ``box_end`` clamped to the available data, so callers that
    recurse with the yielded range stay inside the buffer.
    """
    end = min(end, len(data))
    offset = start
    while offset + 8 <= end:
        size, box_type = struct.unpack_from(">I4s", data, offset)
        header = 8
        if size == 1:
            # 64-bit largesize follows the type; bail out if it is cut off.
            if offset + 16 > end:
                return
            (size,) = struct.unpack_from(">Q", data, offset + 8)
            header = 16
        elif size == 0:
            size = end - offset  # box runs to the end of the enclosing range
        if size < header:  # corrupt box header; stop rather than loop forever
            return
        box_end = min(offset + size, end)
        usertype = None
        if box_type == b"uuid" and offset + header + 16 <= box_end:
            usertype = data[offset + header : offset + header + 16]
            header += 16
        yield box_type, usertype, offset + header, box_end
        offset += size
|
||||
|
||||
|
||||
def find_box(data: bytes, start: int, end: int, target: bytes) -> Optional[tuple[int, int]]:
    """Locate the first child box of type *target* within ``data[start:end]``.

    Returns:
        ``(payload_start, box_end)`` of the first match, or None if no child
        box has that type.
    """
    matches = (
        (payload_start, box_end)
        for kind, _usertype, payload_start, box_end in iter_boxes(data, start, end)
        if kind == target
    )
    return next(matches, None)
|
||||
|
||||
|
||||
def read_track_id(fragment: bytes) -> Optional[int]:
    """Read the track_ID from a fragment's moof/traf/tfhd box, if present.

    Smooth fragments declare their own track_ID; the synthesized moov must use
    the same value or the muxer cannot associate samples with the track. The
    track_ID sits before any tfhd optional fields, so the flags don't matter.

    Returns None when the moof/traf/tfhd chain is missing, or when the tfhd
    payload is too short to hold version(1) + flags(3) + track_ID(4) -- either
    because the fragment is truncated or because the box itself declares a
    smaller size (reading on would pick up bytes of the following box).
    """
    span: Optional[tuple[int, int]] = (0, len(fragment))
    for box_type in (b"moof", b"traf", b"tfhd"):
        span = find_box(fragment, *span, box_type)
        if not span:
            return None
    body, box_end = span
    if body + 8 > min(box_end, len(fragment)):  # truncated or undersized tfhd
        return None
    # tfhd payload: version(1) + flags(3) + track_ID(4)
    return struct.unpack_from(">I", fragment, body + 4)[0]
|
||||
|
||||
|
||||
def read_per_sample_iv_size(fragment: bytes) -> Optional[int]:
    """
    Derive the per-sample IV size (8 or 16) from a fragment's sample-encryption
    metadata, for the synthesized tenc default_Per_Sample_IV_Size.

    Checks, in order: the PIFF 'senc' uuid override flag (explicit IV size),
    the senc payload length (sample_count vs IV/subsample entries), and the
    saiz default_sample_info_size (only unambiguous without subsamples).

    Only 8 or 16 is ever returned; None means the size could not be
    determined. Every byte read is bounded by both the declared box end and
    the actual fragment length, so a truncated fragment falls through to the
    next heuristic instead of raising IndexError / struct.error. An override
    block carrying an IV size other than 8/16 is skipped (20 bytes) and the
    payload arithmetic is tried on the entries that follow it.
    """
    moof = find_box(fragment, 0, len(fragment), b"moof")
    if not moof:
        return None
    traf = find_box(fragment, *moof, b"traf")
    if not traf:
        return None

    available = len(fragment)
    senc: Optional[tuple[int, int]] = None
    saiz_default: Optional[int] = None
    for box_type, usertype, body, box_end in iter_boxes(fragment, *traf):
        if box_type == b"senc" or (box_type == b"uuid" and usertype == PIFF_SENC_UUID):
            senc = (body, box_end)
        elif box_type == b"saiz":
            flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
            pos = body + 4 + (8 if flags & 0x1 else 0)  # skip aux_info_type fields
            if pos < min(box_end, available):
                saiz_default = fragment[pos]

    senc_has_subsamples = False
    if senc:
        body, box_end = senc
        readable_end = min(box_end, available)  # never index past the buffer
        flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
        senc_has_subsamples = bool(flags & 0x2)
        pos = body + 4
        if flags & 0x1:  # PIFF override: AlgorithmID(3) + IV_size(1) + KID(16)
            if pos + 4 <= readable_end and fragment[pos + 3] in (8, 16):
                return fragment[pos + 3]
            pos += 20  # unusable override; sample entries start after it
        if pos + 4 <= readable_end:
            sample_count = struct.unpack_from(">I", fragment, pos)[0]
            pos += 4
            if sample_count and not senc_has_subsamples:
                # IV-only entries: the declared payload divides evenly.
                size, rem = divmod(box_end - pos, sample_count)
                if rem == 0 and size in (8, 16):
                    return size
            elif sample_count:
                # Walk the entries with each candidate IV size; the one that
                # lands exactly on the box end is correct.
                for iv_size in (8, 16):
                    cursor = pos
                    for _ in range(sample_count):
                        cursor += iv_size
                        if cursor + 2 > readable_end:
                            break
                        entries = struct.unpack_from(">H", fragment, cursor)[0]
                        cursor += 2 + 6 * entries
                        if cursor > box_end:
                            break
                    else:
                        if cursor == box_end:
                            return iv_size

    if not senc_has_subsamples and saiz_default in (8, 16):
        return saiz_default
    return None
|
||||
|
||||
|
||||
def build_avcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """Build an avcC (AVC decoder config) box from SPS+PPS CodecPrivateData.

    codec_private_data is split into NAL units with split_nal_units; the first
    SPS and first PPS found are embedded. nal_length_size is the number of
    bytes per NAL length prefix and is stored as lengthSizeMinusOne, masked to
    its 2-bit field the same way build_hvcc does.

    Raises ValueError when no SPS or PPS is present, or when the SPS is too
    short to carry the profile / compatibility / level bytes.
    """
    nals = split_nal_units(codec_private_data)
    # Pick parameter sets by H.264 NAL type (low 5 bits): 7 = SPS, 8 = PPS.
    # Manifests do not guarantee SPS-first ordering. Empty units are skipped
    # so a stray delimiter cannot raise IndexError.
    sps = next((n for n in nals if n and n[0] & 0x1F == 7), None)
    pps = next((n for n in nals if n and n[0] & 0x1F == 8), None)
    if not sps or not pps:
        raise ValueError("AVC CodecPrivateData must contain SPS and PPS NAL units")
    if len(sps) < 4:
        raise ValueError("AVC SPS NAL unit is too short to carry profile and level")
    payload = b"".join(
        (
            u8.pack(1),  # configuration version
            sps[1:4],  # profile / compat / level (from SPS NAL body)
            u8.pack(0xFC | ((nal_length_size - 1) & 0x03)),  # reserved + length size minus one
            u8.pack(0xE0 | 1),  # reserved + number of SPS (1)
            u16.pack(len(sps)),
            sps,
            u8.pack(1),  # number of PPS
            u16.pack(len(pps)),
            pps,
        )
    )
    return box(b"avcC", payload)
|
||||
|
||||
|
||||
def build_hvcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """
    Assemble an hvcC (HEVC decoder configuration) box from the VPS/SPS/PPS NAL
    units carried in CodecPrivateData.

    The 12 general profile/tier/level bytes are copied from the first SPS, and
    chroma format plus luma/chroma bit depths are parsed out of that SPS so
    10-bit/HDR streams are signalled correctly. If the SPS cannot be parsed
    the box falls back to 8-bit 4:2:0; if the SPS is shorter than 15 bytes the
    profile/tier/level bytes are written as zeros.

    Raises ValueError when fewer than three NAL units are present.
    """
    nal_units = split_nal_units(codec_private_data)
    if len(nal_units) < 3:
        raise ValueError("HEVC CodecPrivateData must contain VPS, SPS and PPS NAL units")

    # Bucket the units by HEVC NAL type: (first byte >> 1) & 0x3F.
    grouped: dict[int, list[bytes]] = {}
    for unit in nal_units:
        grouped.setdefault((unit[0] >> 1) & 0x3F, []).append(unit)

    first_sps = grouped[33][0] if 33 in grouped else b""
    # profile_tier_level lives in the de-emulated RBSP right after the 2-byte
    # NAL header and the 1-byte vps_id / max_sub_layers / nesting field:
    # profile byte(1) + compat flags(4) + constraint flags(6) + level(1).
    rbsp = remove_emulation_prevention(first_sps)
    ptl = rbsp[3:15] if len(rbsp) >= 15 else bytes(12)

    try:
        chroma_idc, luma_depth_m8, chroma_depth_m8 = parse_hevc_sps_format(rbsp)
    except (IndexError, ValueError):
        chroma_idc, luma_depth_m8, chroma_depth_m8 = 1, 0, 0

    fixed_part = b"".join(
        (
            u8.pack(1),  # configurationVersion
            ptl[0:1],  # general_profile_space / tier / profile_idc
            ptl[1:5],  # general_profile_compatibility_flags
            ptl[5:11],  # general_constraint_indicator_flags
            ptl[11:12],  # general_level_idc
            u16.pack(0xF000),  # reserved(4) + min_spatial_segmentation_idc(12)
            u8.pack(0xFC),  # reserved(6) + parallelismType(2)
            u8.pack(0xFC | (chroma_idc & 0x03)),  # reserved(6) + chromaFormat(2)
            u8.pack(0xF8 | (luma_depth_m8 & 0x07)),  # reserved(5) + bitDepthLumaMinus8(3)
            u8.pack(0xF8 | (chroma_depth_m8 & 0x07)),  # reserved(5) + bitDepthChromaMinus8(3)
            u16.pack(0),  # avgFrameRate
            # constantFrameRate(2)+numTemporalLayers(3)+temporalIdNested(1)+lengthSizeMinusOne(2)
            u8.pack((nal_length_size - 1) & 0x03),
        )
    )

    # One array per parameter-set type that is actually present: VPS, SPS, PPS.
    present = [(kind, grouped[kind]) for kind in (32, 33, 34) if grouped.get(kind)]
    array_parts = [u8.pack(len(present))]
    for kind, members in present:
        array_parts.append(u8.pack(0x80 | kind))  # array_completeness(1)+reserved(1)+NAL type(6)
        array_parts.append(u16.pack(len(members)))
        for member in members:
            array_parts.append(u16.pack(len(member)))
            array_parts.append(member)

    return box(b"hvcC", fixed_part + b"".join(array_parts))
|
||||
|
||||
|
||||
def _mp4_descriptor(tag: int, payload: bytes) -> bytes:
    """Serialize one MPEG-4 descriptor: tag byte, expandable size, payload.

    The size is written 7 bits per byte, most significant group first, with
    the top bit set on every byte except the last. Payloads shorter than 128
    bytes therefore still get a single length byte.
    """
    size = len(payload)
    length = [size & 0x7F]
    size >>= 7
    while size:
        length.insert(0, 0x80 | (size & 0x7F))
        size >>= 7
    return bytes([tag]) + bytes(length) + payload


def build_esds(codec_private_data: bytes) -> bytes:
    """Build an esds box wrapping the AAC AudioSpecificConfig.

    codec_private_data is embedded verbatim as the DecoderSpecificInfo. The
    descriptor lengths use the expandable size encoding, so a config of 128
    bytes or more is written correctly instead of producing a length byte that
    parsers read as a continuation marker (or overflowing a single byte).
    """
    asc = codec_private_data
    # DecoderSpecificInfo (tag 0x05)
    dsi = _mp4_descriptor(0x05, asc)
    # DecoderConfigDescriptor (tag 0x04): objectType=0x40 (AAC), stream type audio
    dcd = _mp4_descriptor(
        0x04,
        u8.pack(0x40)  # object type indication = MPEG-4 AAC
        + u8.pack(0x15)  # stream type (audio) << 2 | upstream | reserved
        + b"\x00\x00\x00"  # buffer size
        + u32.pack(0)  # max bitrate
        + u32.pack(0)  # avg bitrate
        + dsi,
    )
    # SLConfigDescriptor (tag 0x06)
    sl = _mp4_descriptor(0x06, u8.pack(0x02))
    # ES_Descriptor (tag 0x03): ES_ID(2) + flags(1) + nested descriptors
    es = _mp4_descriptor(0x03, u16.pack(0) + u8.pack(0) + dcd + sl)
    return full_box(b"esds", 0, 0, es)
|
||||
|
||||
|
||||
def build_dec3(codec_private_data: bytes) -> Optional[bytes]:
    """Wrap the EC-3 specific payload from Smooth CodecPrivateData in a dec3 box.

    Smooth EC-3 CodecPrivateData ([MS-SSTR] AudioTag 65534) serializes a
    WAVEFORMATEXTENSIBLE -- either the whole structure or just its extension
    (samples-per-block + channel mask + DD+ SubFormat GUID) -- and the raw
    dec3 payload (ETSI TS 102 366 F.6) follows the GUID. The GUID is located
    by search rather than at a fixed offset so both layouts work.

    Returns None when the GUID is missing or nothing follows it; decoders can
    still sync from the EC-3 frames in mdat.
    """
    marker = codec_private_data.find(DOLBY_DIGITAL_PLUS_GUID)
    if marker == -1:
        return None
    dec3_payload = codec_private_data[marker + 16 :]
    if not dec3_payload:
        return None
    return box(b"dec3", dec3_payload)
|
||||
|
||||
|
||||
def synthesize_aac_codec_private_data(fourcc: str, sampling_rate: int, channels: int) -> bytes:
    """Generate the AAC AudioSpecificConfig when the manifest omits it.

    AACL -> 2-byte AAC-LC config; AACH -> 4-byte HE-AAC (SBR, AOT 5) config
    with the extension sampling frequency at twice the core rate.

    The sampling-frequency index is looked up in AAC_SAMPLING_FREQUENCY_INDEX;
    a rate missing from that table falls back to index 0x0. The channel count
    is converted to the 4-bit channelConfiguration: counts 1-6 map to
    themselves, 8 channels (7.1) is configuration 7, and anything else is
    masked to 4 bits so it cannot spill into the neighbouring frequency bits.
    """
    freq = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate, 0x0)
    # channelConfiguration is not the raw channel count: 7.1 (8 ch) is value 7.
    channel_config = 7 if channels == 8 else channels & 0x0F
    if fourcc == "AACH":
        ext_freq = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate * 2, 0x0)
        return bytes(
            (
                (0x05 << 3) | (freq >> 1),
                ((freq & 0x01) << 7) | (channel_config << 3) | (ext_freq >> 1),
                ((ext_freq & 0x01) << 7) | (0x02 << 2),  # core object type = AAC LC
                0x00,  # alignment bits
            )
        )
    return bytes(((0x02 << 3) | (freq >> 1), ((freq & 0x01) << 7) | (channel_config << 3)))
|
||||
|
||||
|
||||
def build_sinf(
    original_format: bytes,
    kid: bytes,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
) -> bytes:
    """Assemble the sinf protection box (frma + schm 'cenc' + schi/tenc) for CENC.

    original_format is the unencrypted sample-entry fourcc recorded in frma,
    and kid is written as the tenc default_KID (expected to be 16 bytes).
    iv_size becomes the tenc default_Per_Sample_IV_Size (8 or 16). If a
    non-empty constant_iv is supplied, the per-sample IV size is written as 0
    and the constant IV (length byte + value) is appended after the KID, per
    ISO/IEC 23001-7 (cbcs-style constant-IV form).
    """
    uses_constant_iv = bool(constant_iv)

    frma = box(b"frma", original_format)
    schm = full_box(b"schm", 0, 0, b"cenc" + u32.pack(0x00010000))

    tenc_fields = [
        u8.pack(0),  # reserved
        u8.pack(0),  # default_crypt_byte_block / skip_byte_block (cenc)
        u8.pack(1),  # default_isProtected
        u8.pack(0 if uses_constant_iv else iv_size),  # default_Per_Sample_IV_Size
        kid,  # default_KID (16 bytes)
    ]
    if uses_constant_iv:
        tenc_fields.append(u8.pack(len(constant_iv)))
        tenc_fields.append(constant_iv)

    tenc = full_box(b"tenc", 0, 0, b"".join(tenc_fields))
    return box(b"sinf", frma + schm + box(b"schi", tenc))
|
||||
|
||||
|
||||
def build_init_segment(
    *,
    stream_type: str,
    fourcc: str,
    codec_private_data: str,
    timescale: int = 10000000,
    duration: int = 0,
    language: str = "und",
    width: int = 0,
    height: int = 0,
    channels: int = 2,
    bits_per_sample: int = 16,
    sampling_rate: int = 48000,
    track_id: int = 1,
    nal_length_size: int = 4,
    kid: Optional[bytes] = None,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
) -> bytes:
    """
    Synthesize a single-track ftyp + moov initialization segment.

    stream_type: "video" | "audio" | "text".
    fourcc: Smooth FourCC, case-insensitive ("H264"/"AVC1"/"DAVC",
        "HVC1"/"HEV1"/"HEVC"/"H265", "DVHE"/"DVH1", "AACL"/"AACH"/"AAC",
        "EC-3", "TTML"/"STPP"/"DFXP").
    codec_private_data: hex string from the manifest QualityLevel (may be empty).
    timescale / duration: written to mvhd, tkhd, mdhd and mehd.
    language: three-letter a-z code; anything else is replaced by "und".
    nal_length_size: manifest NALUnitLengthField (bytes per NAL length prefix).
    kid: 16-byte default key id; when set, video/audio sample entries are
        wrapped as encv/enca with a sinf box. Text entries are never wrapped.
    iv_size / constant_iv: tenc IV form (see build_sinf).

    Raises ValueError for an unknown stream_type and NotImplementedError for a
    FourCC the chosen stream type does not support.
    """
    if stream_type not in ("video", "audio", "text"):
        raise ValueError(f"Unsupported stream type: {stream_type}")
    fourcc = (fourcc or "").upper()
    cpd = binascii.unhexlify(codec_private_data) if codec_private_data else b""
    encrypted = kid is not None
    is_audio = stream_type == "audio"

    # mdhd packs exactly three a-z letters; anything else (2-letter tags,
    # uppercase) would underflow the 5-bit fields, so fall back to "und".
    lang = (language or "").lower()
    if not (len(lang) == 3 and all(c in "abcdefghijklmnopqrstuvwxyz" for c in lang)):
        lang = "und"

    # Creation + modification timestamps, shared by mvhd / tkhd / mdhd.
    stamps = u64.pack(EPOCH) + u64.pack(EPOCH)

    # --- ftyp ---
    ftyp = box(b"ftyp", b"isml" + u32.pack(1) + b"iso5iso6piffmsdh")

    # --- mvhd ---
    mvhd_payload = (
        stamps + u32.pack(timescale) + u64.pack(duration)
        + s1616.pack(1) + s88.pack(1) + u16.pack(0) + u32.pack(0) * 2
        + UNITY_MATRIX + u32.pack(0) * 6 + u32.pack(0xFFFFFFFF)
    )
    mvhd = full_box(b"mvhd", 1, 0, mvhd_payload)

    # --- tkhd ---
    tkhd_payload = (
        stamps + u32.pack(track_id) + u32.pack(0)
        + u64.pack(duration) + u32.pack(0) * 2 + s16.pack(0) + s16.pack(0)
        + s88.pack(1 if is_audio else 0) + u16.pack(0) + UNITY_MATRIX
        + u1616.pack(width) + u1616.pack(height)
    )
    tkhd = full_box(b"tkhd", 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload)

    # --- mdhd + hdlr ---
    # Three 5-bit fields, each letter stored as its offset from 0x60.
    packed_lang = 0
    for letter in lang:
        packed_lang = (packed_lang << 5) | (ord(letter) - 0x60)
    mdhd = full_box(
        b"mdhd", 1, 0,
        stamps + u32.pack(timescale) + u64.pack(duration) + u16.pack(packed_lang) + u16.pack(0),
    )

    if is_audio:
        handler_type, handler_name = b"soun", b"SoundHandler\0"
        media_header = full_box(b"smhd", 0, 0, s88.pack(0) + u16.pack(0))
    elif stream_type == "text":
        handler_type, handler_name = b"subt", b"SubtitleHandler\0"
        media_header = full_box(b"sthd", 0, 0, b"")
    else:
        handler_type, handler_name = b"vide", b"VideoHandler\0"
        media_header = full_box(b"vmhd", 0, 1, u16.pack(0) + u16.pack(0) * 3)
    hdlr = full_box(b"hdlr", 0, 0, u32.pack(0) + handler_type + u32.pack(0) * 3 + handler_name)

    # --- dinf ---
    dinf = box(b"dinf", full_box(b"dref", 0, 0, u32.pack(1) + full_box(b"url ", 0, SELF_CONTAINED, b"")))

    # --- stsd sample entry ---
    entry_body = u8.pack(0) * 6 + u16.pack(1)  # reserved + data reference index
    if stream_type == "text":
        if fourcc not in ("TTML", "STPP", "DFXP"):
            raise NotImplementedError(f"Unsupported text FourCC: {fourcc}")
        # XMLSubtitleSampleEntry: namespace + schema_location + aux mime types.
        sample_entry_box = box(b"stpp", entry_body + TTML_NAMESPACE + b"\0" + b"\0")
    else:
        if stream_type == "video":
            protected_fourcc = b"encv"
            entry_body += (
                u16.pack(0) + u16.pack(0) + u32.pack(0) * 3
                + u16.pack(width) + u16.pack(height)
                + u1616.pack(0x48) + u1616.pack(0x48) + u32.pack(0) + u16.pack(1)
                + u8.pack(0) * 32 + u16.pack(0x18) + s16.pack(-1)
            )
            if fourcc in ("H264", "AVC1", "DAVC"):
                entry_body += build_avcc(cpd, nal_length_size)
                codec_fourcc = b"avc1"
            elif fourcc in ("HVC1", "HEV1", "HEVC", "H265", "DVHE", "DVH1"):
                # Dolby Vision over HEVC: same hvcC config, dvh1 sample entry.
                entry_body += build_hvcc(cpd, nal_length_size)
                codec_fourcc = b"dvh1" if fourcc in ("DVHE", "DVH1") else b"hvc1"
            else:
                raise NotImplementedError(f"Unsupported video FourCC: {fourcc}")
        else:
            protected_fourcc = b"enca"
            # samplerate is 16.16 fixed-point; rates above 65535 Hz are written as 0
            # (decoders read the real rate from the codec config), matching ffmpeg.
            rate_field = sampling_rate if sampling_rate <= 0xFFFF else 0
            entry_body += (
                u32.pack(0) * 2 + u16.pack(channels) + u16.pack(bits_per_sample)
                + u16.pack(0) + u16.pack(0) + u32.pack(rate_field << 16)
            )
            if fourcc in ("AACL", "AACH", "AAC"):
                asc = cpd or synthesize_aac_codec_private_data(fourcc, sampling_rate, channels)
                entry_body += build_esds(asc)
                codec_fourcc = b"mp4a"
            elif fourcc == "EC-3":
                dec3 = build_dec3(cpd)
                if dec3:
                    entry_body += dec3
                codec_fourcc = b"ec-3"
            else:
                raise NotImplementedError(f"Unsupported audio FourCC: {fourcc}")

        if encrypted:
            # CENC: generic protected entry name, real format recorded in sinf/frma.
            entry_body += build_sinf(codec_fourcc, kid, iv_size, constant_iv)
            sample_entry_box = box(protected_fourcc, entry_body)
        else:
            sample_entry_box = box(codec_fourcc, entry_body)

    stsd = full_box(b"stsd", 0, 0, u32.pack(1) + sample_entry_box)

    # --- empty sample tables (fragmented: real samples live in moof/traf) ---
    empty_tables = (
        full_box(b"stts", 0, 0, u32.pack(0))
        + full_box(b"stsc", 0, 0, u32.pack(0))
        + full_box(b"stsz", 0, 0, u32.pack(0) + u32.pack(0))
        + full_box(b"stco", 0, 0, u32.pack(0))
    )
    stbl = box(b"stbl", stsd + empty_tables)

    minf = box(b"minf", media_header + dinf + stbl)
    mdia = box(b"mdia", mdhd + hdlr + minf)
    trak = box(b"trak", tkhd + mdia)

    # --- mvex (mehd + trex) signals a fragmented file ---
    mehd = full_box(b"mehd", 1, 0, u64.pack(duration))
    # trex: track_ID, default sample description index 1, then zeroed
    # default duration / size / flags.
    trex = full_box(b"trex", 0, 0, u32.pack(track_id) + u32.pack(1) + u32.pack(0) * 3)
    mvex = box(b"mvex", mehd + trex)

    return ftyp + box(b"moov", mvhd + trak + mvex)
|
||||
Reference in New Issue
Block a user