fix(ism): rebuild moov init segment for Smooth Streaming decrypt

ISM (Smooth Streaming) tracks raw-concatenate moof+mdat fragments with no ftyp/moov, so shaka-packager/mp4decrypt fail with PARSER_FAILURE (exit 2) on decrypt. The init box was previously built by n_m3u8dl_re, removed in the downloader consolidation.

Add ism_init.py, a dependency-free byte-level MP4 init-segment synthesizer that rebuilds ftyp+moov from the manifest CodecPrivateData, ported from yt-dlp's write_piff_header and N_m3u8DL-RE's MSSMoovProcessor with full codec parity:

- AVC (H264/AVC1/DAVC), with SPS/PPS picked by NAL type rather than position and NALUnitLengthField honored
- HEVC (HVC1/HEV1) with chroma format and bit depths parsed from the de-emulated SPS via exp-Golomb so 10-bit/HDR signals correctly, and profile/tier/level lifted from the SPS PTL
- Dolby Vision (DVHE/DVH1) as hvcC with a dvh1 sample entry
- AAC (AACL/AACH) with the AudioSpecificConfig synthesized from SamplingRate/Channels when the manifest omits CodecPrivateData
- EC-3 with a real dec3 box extracted from the WAVEFORMATEXTENSIBLE CodecPrivateData (Dolby GUID located by search, not fixed offset)
- TTML subtitles as stpp/sthd/subt, wired for fragmented-TTML tracks

CENC wrapping (encv/enca + sinf/tenc with default_KID) covers encrypted tracks: the per-sample IV size is derived from the fragment senc/saiz (PIFF override flag, payload arithmetic, saiz fallback) instead of assuming 8, and the constant-IV tenc form is supported. Read the track_ID from the first fragment's tfhd so the moov matches and the muxer does not drop samples.

Wire ISM.download_track to prepend the synthesized init before merging; unsupported codecs soft-fail to raw concatenation with a warning. Harden against real-world inputs: 2-letter/uppercase manifest language tags normalize to ISO-639-2 (und fallback), >65535 Hz sample rates no longer overflow the 16.16 field, truncated tfhd returns None, struct.error joins the soft-fail handler, and the emulation-prevention scan no longer over-strips consecutive escapes.

Add regression tests (37) covering box structure, every supported FourCC, 10-bit SPS parsing, ASC synthesis, dec3 extraction, IV-size derivation and the crash fixes. Validated structurally per codec with ffmpeg-minted fragments: shaka-packager parses synth-init+fragments with exit 0 and ffprobe reports the expected codec, including a live run against a public Smooth Streaming server.
This commit is contained in:
imSp4rky
2026-06-11 13:41:58 -06:00
parent 466bf610cc
commit 39034f2bb5
3 changed files with 1139 additions and 2 deletions

410
tests/core/test_ism_init.py Normal file
View File

@@ -0,0 +1,410 @@
"""Regression tests for ISM init-segment synthesis (ftyp + moov).
Smooth Streaming fragments carry no moov; the init box must be rebuilt from the
manifest CodecPrivateData before shaka/mp4decrypt can parse the stream. These
guard the byte-level box structure so a future downloader refactor cannot
silently drop it again (the c323db9 regression).
"""
from __future__ import annotations
import struct
import pytest
from unshackle.core.manifests.ism_init import (NAL_START_CODE, PIFF_SENC_UUID, box, build_avcc, build_dec3,
build_hvcc, build_init_segment, full_box, parse_hevc_sps_format,
read_per_sample_iv_size, read_track_id, remove_emulation_prevention,
split_nal_units, synthesize_aac_codec_private_data)
# Real CodecPrivateData taken from a Smooth Streaming manifest.
VIDEO_HEVC_CPD = (
"0000000140010C01FFFF01600000030090000003000003009695980900000001420101016000000300900000"
"030000030096A001E020064165959A4930BC05A80808082000007D20000BB801000000014401C172B66240"
)
# H.264 SPS+PPS (start-code delimited) for the AVC path.
VIDEO_AVC_CPD = "00000001674d401e9a6602800b76020000003e90000bb800f18311200000000168ebccb22c"
# 10-bit (Main 10) HEVC VPS+SPS+PPS minted with x265; ffprobe reads the
# synthesized init as "Main 10 / yuv420p10le".
VIDEO_HEVC10_CPD = (
"0000000140010c01ffff02200000030090000003000003003c959809000000000142010102200000030090"
"000003000003003ca00a080b9f6d96566924caf0168080000003008000000c8400000000014401c172b4624000"
)
# AudioSpecificConfig (hex) for 48 kHz stereo AAC-LC; the synthesis test below
# expects the builder to reproduce exactly these two bytes.
AAC_LC_CPD = "1190"
# Real Smooth EC-3 CodecPrivateData: WAVEFORMATEXTENSIBLE extension (samples
# per block + channel mask + DD+ GUID) followed by the 5-byte dec3 payload.
EC3_CPD = "00063F000000AF87FBA7022DFB42A4D405CD93843BDD0600200F00"
# 16-byte key ID passed as ``kid=`` by the encrypted-track tests.
KID = bytes.fromhex("09fd2bd778bb544785ed2322dc6a7d87")
def top_level_boxes(data: bytes) -> list[tuple[str, int]]:
    """Return ``(type, size)`` for every top-level ISO-BMFF box in *data*.

    A 32-bit size of 1 means a 64-bit size follows the type; a size of 0 means
    the box runs to the end of the buffer.
    """
    found: list[tuple[str, int]] = []
    cursor = 0
    total = len(data)
    while cursor + 8 <= total:
        (declared,) = struct.unpack(">I", data[cursor : cursor + 4])
        name = data[cursor + 4 : cursor + 8].decode("latin1")
        if declared == 1:
            (declared,) = struct.unpack(">Q", data[cursor + 8 : cursor + 16])
        if declared == 0:
            declared = total - cursor
        found.append((name, declared))
        cursor += declared
    return found
def test_split_nal_units_drops_start_codes():
    """The HEVC CodecPrivateData splits into VPS, SPS and PPS, in that order."""
    units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    # HEVC NAL type = (first_byte >> 1) & 0x3F: VPS is 32, SPS is 33, PPS is 34.
    nal_types = [(unit[0] >> 1) & 0x3F for unit in units]
    assert nal_types == [32, 33, 34]
def test_hevc_init_structure():
    """A clear HEVC track yields exactly ftyp + moov with an hvc1/hvcC entry."""
    init = build_init_segment(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
    )
    layout = top_level_boxes(init)
    assert [name for name, _ in layout] == ["ftyp", "moov"]
    # The two boxes must account for every byte of the init segment.
    assert layout[0][1] + layout[1][1] == len(init)
    assert b"hvcC" in init
    assert b"hvc1" in init
    # Unencrypted: no protection scheme boxes.
    assert b"encv" not in init
    assert b"sinf" not in init
def test_avc_init_structure():
    """An H264 track yields an ftyp-led init with an avc1 entry and avcC."""
    init = build_init_segment(
        stream_type="video",
        fourcc="H264",
        codec_private_data=VIDEO_AVC_CPD,
        timescale=10000000,
        width=1280,
        height=720,
    )
    assert init[4:8] == b"ftyp"
    for marker in (b"avcC", b"avc1"):
        assert marker in init
def test_aac_audio_init_structure():
    """An AACL track yields an mp4a entry with esds under a sound media header."""
    init = build_init_segment(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data=AAC_LC_CPD,
        timescale=10000000,
        channels=2,
        sampling_rate=48000,
    )
    assert b"mp4a" in init
    assert b"esds" in init
    assert b"smhd" in init  # sound media header, not video
def test_encrypted_init_has_cenc_boxes():
    """Passing a KID wraps the video entry in encv with sinf/frma/tenc."""
    init = build_init_segment(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
        kid=KID,
    )
    # Encrypted sample entry is wrapped: encv -> sinf(frma+schm+schi(tenc)).
    for marker in (b"encv", b"sinf", b"frma", b"tenc", b"cenc"):
        assert marker in init
    # The 16-byte default_KID must be embedded verbatim for shaka to map the key.
    assert KID in init
    # Original codec preserved inside frma for the muxer.
    assert b"hvc1" in init
def test_unsupported_codec_raises():
    """An unknown FourCC (e.g. VC-1) raises so the caller can soft-fail to raw concat."""
    request = dict(
        stream_type="video",
        fourcc="WVC1",
        codec_private_data="00063F00",
        timescale=10000000,
    )
    with pytest.raises(NotImplementedError):
        build_init_segment(**request)
def test_ec3_init_embeds_dec3_from_codec_private_data():
    """The EC-3 entry carries a dec3 box lifted from the CodecPrivateData tail."""
    init = build_init_segment(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data=EC3_CPD,
        timescale=10000000,
        channels=6,
        sampling_rate=48000,
    )
    assert b"ec-3" in init
    # dec3 payload = CodecPrivateData past the 22-byte WAVEFORMATEXTENSIBLE header.
    expected_dec3 = box(b"dec3", bytes.fromhex(EC3_CPD)[22:])
    assert expected_dec3 in init
    assert b"esds" not in init  # no MPEG-4 descriptor inside an ec-3 entry
def test_ec3_encrypted_wraps_enca_with_frma():
    """An encrypted EC-3 entry becomes enca, with frma naming the original ec-3."""
    init = build_init_segment(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data=EC3_CPD,
        timescale=10000000,
        channels=6,
        kid=KID,
    )
    for marker in (b"enca", b"sinf", b"tenc"):
        assert marker in init
    assert box(b"frma", b"ec-3") in init
    assert KID in init
def test_ec3_dec3_found_in_full_waveformatextensible():
    """The dec3 payload is located by the DD+ GUID, not by a fixed offset."""
    # Some services ship the full WAVEFORMATEX header (18 bytes) before the
    # extension; the dec3 payload still follows the DD+ GUID.
    raw = bytes.fromhex(EC3_CPD)
    full = b"\xfe\xff" + b"\x00" * 16 + raw
    assert build_dec3(full) == box(b"dec3", raw[22:])
def test_ec3_without_dolby_guid_builds_bare_entry():
    """Without the Dolby GUID there is no dec3, yet the ec-3 entry is still built."""
    assert build_dec3(b"\x00\x06\x3f\x00") is None
    init = build_init_segment(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data="",
        timescale=10000000,
        channels=6,
    )
    assert b"ec-3" in init
    assert b"dec3" not in init
def test_aac_codec_private_data_synthesis_matches_real_manifest():
    """48 kHz stereo AAC-LC must produce 0x1190 — the exact ASC real manifests carry."""
    asc = synthesize_aac_codec_private_data("AACL", 48000, 2)
    assert asc.hex() == "1190"
def test_aach_synthesis_signals_sbr():
    """The synthesized HE-AAC config is 4 bytes, uses AOT 5 and a doubled extension rate."""
    asc = synthesize_aac_codec_private_data("AACH", 24000, 2)
    assert len(asc) == 4
    audio_object_type = asc[0] >> 3
    assert audio_object_type == 0x05  # AOT 5 = SBR (HE-AAC)
    # Extension sampling frequency = core * 2 = 48 kHz (index 3).
    extension_bits = ((asc[1] & 0x01) << 1) | (asc[2] >> 7)
    assert extension_bits == 0x03
def test_aac_init_without_codec_private_data_synthesizes_asc():
    """An empty CodecPrivateData still yields an esds carrying the synthesized ASC."""
    init = build_init_segment(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data="",
        timescale=10000000,
        channels=2,
        sampling_rate=48000,
    )
    assert b"mp4a" in init
    assert b"esds" in init
    assert bytes.fromhex(AAC_LC_CPD) in init
def test_dolby_vision_uses_dvh1_sample_entry():
    """A DVH1 track uses a dvh1 sample entry with hvcC, never hvc1."""
    init = build_init_segment(
        stream_type="video",
        fourcc="DVH1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
    )
    assert b"dvh1" in init
    assert b"hvcC" in init
    assert b"hvc1" not in init
def test_davc_maps_to_avc1():
    """The DAVC FourCC is built as a regular avc1 entry with avcC."""
    init = build_init_segment(
        stream_type="video",
        fourcc="DAVC",
        codec_private_data=VIDEO_AVC_CPD,
        timescale=10000000,
    )
    for marker in (b"avc1", b"avcC"):
        assert marker in init
def test_lowercase_fourcc_normalized():
    """Real manifests ship FourCC="hvc1" in lowercase; it must still build hvcC."""
    request = dict(
        stream_type="video",
        fourcc="hvc1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
    )
    assert b"hvcC" in build_init_segment(**request)
def test_avcc_selects_sps_pps_by_nal_type_not_position():
    """With PPS placed first, avcC still takes profile/compat/level from the SPS."""
    sps, pps = split_nal_units(bytes.fromhex(VIDEO_AVC_CPD))[:2]
    pps_first = NAL_START_CODE + pps + NAL_START_CODE + sps
    avcc = build_avcc(pps_first)
    # Bytes 9..11 of the box (after the 8-byte header and the version byte)
    # must still come from the SPS body.
    assert avcc[9:12] == sps[1:4]
def test_nal_length_field_respected():
    """A 2-byte NAL length prefix is written as lengthSizeMinusOne = 1."""
    avcc = build_avcc(bytes.fromhex(VIDEO_AVC_CPD), nal_length_size=2)
    # avcC payload byte 4 low 2 bits = lengthSizeMinusOne.
    length_size_minus_one = avcc[12] & 0x03
    assert length_size_minus_one == 1
def test_parse_hevc_sps_format_8bit():
    """The 8-bit sample SPS parses as 4:2:0 with zero bit-depth offsets."""
    nal_units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    sps_rbsp = remove_emulation_prevention(nal_units[1])
    assert parse_hevc_sps_format(sps_rbsp) == (1, 0, 0)  # 4:2:0, 8-bit
def test_hvcc_signals_10bit_from_sps():
    """A Main 10 SPS parses as 10-bit and the hvcC format bytes reflect it."""
    cpd = bytes.fromhex(VIDEO_HEVC10_CPD)
    sps = next(unit for unit in split_nal_units(cpd) if (unit[0] >> 1) & 0x3F == 33)
    assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 2, 2)  # 4:2:0, 10-bit
    record = build_hvcc(cpd)[8:]  # strip box header
    assert record[16] == 0xFC | 0x01  # chromaFormat 4:2:0
    assert record[17] == 0xF8 | 0x02  # bitDepthLumaMinus8 = 2
    assert record[18] == 0xF8 | 0x02  # bitDepthChromaMinus8 = 2
def test_ttml_init_structure():
    """A TTML text track yields stpp/sthd/subt plus the TTML namespace string."""
    init = build_init_segment(
        stream_type="text",
        fourcc="TTML",
        codec_private_data="",
        timescale=10000000,
        language="eng",
    )
    assert b"stpp" in init
    assert b"sthd" in init  # subtitle media header
    assert b"subt" in init
    assert b"SubtitleHandler\0" in init
    assert b"http://www.w3.org/ns/ttml\0" in init
def test_constant_iv_tenc_form():
    """With a constant IV, tenc has a zero per-sample IV size and carries the IV."""
    constant_iv = bytes(range(16))
    init = build_init_segment(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        kid=KID,
        constant_iv=constant_iv,
    )
    # Constant-IV form: default_Per_Sample_IV_Size = 0, then size + IV after the KID.
    expected_tail = KID + bytes([len(constant_iv)]) + constant_iv
    assert expected_tail in init
    # Skip the 4-byte type and 4-byte version/flags, then 3 bytes into the payload.
    iv_size_offset = init.index(b"tenc") + 4 + 4 + 3
    assert init[iv_size_offset] == 0  # default_Per_Sample_IV_Size
def make_fragment(senc: bytes = b"", saiz: bytes = b"") -> bytes:
    """Build a minimal moof(traf(tfhd [+ senc] [+ saiz])) + mdat fragment for track 1."""
    tfhd_payload = struct.pack(">I", 1) + b"\x00" * 4
    traf_children = full_box(b"tfhd", 0, 0, tfhd_payload) + senc + saiz
    moof = box(b"moof", box(b"traf", traf_children))
    mdat = box(b"mdat", b"\x00" * 4)
    return moof + mdat
def test_iv_size_from_piff_senc_override_flag():
    """A PIFF senc uuid with the override flag reports its explicit IV size."""
    # PIFF senc uuid with flags&1: AlgorithmID(3) + IV_size(1) + KID(16) override.
    version_and_flags = b"\x00\x00\x00\x01"
    override = b"\x00\x00\x01" + bytes([16]) + KID
    payload = version_and_flags + override + struct.pack(">I", 0)
    fragment = make_fragment(senc=box(b"uuid", PIFF_SENC_UUID + payload))
    assert read_per_sample_iv_size(fragment) == 16
def test_iv_size_from_senc_payload_length():
    """Without subsamples, the IV size is payload length divided by sample count."""
    # Standard senc, no subsamples: 3 samples x 8-byte IVs.
    payload = struct.pack(">I", 3) + b"\x11" * 24
    fragment = make_fragment(senc=full_box(b"senc", 0, 0, payload))
    assert read_per_sample_iv_size(fragment) == 8
def test_iv_size_from_senc_with_subsamples():
    """With subsamples, the IV size is the one whose entry walk ends on the box end."""
    # senc flags&2: per sample IV(8) + entry_count(2) + 6 bytes per entry.
    one_sample = b"\x22" * 8 + struct.pack(">H", 1) + b"\x00" * 6
    payload = struct.pack(">I", 2) + one_sample * 2
    fragment = make_fragment(senc=full_box(b"senc", 0, 2, payload))
    assert read_per_sample_iv_size(fragment) == 8
def test_iv_size_from_saiz_fallback():
    """With no senc, the saiz default_sample_info_size supplies the IV size."""
    payload = bytes([16]) + struct.pack(">I", 5)
    fragment = make_fragment(saiz=full_box(b"saiz", 0, 0, payload))
    assert read_per_sample_iv_size(fragment) == 16
def test_iv_size_undetermined_returns_none():
    """A fragment with neither senc nor saiz gives no IV size."""
    bare_fragment = make_fragment()
    assert read_per_sample_iv_size(bare_fragment) is None
def test_hvcc_embeds_vps_sps_pps():
    """Each original NAL unit (VPS/SPS/PPS) is embedded verbatim in hvcC."""
    cpd = bytes.fromhex(VIDEO_HEVC_CPD)
    hvcc = build_hvcc(cpd)
    for unit in split_nal_units(cpd):
        assert unit in hvcc
def test_avcc_requires_sps_and_pps():
    """CodecPrivateData holding only an SPS is rejected with ValueError."""
    sps_only = b"\x00\x00\x00\x01\x67only_sps"
    with pytest.raises(ValueError):
        build_avcc(sps_only)
def test_read_track_id_from_fragment():
    """track_ID is read from a minimal moof/traf/tfhd carrying the value 7."""
    tfhd = full_box("tfhd".encode(), 0, 0, struct.pack(">I", 7) + b"\x00" * 4)
    moof = box(b"moof", box(b"traf", tfhd))
    fragment = moof + box(b"mdat", b"\x00\x00")
    assert read_track_id(fragment) == 7
def test_read_track_id_missing_returns_none():
    """A buffer with no moof box yields no track_ID."""
    mdat_only = box(b"mdat", b"\x00\x00")
    assert read_track_id(mdat_only) is None
def test_remove_emulation_prevention():
    """Escapes are dropped; the byte following a consumed escape is kept as data."""
    cases = [
        # 00 00 03 XX -> the 0x03 emulation byte is dropped.
        (b"\x00\x00\x03\x01", b"\x00\x00\x01"),
        (b"\x00\x00\x03\x00\x00\x03\x96", b"\x00\x00\x00\x00\x96"),
        # The byte after a consumed escape is data, even another 0x03.
        (b"\x00\x00\x03\x03", b"\x00\x00\x03"),
        (b"\x00\x00\x03\x03\x00\x00\x03\x01", b"\x00\x00\x03\x00\x00\x01"),
    ]
    for escaped, expected in cases:
        assert remove_emulation_prevention(escaped) == expected
def test_two_letter_or_uppercase_language_falls_back_to_und():
    """Language tags that are not three a-z letters must not crash the builder."""
    # mdhd packs three a-z letters; "en"/"ENG" must not crash struct.pack.
    awkward_tags = ("en", "ENG", "", "e1x")
    for tag in awkward_tags:
        init = build_init_segment(
            stream_type="audio",
            fourcc="AACL",
            codec_private_data=AAC_LC_CPD,
            timescale=10000000,
            language=tag,
        )
        assert init[4:8] == b"ftyp"
def test_high_sampling_rate_does_not_overflow():
    """A 96 kHz rate exceeds the 16.16 integer field yet must still build mp4a."""
    # 96 kHz exceeds the 16.16 integer field; written as 0 like ffmpeg does.
    request = dict(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data="",
        timescale=10000000,
        sampling_rate=96000,
    )
    assert b"mp4a" in build_init_segment(**request)
def test_read_track_id_truncated_tfhd_returns_none():
    """A tfhd too short to hold a track_ID yields None rather than an error."""
    short_tfhd = full_box(b"tfhd", 0, 0, b"\x00\x00")  # too short for a track_ID
    traf = box(b"traf", short_tfhd)
    assert read_track_id(box(b"moof", traf)) is None
def test_hvcc_profile_tier_level_is_nonzero():
    """De-emulated PTL must yield real profile/level, not the off-by-one garbage."""
    record = build_hvcc(bytes.fromhex(VIDEO_HEVC_CPD))[8:]  # strip box header
    # Byte 1 holds profile_space/tier/profile_idc; byte 12 is general_level_idc.
    assert record[1] & 0x1F != 0
    assert record[12] != 0

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import base64 import base64
import hashlib import hashlib
import html import html
import struct
import urllib.parse import urllib.parse
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
@@ -18,6 +19,7 @@ from requests import Session
from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
from unshackle.core.drm import DRM_T, PlayReady, Widevine from unshackle.core.drm import DRM_T, PlayReady, Widevine
from unshackle.core.events import events from unshackle.core.events import events
from unshackle.core.manifests.ism_init import build_init_segment, read_per_sample_iv_size, read_track_id
from unshackle.core.session import RnetSession from unshackle.core.session import RnetSession
from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video
from unshackle.core.utilities import log_event, try_ensure_utf8 from unshackle.core.utilities import log_event, try_ensure_utf8
@@ -85,6 +87,104 @@ class ISM:
drm.append(PlayReady(pssh=pr_pssh, pssh_b64=data)) drm.append(PlayReady(pssh=pr_pssh, pssh_b64=data))
return drm return drm
@staticmethod
def _init_segment(
    track: AnyTrack, session_drm: Optional[DRM_T], first_segment: Optional[bytes] = None
) -> Optional[bytes]:
    """Synthesize the ftyp+moov init segment for an ISM track, or return None.

    Smooth fragments are moof+mdat only; the init box is rebuilt from the
    manifest CodecPrivateData (and KID, when encrypted) so the merged file is
    a valid MP4 that shaka/mp4decrypt can parse.

    Args:
        track: Track whose ``data["ism"]`` holds the ``stream_index``,
            ``quality_level`` and ``manifest`` entries read below.
        session_drm: DRM object for the session; the first entry of its
            ``kids`` attribute, when present, becomes the default KID.
        first_segment: Bytes of the first downloaded fragment, probed for
            the track_ID and the per-sample IV size.

    Returns:
        The init segment bytes, or None when the track has no ISM data, is a
        plain-text subtitle, or the init segment cannot be synthesized (a
        warning is logged and the caller falls back to raw concatenation).
    """
    ism = track.data.get("ism") if isinstance(getattr(track, "data", None), dict) else None
    if not ism:
        return None

    stream_index = ism.get("stream_index")
    quality_level = ism.get("quality_level")
    manifest = ism.get("manifest")
    if stream_index is None or quality_level is None:
        return None

    if isinstance(track, Subtitle) and track.codec != Subtitle.Codec.fTTML:
        return None  # plain-text subtitle formats concatenate fine

    # CodecPrivateData may legitimately be empty (AAC config is synthesized,
    # EC-3 decoders sync from the frames); the builder handles each case.
    cpd = quality_level.get("CodecPrivateData") or ""
    fourcc = quality_level.get("FourCC") or ""

    # mdhd needs a 3-letter ISO-639-2 code; manifests often carry 2-letter tags.
    lang_attr = (stream_index.get("Language") or "").strip()
    language = "und"
    if lang_attr and tag_is_valid(lang_attr):
        try:
            language = Language.get(lang_attr).to_alpha3()
        except LookupError:
            language = "und"

    kid: Optional[bytes] = None
    if session_drm is not None:
        kid_uuid = next(iter(getattr(session_drm, "kids", None) or []), None)
        if kid_uuid is not None:
            kid = bytes.fromhex(kid_uuid.hex)

    try:
        # Everything that parses manifest attributes or fragment bytes lives
        # inside the soft-fail handler: a non-numeric attribute or a malformed
        # first fragment must degrade to raw concatenation, not abort the
        # download.
        root_timescale = manifest.get("TimeScale") if manifest is not None else None
        timescale = int(stream_index.get("TimeScale") or root_timescale or 10000000)
        duration = int((manifest.get("Duration") if manifest is not None else 0) or 0)

        # Match the moov track_ID to the fragment's tfhd, else the muxer drops samples.
        track_id = (read_track_id(first_segment) if first_segment else None) or 1

        # NALUnitLengthField: bytes per NAL length prefix, default 4.
        nal_length_size = int(
            quality_level.get("NALUnitLengthField") or stream_index.get("NALUnitLengthField") or 4
        )

        # Per-sample IV size derived from the fragment senc/saiz (PIFF default 8).
        iv_size = (read_per_sample_iv_size(first_segment) if first_segment and kid else None) or 8

        if isinstance(track, Subtitle):
            return build_init_segment(
                stream_type="text",
                fourcc="TTML",
                codec_private_data="",
                timescale=timescale,
                duration=duration,
                language=language,
                track_id=track_id,
            )

        if isinstance(track, Video):
            return build_init_segment(
                stream_type="video",
                fourcc=fourcc,
                codec_private_data=cpd,
                timescale=timescale,
                duration=duration,
                language=language,
                width=int(quality_level.get("MaxWidth") or stream_index.get("MaxWidth") or 0),
                height=int(quality_level.get("MaxHeight") or stream_index.get("MaxHeight") or 0),
                track_id=track_id,
                nal_length_size=nal_length_size,
                kid=kid,
                iv_size=iv_size,
            )

        return build_init_segment(
            stream_type="audio",
            fourcc=fourcc,
            codec_private_data=cpd,
            timescale=timescale,
            duration=duration,
            language=language,
            channels=int(quality_level.get("Channels") or 2),
            bits_per_sample=int(quality_level.get("BitsPerSample") or 16),
            sampling_rate=int(quality_level.get("SamplingRate") or 48000),
            track_id=track_id,
            kid=kid,
            iv_size=iv_size,
        )
    except (NotImplementedError, ValueError, IndexError, struct.error) as e:
        # Unsupported codec, malformed CodecPrivateData/fragment or out-of-range
        # field — fall back to raw concatenation rather than aborting the download.
        log_event(
            "manifest_ism_init_unsupported",
            level="WARNING",
            message=f"Could not synthesize ISM init segment ({fourcc}): {e}",
            context={"track_id": getattr(track, "id", None), "fourcc": fourcc},
        )
        return None
def to_tracks(self, language: Optional[Union[str, Language]] = None) -> Tracks: def to_tracks(self, language: Optional[Union[str, Language]] = None) -> Tracks:
tracks = Tracks() tracks = Tracks()
base_url = self.url base_url = self.url
@@ -383,8 +483,13 @@ class ISM:
raise FileNotFoundError(error_msg) raise FileNotFoundError(error_msg)
with open(save_path, "wb") as f: with open(save_path, "wb") as f:
for segment_file in segments_to_merge: first_segment = segments_to_merge[0].read_bytes() if segments_to_merge else None
segment_data = segment_file.read_bytes() init_segment = ISM._init_segment(track, session_drm, first_segment)
if init_segment:
f.write(init_segment)
for index, segment_file in enumerate(segments_to_merge):
# First segment was already read for the init synthesis — reuse it.
segment_data = first_segment if index == 0 and first_segment else segment_file.read_bytes()
if ( if (
not session_drm not session_drm
and isinstance(track, Subtitle) and isinstance(track, Subtitle)

View File

@@ -0,0 +1,622 @@
"""
Synthesize an ISO-BMFF initialization segment (ftyp + moov) for ISM / Smooth
Streaming tracks.
Smooth Streaming fragments are bare ``moof`` + ``mdat`` pairs; the server never
sends a ``moov``. The init box must be reconstructed from the manifest's
``CodecPrivateData`` (and, for protected content, the track KID) before a muxer
or decryptor such as shaka-packager can parse the stream. Ported from yt-dlp's
``write_piff_header`` and N_m3u8DL-RE's ``MSSMoovProcessor`` with HEVC, Dolby
Vision, EC-3, TTML and CENC (PIFF) support.
"""
from __future__ import annotations
import binascii
import struct
from typing import Iterator, Optional
# Big-endian field packers (named for the bit widths they encode).
# Each one is a precompiled ``struct.Struct``; call sites use ``<name>.pack(value)``.
u8 = struct.Struct(">B")  # unsigned 8-bit
u16 = struct.Struct(">H")  # unsigned 16-bit
u32 = struct.Struct(">I")  # unsigned 32-bit
u64 = struct.Struct(">Q")  # unsigned 64-bit
s16 = struct.Struct(">h")  # signed 16-bit
s88 = struct.Struct(">bx") # 8.8 fixed-point: signed integer byte + one zero pad byte as the fraction
s1616 = struct.Struct(">hxx") # 16.16 fixed-point: signed 16-bit integer part + two zero pad bytes
u1616 = struct.Struct(">Hxx")  # 16.16 fixed-point with an unsigned integer part
s32 = struct.Struct(">i")  # signed 32-bit
# 3x3 transformation matrix (identity), as stored in tkhd/mvhd.
# ISO-BMFF serializes it as exactly nine big-endian 32-bit values
# {a, b, u, c, d, v, x, y, w} = {1.0, 0, 0, 0, 1.0, 0, 0, 0, 1.0}, where a/d
# are 16.16 fixed-point and w is 2.30 fixed-point. The field is 36 bytes; any
# extra word shifts every field that follows it in the box.
UNITY_MATRIX = struct.pack(
    ">9i",
    0x10000, 0, 0,
    0, 0x10000, 0,
    0, 0, 0x40000000,
)
# tkhd flag bits.
TRACK_ENABLED = 1 << 0
TRACK_IN_MOVIE = 1 << 1
TRACK_IN_PREVIEW = 1 << 2

# Flag value marking a data reference as living in the same file.
SELF_CONTAINED = 1

# Fixed creation/modification time — deterministic output (no wall clock).
EPOCH = 0

NAL_START_CODE = bytes((0, 0, 0, 1))

# WAVEFORMATEXTENSIBLE SubFormat GUID for Dolby Digital Plus, as serialized
# (little-endian) inside Smooth EC-3 CodecPrivateData.
DOLBY_DIGITAL_PLUS_GUID = binascii.unhexlify("AF87FBA7022DFB42A4D405CD93843BDD")

# PIFF SampleEncryptionBox usertype (the pre-CENC 'senc' carried as a uuid box).
PIFF_SENC_UUID = binascii.unhexlify("A2394F525A9B4F14A2446C427C648DF4")

TTML_NAMESPACE = b"http://www.w3.org/ns/ttml\0"

# ISO/IEC 14496-3 samplingFrequencyIndex table for AudioSpecificConfig:
# the index of each rate is its position in this descending list.
AAC_SAMPLING_FREQUENCY_INDEX = {
    rate: index
    for index, rate in enumerate(
        (96000, 88200, 64000, 48000, 44100, 32000, 24000, 22050, 16000, 12000, 11025, 8000, 7350)
    )
}
def box(box_type: bytes, payload: bytes) -> bytes:
    """Return *payload* framed as an ISO-BMFF box: 32-bit size, fourcc, payload.

    The size field counts the 8-byte header plus the payload.
    """
    total_size = len(payload) + 8
    header = u32.pack(total_size) + box_type
    return header + payload
def full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
    """Return *payload* framed as a FullBox (1-byte version + 3-byte flags first)."""
    version_byte = u8.pack(version)
    # Pack flags as 32 bits and keep the low three bytes.
    flag_bytes = u32.pack(flags)[1:]
    return box(box_type, version_byte + flag_bytes + payload)
def split_nal_units(codec_private_data: bytes) -> list[bytes]:
    """Return the NAL units of CodecPrivateData with the start codes removed.

    The data is cut at every 4-byte start code; empty pieces (such as the one
    before a leading start code) are discarded.
    """
    pieces = codec_private_data.split(NAL_START_CODE)
    return list(filter(None, pieces))
def remove_emulation_prevention(data: bytes) -> bytes:
    """Drop H.26x emulation-prevention bytes (the 0x03 closing any 00 00 03 run).

    Scanning resumes right after each removed 0x03, so the byte that follows
    an escape is always kept as data — even when it is itself 0x03. Re-testing
    that byte would over-strip consecutive escapes and shift every later bit
    position.
    """
    escape = b"\x00\x00\x03"
    kept = []
    cursor = 0
    while True:
        hit = data.find(escape, cursor)
        if hit < 0:
            # No further escape: the remainder is copied through untouched.
            kept.append(data[cursor:])
            break
        # Keep everything up to and including the two zero bytes, skip the 0x03.
        kept.append(data[cursor : hit + 2])
        cursor = hit + 3
    return b"".join(kept)
class BitReader:
    """Read bits MSB-first from a byte string, with unsigned exp-Golomb decoding.

    ``pos`` is the absolute bit offset of the next unread bit. Reading past
    the end of ``data`` raises IndexError.
    """

    def __init__(self, data: bytes) -> None:
        self.data = data
        self.pos = 0

    def read_bits(self, count: int) -> int:
        """Return the next *count* bits as an unsigned integer (0 when count is 0)."""
        value = 0
        for _ in range(count):
            byte_index, bit_index = divmod(self.pos, 8)
            bit = (self.data[byte_index] >> (7 - bit_index)) & 1
            value = (value << 1) | bit
            self.pos += 1
        return value

    def read_ue(self) -> int:
        """Decode one unsigned exp-Golomb value, ue(v).

        Raises:
            ValueError: if more than 32 leading zero bits are seen.
        """
        leading_zeros = 0
        while not self.read_bits(1):
            leading_zeros += 1
            if leading_zeros > 32:
                raise ValueError("Invalid exp-Golomb code")
        # A zero-length suffix reads as 0, so no special case is needed.
        suffix = self.read_bits(leading_zeros)
        return (1 << leading_zeros) - 1 + suffix
def parse_hevc_sps_format(sps_rbsp: bytes) -> tuple[int, int, int]:
    """
    Return (chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8)
    parsed from a de-emulated HEVC SPS RBSP that still carries its 2-byte NAL
    header.

    Every field ahead of the wanted ones is read and discarded so the reader
    stays bit-aligned with the syntax. Running out of data raises IndexError;
    an over-long exp-Golomb prefix raises ValueError.
    """
    reader = BitReader(sps_rbsp)
    skip = reader.read_bits

    skip(16)  # NAL unit header
    skip(4)  # sps_video_parameter_set_id
    sub_layer_count = reader.read_bits(3)  # sps_max_sub_layers_minus1
    skip(1)  # sps_temporal_id_nesting_flag
    skip(96)  # general profile_tier_level (12 bytes)

    # One (profile_present, level_present) flag pair per sub-layer.
    presence = [(reader.read_bits(1), reader.read_bits(1)) for _ in range(sub_layer_count)]
    if sub_layer_count:
        skip((8 - sub_layer_count) * 2)  # reserved_zero_2bits
    for has_profile, has_level in presence:
        if has_profile:
            skip(88)  # sub_layer profile_tier
        if has_level:
            skip(8)  # sub_layer_level_idc

    reader.read_ue()  # sps_seq_parameter_set_id
    chroma_format_idc = reader.read_ue()
    if chroma_format_idc == 3:
        skip(1)  # separate_colour_plane_flag
    reader.read_ue()  # pic_width_in_luma_samples
    reader.read_ue()  # pic_height_in_luma_samples
    if reader.read_bits(1):  # conformance_window_flag
        for _ in range(4):
            reader.read_ue()  # one conformance window offset

    luma_minus8 = reader.read_ue()
    chroma_minus8 = reader.read_ue()
    return chroma_format_idc, luma_minus8, chroma_minus8
def iter_boxes(data: bytes, start: int, end: int) -> Iterator[tuple[bytes, Optional[bytes], int, int]]:
    """Yield (type, uuid_usertype, payload_start, box_end) for each child box.

    Walks the boxes laid out in ``data[start:end]``. A 32-bit size of 1 means
    a 64-bit size follows the type; a size of 0 means the box runs to ``end``.
    For ``uuid`` boxes the 16-byte usertype is returned and ``payload_start``
    points past it; for every other box ``uuid_usertype`` is None.

    The walk is defensive against truncated or corrupt input: ``end`` is
    clamped to the buffer length, a box never extends past its parent, and
    iteration stops (instead of raising ``struct.error`` or spinning) on a
    header that cannot be read or a size smaller than the header itself.
    """
    end = min(end, len(data))
    offset = start
    while offset + 8 <= end:
        size = struct.unpack_from(">I", data, offset)[0]
        box_type = data[offset + 4 : offset + 8]
        header = 8
        if size == 1:
            if offset + 16 > end:  # truncated 64-bit size field
                return
            size = struct.unpack_from(">Q", data, offset + 8)[0]
            header = 16
        if size == 0:
            size = end - offset
        if size < header:  # corrupt box header; stop rather than loop forever
            return
        box_end = min(offset + size, end)  # a child cannot outrun its parent
        usertype = None
        if box_type == b"uuid" and offset + header + 16 <= box_end:
            usertype = data[offset + header : offset + header + 16]
            header += 16
        yield box_type, usertype, offset + header, box_end
        offset += size
def find_box(data: bytes, start: int, end: int, target: bytes) -> Optional[tuple[int, int]]:
    """Return (payload_start, end) of the first child box of type *target*, or None."""
    matches = (
        (body, box_end)
        for box_type, _usertype, body, box_end in iter_boxes(data, start, end)
        if box_type == target
    )
    return next(matches, None)
def read_track_id(fragment: bytes) -> Optional[int]:
    """Return the track_ID stored in a fragment's moof/traf/tfhd, or None.

    Smooth fragments declare their own track_ID; the synthesized moov must
    use the same value or the muxer cannot associate samples with the track.
    The track_ID precedes every optional tfhd field, so the tfhd flags never
    affect where it sits.
    """
    span: Optional[tuple[int, int]] = (0, len(fragment))
    # Descend moof -> traf -> tfhd, giving up at the first missing level.
    for box_type in (b"moof", b"traf", b"tfhd"):
        span = find_box(fragment, span[0], span[1], box_type)
        if not span:
            return None
    body = span[0]
    if body + 8 > len(fragment):  # truncated tfhd
        return None
    # tfhd payload: version(1) + flags(3) + track_ID(4)
    (track_id,) = struct.unpack(">I", fragment[body + 4 : body + 8])
    return track_id
def _subsample_walk_fits(fragment: bytes, pos: int, box_end: int, sample_count: int, iv_size: int) -> bool:
    """Return True when walking *sample_count* IV + subsample entries ends exactly on *box_end*."""
    cursor = pos
    for _ in range(sample_count):
        cursor += iv_size
        if cursor + 2 > box_end:
            return False
        entries = struct.unpack_from(">H", fragment, cursor)[0]
        cursor += 2 + 6 * entries  # entry_count(2) + 6 bytes per subsample entry
        if cursor > box_end:
            return False
    return cursor == box_end


def _iv_size_from_senc_entries(fragment: bytes, pos: int, box_end: int, has_subsamples: bool) -> Optional[int]:
    """Infer the IV size from the senc sample table starting at its sample_count field."""
    if pos + 4 > box_end:
        return None
    sample_count = struct.unpack_from(">I", fragment, pos)[0]
    pos += 4
    if not sample_count:
        return None
    if not has_subsamples:
        # Only IVs follow, so the payload must divide evenly into 8- or 16-byte IVs.
        size, rem = divmod(box_end - pos, sample_count)
        return size if rem == 0 and size in (8, 16) else None
    # Walk the entries with each candidate IV size; the one that lands exactly
    # on the box end is correct.
    for iv_size in (8, 16):
        if _subsample_walk_fits(fragment, pos, box_end, sample_count, iv_size):
            return iv_size
    return None


def read_per_sample_iv_size(fragment: bytes) -> Optional[int]:
    """
    Derive the per-sample IV size (8 or 16) from a fragment's sample-encryption
    metadata, for the synthesized tenc default_Per_Sample_IV_Size.

    Checks, in order: the PIFF 'senc' uuid override flag (explicit IV size),
    the senc payload length (sample_count vs IV/subsample entries), and the
    saiz default_sample_info_size (only unambiguous without subsamples).

    Returns None when the size cannot be determined. Every read is bounded by
    the box end and the fragment length, so truncated or corrupt fragments
    yield None (or fall through to the next check) instead of raising.
    """
    moof = find_box(fragment, 0, len(fragment), b"moof")
    if not moof:
        return None
    traf = find_box(fragment, *moof, b"traf")
    if not traf:
        return None

    senc: Optional[tuple[int, int]] = None
    saiz_default: Optional[int] = None
    for box_type, usertype, body, box_end in iter_boxes(fragment, *traf):
        box_end = min(box_end, len(fragment))  # never read past a truncated fragment
        if box_type == b"senc" or (box_type == b"uuid" and usertype == PIFF_SENC_UUID):
            senc = (body, box_end)
        elif box_type == b"saiz" and body + 4 <= box_end:
            flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
            pos = body + 4 + (8 if flags & 0x1 else 0)  # skip aux_info_type fields
            if pos < box_end:
                saiz_default = fragment[pos]

    senc_has_subsamples = False
    if senc and senc[0] + 4 <= senc[1]:
        body, box_end = senc
        flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
        senc_has_subsamples = bool(flags & 0x2)
        pos = body + 4
        if flags & 0x1:  # PIFF override: AlgorithmID(3) + IV_size(1) + KID(16)
            if pos + 4 <= box_end and fragment[pos + 3] in (8, 16):
                return fragment[pos + 3]
            # Unusable override value: skip the 20 override bytes so the
            # sample table below is read from the right offset.
            pos += 20
        size = _iv_size_from_senc_entries(fragment, pos, box_end, senc_has_subsamples)
        if size is not None:
            return size

    if not senc_has_subsamples and saiz_default in (8, 16):
        return saiz_default
    return None
def build_avcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """Build an avcC (AVC decoder config) box from SPS+PPS CodecPrivateData.

    Args:
        codec_private_data: Start-code-delimited NAL units from the manifest.
        nal_length_size: Bytes per NAL length prefix in the samples (1, 2 or 4).

    Returns:
        The complete avcC box, carrying the first SPS and the first PPS found.

    Raises:
        ValueError: if ``nal_length_size`` is not 1, 2 or 4, if the SPS or the
            PPS is missing, or if the SPS is too short to hold the
            profile/compat/level bytes.
    """
    # lengthSizeMinusOne is a 2-bit field whose only valid values are 0, 1 and 3;
    # anything else would silently corrupt the reserved bits or the field itself.
    if nal_length_size not in (1, 2, 4):
        raise ValueError(f"Unsupported AVC NAL length size: {nal_length_size}")

    nals = split_nal_units(codec_private_data)
    # Pick parameter sets by H.264 NAL type (low 5 bits): 7 = SPS, 8 = PPS.
    # Manifests do not guarantee SPS-first ordering.
    sps = next((n for n in nals if n[0] & 0x1F == 7), None)
    pps = next((n for n in nals if n[0] & 0x1F == 8), None)
    if not sps or not pps:
        raise ValueError("AVC CodecPrivateData must contain SPS and PPS NAL units")
    if len(sps) < 4:
        # A shorter SPS would yield a short profile field and shift every later byte.
        raise ValueError("AVC SPS is too short to carry profile/compat/level")

    payload = b"".join(
        (
            u8.pack(1),  # configuration version
            sps[1:4],  # profile / compat / level (from SPS NAL body)
            u8.pack(0xFC | (nal_length_size - 1)),  # reserved + length size minus one
            u8.pack(0xE0 | 1),  # reserved + number of SPS (1)
            u16.pack(len(sps)),
            sps,
            u8.pack(1),  # number of PPS
            u16.pack(len(pps)),
            pps,
        )
    )
    return box(b"avcC", payload)
def build_hvcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """
    Build an hvcC (HEVC decoder config) box from VPS+SPS+PPS CodecPrivateData.

    The 12 profile/tier/level bytes are copied from the de-emulated SPS; chroma
    format and bit depths come from parse_hevc_sps_format, with an 8-bit 4:2:0
    fallback when that parse raises IndexError or ValueError. Raises ValueError
    when fewer than three NAL units are present.
    """
    nal_units = split_nal_units(codec_private_data)
    if len(nal_units) < 3:
        raise ValueError("HEVC CodecPrivateData must contain VPS, SPS and PPS NAL units")

    # Bucket the units by HEVC NAL type, i.e. (first byte >> 1) & 0x3F.
    grouped: dict[int, list[bytes]] = {}
    for nal_unit in nal_units:
        grouped.setdefault((nal_unit[0] >> 1) & 0x3F, []).append(nal_unit)

    sps_units = grouped.get(33)
    sps_rbsp = remove_emulation_prevention(sps_units[0] if sps_units else b"")

    # profile_tier_level sits right after the 2-byte NAL header and the single
    # byte holding sps_video_parameter_set_id(4) + sps_max_sub_layers_minus1(3)
    # + sps_temporal_id_nesting_flag(1). Its 12 bytes are, in order:
    # profile byte(1) + compat flags(4) + constraint flags(6) + level(1),
    # which is exactly the order hvcC stores them in, so the slice is copied
    # through whole. A too-short SPS yields twelve zero bytes instead.
    if len(sps_rbsp) >= 15:
        profile_tier_level = bytes(sps_rbsp[3:15])
    else:
        profile_tier_level = b"\x00" * 12

    try:
        chroma_idc, luma_depth_m8, chroma_depth_m8 = parse_hevc_sps_format(sps_rbsp)
    except (IndexError, ValueError):
        chroma_idc, luma_depth_m8, chroma_depth_m8 = 1, 0, 0

    header = b"".join(
        (
            u8.pack(1),  # configurationVersion
            profile_tier_level,
            u16.pack(0xF000),  # reserved(4) + min_spatial_segmentation_idc(12)
            u8.pack(0xFC),  # reserved(6) + parallelismType(2)
            u8.pack(0xFC | (chroma_idc & 0x03)),  # reserved(6) + chromaFormat(2)
            u8.pack(0xF8 | (luma_depth_m8 & 0x07)),  # reserved(5) + bitDepthLumaMinus8(3)
            u8.pack(0xF8 | (chroma_depth_m8 & 0x07)),  # reserved(5) + bitDepthChromaMinus8(3)
            u16.pack(0),  # avgFrameRate
            # constantFrameRate(2)+numTemporalLayers(3)+temporalIdNested(1)+lengthSizeMinusOne(2)
            u8.pack((nal_length_size - 1) & 0x03),
        )
    )

    # One array per parameter-set type that is actually present.
    nal_arrays = []
    for nal_type in (32, 33, 34):  # VPS, SPS, PPS
        members = grouped.get(nal_type)
        if not members:
            continue
        # array_completeness(1)+reserved(1)+NAL type(6), then the unit count.
        array_header = u8.pack(0x80 | nal_type) + u16.pack(len(members))
        array_body = b"".join(u16.pack(len(member)) + member for member in members)
        nal_arrays.append(array_header + array_body)

    return box(b"hvcC", header + u8.pack(len(nal_arrays)) + b"".join(nal_arrays))
def _esds_descriptor(tag: int, payload: bytes) -> bytes:
    """Serialize one MPEG-4 descriptor: tag byte, expandable size, payload."""
    remaining = len(payload)
    # The size is big-endian base-128: 7 bits per byte, with 0x80 set on every
    # byte except the last. Sizes below 128 therefore stay a single byte.
    size_bytes = [remaining & 0x7F]
    remaining >>= 7
    while remaining:
        size_bytes.append(0x80 | (remaining & 0x7F))
        remaining >>= 7
    return bytes([tag, *reversed(size_bytes)]) + payload


def build_esds(codec_private_data: bytes) -> bytes:
    """Build an esds box wrapping the AAC AudioSpecificConfig.

    Args:
        codec_private_data: the AudioSpecificConfig bytes, embedded verbatim as
            the DecoderSpecificInfo payload.

    Returns:
        The serialized esds full box (version 0, flags 0) holding an
        ES_Descriptor with a DecoderConfigDescriptor and an SLConfigDescriptor.

    Descriptor sizes use the expandable multi-byte form, so a config of 128
    bytes or more is encoded correctly instead of producing a size byte with
    the continuation bit set (or overflowing a single byte).
    """
    # DecoderSpecificInfo (tag 0x05)
    decoder_specific_info = _esds_descriptor(0x05, codec_private_data)
    # DecoderConfigDescriptor (tag 0x04): objectType=0x40 (AAC), stream type audio
    decoder_config = _esds_descriptor(
        0x04,
        u8.pack(0x40)  # object type indication = MPEG-4 AAC
        + u8.pack(0x15)  # stream type (audio) << 2 | upstream | reserved
        + b"\x00\x00\x00"  # buffer size
        + u32.pack(0)  # max bitrate
        + u32.pack(0)  # avg bitrate
        + decoder_specific_info,
    )
    # SLConfigDescriptor (tag 0x06)
    sl_config = _esds_descriptor(0x06, u8.pack(0x02))
    # ES_Descriptor (tag 0x03): 2-byte and 1-byte zero fields, then the children
    es_descriptor = _esds_descriptor(0x03, u16.pack(0) + u8.pack(0) + decoder_config + sl_config)
    return full_box(b"esds", 0, 0, es_descriptor)
def build_dec3(codec_private_data: bytes) -> Optional[bytes]:
    """Build a dec3 (EC-3 specific) box from Smooth EC-3 CodecPrivateData.

    The data is searched for DOLBY_DIGITAL_PLUS_GUID; every byte following the
    16-byte GUID is taken as the dec3 payload. The GUID is located by search
    because the blob is sometimes a full WAVEFORMATEXTENSIBLE and sometimes
    only its extension, so its offset is not fixed.

    Returns None when the GUID is absent or nothing follows it.
    """
    guid_offset = codec_private_data.find(DOLBY_DIGITAL_PLUS_GUID)
    if guid_offset == -1:
        return None
    dec3_payload = codec_private_data[guid_offset + 16 :]
    if not dec3_payload:
        # The GUID was the last thing in the blob: there is no payload to wrap.
        return None
    return box(b"dec3", dec3_payload)
def synthesize_aac_codec_private_data(fourcc: str, sampling_rate: int, channels: int) -> bytes:
    """Generate the AAC AudioSpecificConfig when the manifest omits it.

    "AACH" yields a 4-byte config with object type 5, whose extension
    frequency index is looked up at twice sampling_rate and whose core object
    type is 2. Any other fourcc yields a 2-byte config with object type 2.
    Rates missing from AAC_SAMPLING_FREQUENCY_INDEX map to index 0.
    """
    core_index = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate, 0x0)
    core_index_low_bit = (core_index & 0x01) << 7

    if fourcc != "AACH":
        first_byte = (0x02 << 3) | (core_index >> 1)
        second_byte = core_index_low_bit | (channels << 3)
        return bytes((first_byte, second_byte))

    extension_index = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate * 2, 0x0)
    first_byte = (0x05 << 3) | (core_index >> 1)
    second_byte = core_index_low_bit | (channels << 3) | (extension_index >> 1)
    third_byte = ((extension_index & 0x01) << 7) | (0x02 << 2)  # core object type = AAC LC
    alignment_byte = 0x00
    return bytes((first_byte, second_byte, third_byte, alignment_byte))
def build_sinf(
    original_format: bytes,
    kid: bytes,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
) -> bytes:
    """Build a sinf protection box (frma + schm cenc + schi/tenc) for CENC.

    original_format goes into frma and kid into tenc as default_KID. With a
    non-empty constant_iv, tenc carries a per-sample IV size of 0 followed by
    the constant IV's length and bytes; otherwise iv_size is written as the
    per-sample IV size and nothing is appended.
    """
    use_constant_iv = bool(constant_iv)

    frma = box(b"frma", original_format)
    schm = full_box(b"schm", 0, 0, b"cenc" + u32.pack(0x00010000))

    tenc_fields = [
        u8.pack(0),  # reserved
        u8.pack(0),  # default_crypt_byte_block / skip_byte_block (cenc)
        u8.pack(1),  # default_isProtected
        u8.pack(0 if use_constant_iv else iv_size),  # default_Per_Sample_IV_Size
        kid,  # default_KID (16 bytes)
    ]
    if use_constant_iv:
        tenc_fields.append(u8.pack(len(constant_iv)))
        tenc_fields.append(constant_iv)

    tenc = full_box(b"tenc", 0, 0, b"".join(tenc_fields))
    schi = box(b"schi", tenc)
    return box(b"sinf", frma + schm + schi)
def build_init_segment(
    *,
    stream_type: str,
    fourcc: str,
    codec_private_data: str,
    timescale: int = 10000000,
    duration: int = 0,
    language: str = "und",
    width: int = 0,
    height: int = 0,
    channels: int = 2,
    bits_per_sample: int = 16,
    sampling_rate: int = 48000,
    track_id: int = 1,
    nal_length_size: int = 4,
    kid: Optional[bytes] = None,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
) -> bytes:
    """
    Assemble the ftyp + moov initialization segment for a single track.

    stream_type: "video" | "audio" | "text"; anything else raises ValueError.
    fourcc: manifest FourCC, compared after upper-casing. Video accepts
        H264/AVC1/DAVC, HVC1/HEV1/HEVC/H265 and DVHE/DVH1; audio accepts
        AACL/AACH/AAC and EC-3; text accepts TTML/STPP/DFXP. Any other value
        raises NotImplementedError.
    codec_private_data: hex string from the manifest QualityLevel (may be empty).
    timescale / duration: written to mvhd and mdhd; duration also to tkhd and mehd.
    language: three a-z letters for mdhd (lower-cased first); otherwise "und".
    width / height: written to tkhd and to the video sample entry.
    channels / bits_per_sample / sampling_rate: audio sample entry fields, also
        used to synthesize the AAC config when codec_private_data is empty.
    track_id: written to tkhd and trex.
    nal_length_size: manifest NALUnitLengthField (bytes per NAL length prefix).
    kid: default key id; when not None, video and audio sample entries are
        emitted as encv/enca with a sinf box appended.
    iv_size / constant_iv: tenc IV form (see build_sinf).
    """
    if stream_type not in ("video", "audio", "text"):
        raise ValueError(f"Unsupported stream type: {stream_type}")

    codec_tag = (fourcc or "").upper()
    config_bytes = binascii.unhexlify(codec_private_data) if codec_private_data else b""
    wrap_cenc = kid is not None

    # mdhd packs exactly three a-z letters into 5-bit fields; any tag that is
    # not three a-z letters after lower-casing is replaced with "und".
    lang_code = (language or "").lower()
    if not (len(lang_code) == 3 and all("a" <= ch <= "z" for ch in lang_code)):
        lang_code = "und"

    # Byte runs reused by several boxes below.
    zero16 = u16.pack(0)
    zero32 = u32.pack(0)
    timestamps = u64.pack(EPOCH) * 2

    def seal_entry(plain_type: bytes, protected_type: bytes, payload: bytes) -> bytes:
        """Emit the sample entry as-is, or as its CENC form with a sinf appended."""
        if not wrap_cenc:
            return box(plain_type, payload)
        return box(protected_type, payload + build_sinf(plain_type, kid, iv_size, constant_iv))

    # --- ftyp ---
    compatible_brands = b"".join((b"iso5", b"iso6", b"piff", b"msdh"))
    ftyp = box(b"ftyp", b"isml" + u32.pack(1) + compatible_brands)

    # --- mvhd ---
    mvhd_body = b"".join(
        (
            timestamps,
            u32.pack(timescale),
            u64.pack(duration),
            s1616.pack(1),
            s88.pack(1),
            zero16,
            zero32 * 2,
            UNITY_MATRIX,
            zero32 * 6,
            u32.pack(0xFFFFFFFF),
        )
    )
    mvhd = full_box(b"mvhd", 1, 0, mvhd_body)

    # --- tkhd ---
    tkhd_body = b"".join(
        (
            timestamps,
            u32.pack(track_id),
            zero32,
            u64.pack(duration),
            zero32 * 2,
            s16.pack(0),
            s16.pack(0),
            s88.pack(1 if stream_type == "audio" else 0),
            zero16,
            UNITY_MATRIX,
            u1616.pack(width),
            u1616.pack(height),
        )
    )
    tkhd = full_box(b"tkhd", 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_body)

    # --- mdhd + hdlr ---
    # Fold the three letters into one value, 5 bits per letter, 'a' -> 1.
    packed_lang = 0
    for ch in lang_code:
        packed_lang = (packed_lang << 5) | (ord(ch) - 0x60)
    mdhd_body = timestamps + u32.pack(timescale) + u64.pack(duration) + u16.pack(packed_lang) + zero16
    mdhd = full_box(b"mdhd", 1, 0, mdhd_body)

    handler_type, handler_name = {
        "audio": (b"soun", b"SoundHandler\0"),
        "text": (b"subt", b"SubtitleHandler\0"),
        "video": (b"vide", b"VideoHandler\0"),
    }[stream_type]
    hdlr = full_box(b"hdlr", 0, 0, zero32 + handler_type + zero32 * 3 + handler_name)
    if stream_type == "audio":
        media_header = full_box(b"smhd", 0, 0, s88.pack(0) + zero16)
    elif stream_type == "text":
        media_header = full_box(b"sthd", 0, 0, b"")
    else:
        media_header = full_box(b"vmhd", 0, 1, zero16 * 4)

    # --- dinf ---
    url_entry = full_box(b"url ", 0, SELF_CONTAINED, b"")
    dinf = box(b"dinf", full_box(b"dref", 0, 0, u32.pack(1) + url_entry))

    # --- stsd sample entry ---
    entry_head = u8.pack(0) * 6 + u16.pack(1)  # reserved + data reference index
    if stream_type == "video":
        entry_body = entry_head + b"".join(
            (
                zero16,
                zero16,
                zero32 * 3,
                u16.pack(width),
                u16.pack(height),
                u1616.pack(0x48),
                u1616.pack(0x48),
                zero32,
                u16.pack(1),
                u8.pack(0) * 32,
                u16.pack(0x18),
                s16.pack(-1),
            )
        )
        if codec_tag in ("H264", "AVC1", "DAVC"):
            decoder_config = build_avcc(config_bytes, nal_length_size)
            plain_type = b"avc1"
        elif codec_tag in ("HVC1", "HEV1", "HEVC", "H265", "DVHE", "DVH1"):
            # Dolby Vision over HEVC: same hvcC config, dvh1 sample entry.
            decoder_config = build_hvcc(config_bytes, nal_length_size)
            plain_type = b"dvh1" if codec_tag in ("DVHE", "DVH1") else b"hvc1"
        else:
            raise NotImplementedError(f"Unsupported video FourCC: {codec_tag}")
        sample_entry_box = seal_entry(plain_type, b"encv", entry_body + decoder_config)
    elif stream_type == "audio":
        # samplerate is 16.16 fixed-point; rates above 65535 Hz are written as 0
        # (decoders read the real rate from the codec config), matching ffmpeg.
        entry_body = entry_head + b"".join(
            (
                zero32 * 2,
                u16.pack(channels),
                u16.pack(bits_per_sample),
                zero16,
                zero16,
                u32.pack((sampling_rate if sampling_rate <= 0xFFFF else 0) << 16),
            )
        )
        if codec_tag in ("AACL", "AACH", "AAC"):
            # Fall back to a synthesized config when the manifest supplied none.
            audio_config = config_bytes or synthesize_aac_codec_private_data(codec_tag, sampling_rate, channels)
            entry_body += build_esds(audio_config)
            plain_type = b"mp4a"
        elif codec_tag == "EC-3":
            dec3_box = build_dec3(config_bytes)
            if dec3_box:
                entry_body += dec3_box
            plain_type = b"ec-3"
        else:
            raise NotImplementedError(f"Unsupported audio FourCC: {codec_tag}")
        sample_entry_box = seal_entry(plain_type, b"enca", entry_body)
    else:  # text
        if codec_tag not in ("TTML", "STPP", "DFXP"):
            raise NotImplementedError(f"Unsupported text FourCC: {codec_tag}")
        # XMLSubtitleSampleEntry: namespace + schema_location + aux mime types.
        sample_entry_box = box(b"stpp", entry_head + TTML_NAMESPACE + b"\0\0")

    stsd = full_box(b"stsd", 0, 0, u32.pack(1) + sample_entry_box)

    # --- empty sample tables (fragmented: real samples live in moof/traf) ---
    empty_tables = (
        full_box(b"stts", 0, 0, zero32),
        full_box(b"stsc", 0, 0, zero32),
        full_box(b"stsz", 0, 0, zero32 * 2),
        full_box(b"stco", 0, 0, zero32),
    )
    stbl = box(b"stbl", stsd + b"".join(empty_tables))

    minf = box(b"minf", media_header + dinf + stbl)
    mdia = box(b"mdia", mdhd + hdlr + minf)
    trak = box(b"trak", tkhd + mdia)

    # --- mvex (mehd + trex) signals a fragmented file ---
    mehd = full_box(b"mehd", 1, 0, u64.pack(duration))
    trex = full_box(b"trex", 0, 0, u32.pack(track_id) + u32.pack(1) + zero32 * 3)
    mvex = box(b"mvex", mehd + trex)

    return ftyp + box(b"moov", mvhd + trak + mvex)