fix(ism): rebuild moov init segment for Smooth Streaming decrypt

ISM (Smooth Streaming) tracks are raw concatenations of moof+mdat fragments with no ftyp/moov, so shaka-packager/mp4decrypt fail with PARSER_FAILURE (exit 2) on decrypt. The init box was previously built by n_m3u8dl_re, which was removed in the downloader consolidation.

Add ism_init.py, a dependency-free byte-level MP4 init-segment synthesizer that rebuilds ftyp+moov from the manifest CodecPrivateData, ported from yt-dlp's write_piff_header and N_m3u8DL-RE's MSSMoovProcessor with full codec parity:

- AVC (H264/AVC1/DAVC), with SPS/PPS picked by NAL type rather than position and NALUnitLengthField honored
- HEVC (HVC1/HEV1) with chroma format and bit depths parsed from the de-emulated SPS via exp-Golomb so 10-bit/HDR signals correctly, and profile/tier/level lifted from the SPS PTL
- Dolby Vision (DVHE/DVH1) as hvcC with a dvh1 sample entry
- AAC (AACL/AACH) with the AudioSpecificConfig synthesized from SamplingRate/Channels when the manifest omits CodecPrivateData
- EC-3 with a real dec3 box extracted from the WAVEFORMATEXTENSIBLE CodecPrivateData (Dolby GUID located by search, not fixed offset)
- TTML subtitles as stpp/sthd/subt, wired for fragmented-TTML tracks

CENC wrapping (encv/enca + sinf/tenc with default_KID) covers encrypted tracks: the per-sample IV size is derived from the fragment senc/saiz (PIFF override flag, payload arithmetic, saiz fallback) instead of assuming 8, and the constant-IV tenc form is supported. Read the track_ID from the first fragment's tfhd so the moov matches and the muxer does not drop samples.

Wire ISM.download_track to prepend the synthesized init before merging; unsupported codecs soft-fail to raw concatenation with a warning. Harden against real-world inputs: 2-letter/uppercase manifest language tags normalize to ISO-639-2 (und fallback), >65535 Hz sample rates no longer overflow the 16.16 field, truncated tfhd returns None, struct.error joins the soft-fail handler, and the emulation-prevention scan no longer over-strips consecutive escapes.

Add regression tests (37) covering box structure, every supported FourCC, 10-bit SPS parsing, ASC synthesis, dec3 extraction, IV-size derivation and the crash fixes. Validated structurally per codec with ffmpeg-minted fragments: shaka-packager parses synth-init+fragments with exit 0 and ffprobe reports the expected codec, including a live run against a public Smooth Streaming server.
This commit is contained in:
imSp4rky
2026-06-11 13:41:58 -06:00
parent 466bf610cc
commit 39034f2bb5
3 changed files with 1139 additions and 2 deletions

410
tests/core/test_ism_init.py Normal file
View File

@@ -0,0 +1,410 @@
"""Regression tests for ISM init-segment synthesis (ftyp + moov).
Smooth Streaming fragments carry no moov; the init box must be rebuilt from the
manifest CodecPrivateData before shaka/mp4decrypt can parse the stream. These
guard the byte-level box structure so a future downloader refactor cannot
silently drop it again (the c323db9 regression).
"""
from __future__ import annotations
import struct
import pytest
from unshackle.core.manifests.ism_init import (NAL_START_CODE, PIFF_SENC_UUID, box, build_avcc, build_dec3,
build_hvcc, build_init_segment, full_box, parse_hevc_sps_format,
read_per_sample_iv_size, read_track_id, remove_emulation_prevention,
split_nal_units, synthesize_aac_codec_private_data)
# Real CodecPrivateData taken from a Smooth Streaming manifest.
# Hex-encoded, start-code delimited HEVC VPS + SPS + PPS (8-bit 4:2:0).
VIDEO_HEVC_CPD = (
    "0000000140010C01FFFF01600000030090000003000003009695980900000001420101016000000300900000"
    "030000030096A001E020064165959A4930BC05A80808082000007D20000BB801000000014401C172B66240"
)
# H.264 SPS+PPS (start-code delimited) for the AVC path.
VIDEO_AVC_CPD = "00000001674d401e9a6602800b76020000003e90000bb800f18311200000000168ebccb22c"
# 10-bit (Main 10) HEVC VPS+SPS+PPS minted with x265; ffprobe reads the
# synthesized init as "Main 10 / yuv420p10le".
VIDEO_HEVC10_CPD = (
    "0000000140010c01ffff02200000030090000003000003003c959809000000000142010102200000030090"
    "000003000003003ca00a080b9f6d96566924caf0168080000003008000000c8400000000014401c172b4624000"
)
# AudioSpecificConfig for 48 kHz stereo AAC-LC; the synthesis tests expect the
# generated config to equal this value.
AAC_LC_CPD = "1190"
# Real Smooth EC-3 CodecPrivateData: WAVEFORMATEXTENSIBLE extension (samples
# per block + channel mask + DD+ GUID) followed by the 5-byte dec3 payload.
EC3_CPD = "00063F000000AF87FBA7022DFB42A4D405CD93843BDD0600200F00"
# 16-byte key ID the encrypted-track tests expect to find verbatim in the init.
KID = bytes.fromhex("09fd2bd778bb544785ed2322dc6a7d87")
def top_level_boxes(data: bytes) -> list[tuple[str, int]]:
    """Walk the top-level ISO-BMFF boxes in *data* and return (type, size) pairs.

    A 32-bit size of 1 means the real size follows as a 64-bit largesize; a
    size of 0 means the box runs to the end of the buffer.
    """
    found: list[tuple[str, int]] = []
    total = len(data)
    cursor = 0
    while total - cursor >= 8:
        (size,) = struct.unpack(">I", data[cursor : cursor + 4])
        name = data[cursor + 4 : cursor + 8].decode("latin1")
        if size == 1:
            (size,) = struct.unpack(">Q", data[cursor + 8 : cursor + 16])
        if size == 0:
            size = total - cursor
        found.append((name, size))
        cursor += size
    return found
def test_split_nal_units_drops_start_codes():
    """Splitting the HEVC CodecPrivateData yields VPS, SPS and PPS in order."""
    units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    # HEVC NAL type = (first_byte >> 1) & 0x3F: VPS is 32, SPS is 33, PPS is 34.
    nal_types = []
    for unit in units:
        nal_types.append((unit[0] >> 1) & 0x3F)
    assert nal_types == [32, 33, 34]
def test_hevc_init_structure():
    """An unencrypted HEVC init is exactly ftyp + moov with an hvc1/hvcC entry."""
    params = dict(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
    )
    init = build_init_segment(**params)
    parsed = top_level_boxes(init)
    assert [name for name, _ in parsed] == ["ftyp", "moov"]
    # The two boxes must account for every byte of the segment.
    ftyp_size, moov_size = parsed[0][1], parsed[1][1]
    assert ftyp_size + moov_size == len(init)
    assert b"hvcC" in init
    assert b"hvc1" in init
    # Unencrypted: no protection scheme boxes.
    assert b"encv" not in init
    assert b"sinf" not in init
def test_avc_init_structure():
    """The H264 FourCC produces an init that starts with ftyp and carries avc1/avcC."""
    params = dict(
        stream_type="video",
        fourcc="H264",
        codec_private_data=VIDEO_AVC_CPD,
        timescale=10000000,
        width=1280,
        height=720,
    )
    init = build_init_segment(**params)
    assert init[4:8] == b"ftyp"
    assert b"avcC" in init
    assert b"avc1" in init
def test_aac_audio_init_structure():
    """An AACL track gets an mp4a entry with esds and a sound media header."""
    params = dict(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data=AAC_LC_CPD,
        timescale=10000000,
        channels=2,
        sampling_rate=48000,
    )
    init = build_init_segment(**params)
    assert b"mp4a" in init
    assert b"esds" in init
    assert b"smhd" in init  # sound media header, not video
def test_encrypted_init_has_cenc_boxes():
    """Passing a KID wraps the HEVC entry in encv with sinf/frma/tenc and the KID."""
    params = dict(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
        kid=KID,
    )
    init = build_init_segment(**params)
    # Encrypted sample entry is wrapped: encv -> sinf(frma+schm+schi(tenc)).
    assert b"encv" in init
    for marker in (b"sinf", b"frma", b"tenc"):
        assert marker in init
    assert b"cenc" in init
    # The 16-byte default_KID must be embedded verbatim for shaka to map the key.
    assert KID in init
    # Original codec preserved inside frma for the muxer.
    assert b"hvc1" in init
def test_unsupported_codec_raises():
    """An unknown FourCC (e.g. VC-1) raises so the caller can soft-fail to raw concat."""
    params = dict(
        stream_type="video",
        fourcc="WVC1",
        codec_private_data="00063F00",
        timescale=10000000,
    )
    with pytest.raises(NotImplementedError):
        build_init_segment(**params)
def test_ec3_init_embeds_dec3_from_codec_private_data():
    """The EC-3 entry carries a dec3 box built from the CodecPrivateData tail."""
    params = dict(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data=EC3_CPD,
        timescale=10000000,
        channels=6,
        sampling_rate=48000,
    )
    init = build_init_segment(**params)
    assert b"ec-3" in init
    # dec3 payload = CodecPrivateData past the 22-byte WAVEFORMATEXTENSIBLE header.
    dec3_payload = bytes.fromhex(EC3_CPD)[22:]
    assert box(b"dec3", dec3_payload) in init
    assert b"esds" not in init  # no MPEG-4 descriptor inside an ec-3 entry
def test_ec3_encrypted_wraps_enca_with_frma():
    """An encrypted EC-3 track is wrapped in enca and keeps ec-3 inside frma."""
    params = dict(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data=EC3_CPD,
        timescale=10000000,
        channels=6,
        kid=KID,
    )
    init = build_init_segment(**params)
    for marker in (b"enca", b"sinf", b"tenc"):
        assert marker in init
    original_format = box(b"frma", b"ec-3")
    assert original_format in init
    assert KID in init
def test_ec3_dec3_found_in_full_waveformatextensible():
    """The dec3 payload is located after the DD+ GUID even with a leading header."""
    # Some services ship the full WAVEFORMATEX header (18 bytes) before the
    # extension; the dec3 payload still follows the DD+ GUID.
    extension = bytes.fromhex(EC3_CPD)
    waveformatex_header = b"\xfe\xff" + b"\x00" * 16
    expected = box(b"dec3", extension[22:])
    assert build_dec3(waveformatex_header + extension) == expected
def test_ec3_without_dolby_guid_builds_bare_entry():
    """Without the DD+ GUID no dec3 is built, but the ec-3 entry is still emitted."""
    guidless = b"\x00\x06\x3f\x00"
    assert build_dec3(guidless) is None
    params = dict(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data="",
        timescale=10000000,
        channels=6,
    )
    init = build_init_segment(**params)
    assert b"ec-3" in init
    assert b"dec3" not in init
def test_aac_codec_private_data_synthesis_matches_real_manifest():
    """48 kHz stereo AAC-LC must synthesize to 0x1190 — the exact ASC real manifests carry."""
    synthesized = synthesize_aac_codec_private_data("AACL", 48000, 2)
    assert synthesized.hex() == "1190"
def test_aach_synthesis_signals_sbr():
    """AACH synthesis yields a 4-byte HE-AAC config with a doubled extension rate."""
    asc = synthesize_aac_codec_private_data("AACH", 24000, 2)
    assert len(asc) == 4
    assert asc[0] >> 3 == 0x05  # AOT 5 = SBR (HE-AAC)
    # extensionSamplingFrequencyIndex is a 4-bit field that straddles bytes 1
    # and 2: its top three bits are the low three bits of byte 1 and its last
    # bit is the MSB of byte 2. Masking byte 1 with 0x01 would only recover
    # two bits and silently pass for wrong indices >= 4.
    ext_freq_index = ((asc[1] & 0x07) << 1) | (asc[2] >> 7)
    # Extension sampling frequency = core * 2 = 48 kHz (index 3).
    assert ext_freq_index == 0x03
def test_aac_init_without_codec_private_data_synthesizes_asc():
    """An empty AAC CodecPrivateData still yields an esds with the synthesized config."""
    params = dict(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data="",
        timescale=10000000,
        channels=2,
        sampling_rate=48000,
    )
    init = build_init_segment(**params)
    assert b"mp4a" in init
    assert b"esds" in init
    expected_asc = bytes.fromhex(AAC_LC_CPD)
    assert expected_asc in init
def test_dolby_vision_uses_dvh1_sample_entry():
    """The DVH1 FourCC produces a dvh1 sample entry with hvcC and no hvc1."""
    params = dict(
        stream_type="video",
        fourcc="DVH1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
    )
    init = build_init_segment(**params)
    assert b"dvh1" in init
    assert b"hvcC" in init
    assert b"hvc1" not in init
def test_davc_maps_to_avc1():
    """The DAVC FourCC is built as an avc1 sample entry with avcC."""
    params = dict(
        stream_type="video",
        fourcc="DAVC",
        codec_private_data=VIDEO_AVC_CPD,
        timescale=10000000,
    )
    init = build_init_segment(**params)
    assert b"avc1" in init
    assert b"avcC" in init
def test_lowercase_fourcc_normalized():
    """A lowercase FourCC ("hvc1", as real manifests ship it) is still recognized."""
    params = dict(
        stream_type="video",
        fourcc="hvc1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
    )
    init = build_init_segment(**params)
    assert b"hvcC" in init
def test_avcc_selects_sps_pps_by_nal_type_not_position():
    """With PPS placed before SPS, avcC still takes its profile bytes from the SPS."""
    sps, pps = split_nal_units(bytes.fromhex(VIDEO_AVC_CPD))[:2]
    pps_first = b"".join((NAL_START_CODE, pps, NAL_START_CODE, sps))
    avcc = build_avcc(pps_first)
    # Profile/compat/level must still come from the SPS body.
    assert avcc[9:12] == sps[1:4]
def test_nal_length_field_respected():
    """A 2-byte NAL length prefix is written as lengthSizeMinusOne = 1."""
    avcc = build_avcc(bytes.fromhex(VIDEO_AVC_CPD), nal_length_size=2)
    # Box header is 8 bytes, so avcC payload byte 4 sits at index 12; its low
    # 2 bits hold lengthSizeMinusOne.
    length_size_minus_one = avcc[12] & 0x03
    assert length_size_minus_one == 1
def test_parse_hevc_sps_format_8bit():
    """The 8-bit fixture SPS parses as chroma 4:2:0 with zero bit-depth offsets."""
    nal_units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    sps_rbsp = remove_emulation_prevention(nal_units[1])
    assert parse_hevc_sps_format(sps_rbsp) == (1, 0, 0)  # 4:2:0, 8-bit
def test_hvcc_signals_10bit_from_sps():
    """The Main 10 fixture parses as 10-bit and hvcC reports the same depths."""
    raw = bytes.fromhex(VIDEO_HEVC10_CPD)
    sps = next(filter(lambda unit: (unit[0] >> 1) & 0x3F == 33, split_nal_units(raw)))
    assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 2, 2)  # 4:2:0, 10-bit
    payload = build_hvcc(raw)[8:]  # strip box header
    chroma_byte, luma_byte, chroma_depth_byte = payload[16], payload[17], payload[18]
    assert chroma_byte == 0xFC | 0x01  # chromaFormat 4:2:0
    assert luma_byte == 0xF8 | 0x02  # bitDepthLumaMinus8 = 2
    assert chroma_depth_byte == 0xF8 | 0x02  # bitDepthChromaMinus8 = 2
def test_ttml_init_structure():
    """A TTML text track gets stpp, sthd, a subt handler and the TTML namespace."""
    params = dict(
        stream_type="text",
        fourcc="TTML",
        codec_private_data="",
        timescale=10000000,
        language="eng",
    )
    init = build_init_segment(**params)
    assert b"stpp" in init
    assert b"sthd" in init  # subtitle media header
    assert b"subt" in init
    assert b"SubtitleHandler\0" in init
    assert b"http://www.w3.org/ns/ttml\0" in init
def test_constant_iv_tenc_form():
    """With a constant IV, tenc stores IV size 0 followed by the IV length and bytes."""
    constant_iv = bytes(range(16))
    params = dict(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        kid=KID,
        constant_iv=constant_iv,
    )
    init = build_init_segment(**params)
    # Constant-IV form: default_Per_Sample_IV_Size = 0, then size + IV after the KID.
    kid_then_iv = KID + bytes([len(constant_iv)]) + constant_iv
    assert kid_then_iv in init
    tenc_at = init.index(b"tenc")
    # Skip the 4-byte type, 4-byte version/flags and 3 bytes before the IV size.
    iv_size_offset = tenc_at + 4 + 4 + 3
    assert init[iv_size_offset] == 0  # default_Per_Sample_IV_Size
def make_fragment(senc: bytes = b"", saiz: bytes = b"") -> bytes:
    """Build a minimal moof(traf(tfhd [+ senc] [+ saiz])) + mdat fragment for tests.

    The tfhd carries track_ID 1; *senc* and *saiz* are appended verbatim after it.
    """
    track_header = full_box(b"tfhd", 0, 0, struct.pack(">I", 1) + b"\x00" * 4)
    movie_fragment = box(b"moof", box(b"traf", track_header + senc + saiz))
    media_data = box(b"mdat", b"\x00" * 4)
    return movie_fragment + media_data
def test_iv_size_from_piff_senc_override_flag():
    """A PIFF senc with the override flag reports its explicit IV size."""
    # PIFF senc uuid with flags&1: AlgorithmID(3) + IV_size(1) + KID(16) override.
    version_and_flags = b"\x00\x00\x00\x01"
    override = b"\x00\x00\x01" + bytes([16]) + KID
    sample_count = struct.pack(">I", 0)
    senc = box(b"uuid", PIFF_SENC_UUID + version_and_flags + override + sample_count)
    assert read_per_sample_iv_size(make_fragment(senc=senc)) == 16
def test_iv_size_from_senc_payload_length():
    """Without subsamples the IV size is the payload length divided by sample count."""
    # Standard senc, no subsamples: 3 samples x 8-byte IVs.
    sample_count = struct.pack(">I", 3)
    ivs = b"\x11" * 24
    senc = full_box(b"senc", 0, 0, sample_count + ivs)
    fragment = make_fragment(senc=senc)
    assert read_per_sample_iv_size(fragment) == 8
def test_iv_size_from_senc_with_subsamples():
    """With subsample entries the IV size is found by walking the sample records."""
    # senc flags&2: per sample IV(8) + entry_count(2) + 6 bytes per entry.
    iv = b"\x22" * 8
    one_subsample = struct.pack(">H", 1) + b"\x00" * 6
    record = iv + one_subsample
    senc = full_box(b"senc", 0, 2, struct.pack(">I", 2) + record * 2)
    fragment = make_fragment(senc=senc)
    assert read_per_sample_iv_size(fragment) == 8
def test_iv_size_from_saiz_fallback():
    """With no senc, the saiz default_sample_info_size supplies the IV size."""
    default_info_size = bytes([16])
    sample_count = struct.pack(">I", 5)
    saiz = full_box(b"saiz", 0, 0, default_info_size + sample_count)
    fragment = make_fragment(saiz=saiz)
    assert read_per_sample_iv_size(fragment) == 16
def test_iv_size_undetermined_returns_none():
    """A fragment with neither senc nor saiz gives no IV size."""
    bare_fragment = make_fragment()
    assert read_per_sample_iv_size(bare_fragment) is None
def test_hvcc_embeds_vps_sps_pps():
    """Every NAL unit from the CodecPrivateData appears verbatim inside hvcC."""
    raw = bytes.fromhex(VIDEO_HEVC_CPD)
    hvcc = build_hvcc(raw)
    # Each original NAL unit (VPS/SPS/PPS) is embedded verbatim in the arrays.
    for unit in split_nal_units(raw):
        assert unit in hvcc
def test_avcc_requires_sps_and_pps():
    """CodecPrivateData holding only an SPS is rejected with ValueError."""
    sps_only = b"\x00\x00\x00\x01\x67only_sps"
    with pytest.raises(ValueError):
        build_avcc(sps_only)
def test_read_track_id_from_fragment():
    """The track_ID is read from a minimal moof/traf/tfhd (here 7)."""
    tfhd_payload = struct.pack(">I", 7) + b"\x00" * 4
    tfhd = full_box(b"tfhd", 0, 0, tfhd_payload)
    moof = box(b"moof", box(b"traf", tfhd))
    mdat = box(b"mdat", b"\x00\x00")
    fragment = moof + mdat
    assert read_track_id(fragment) == 7
def test_read_track_id_missing_returns_none():
    """A buffer with no moof yields no track_ID."""
    mdat_only = box(b"mdat", b"\x00\x00")
    assert read_track_id(mdat_only) is None
def test_remove_emulation_prevention():
    """Emulation-prevention bytes are dropped without over-stripping later data."""
    cases = [
        # 00 00 03 XX -> the 0x03 emulation byte is dropped.
        (b"\x00\x00\x03\x01", b"\x00\x00\x01"),
        (b"\x00\x00\x03\x00\x00\x03\x96", b"\x00\x00\x00\x00\x96"),
        # The byte after a consumed escape is data, even another 0x03.
        (b"\x00\x00\x03\x03", b"\x00\x00\x03"),
        (b"\x00\x00\x03\x03\x00\x00\x03\x01", b"\x00\x00\x03\x00\x00\x01"),
    ]
    for escaped, expected in cases:
        assert remove_emulation_prevention(escaped) == expected
def test_two_letter_or_uppercase_language_falls_back_to_und():
    """Language tags that are not three a-z letters must not crash the builder."""
    # mdhd packs three a-z letters; "en"/"ENG" must not crash struct.pack.
    shared = dict(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data=AAC_LC_CPD,
        timescale=10000000,
    )
    awkward_tags = ("en", "ENG", "", "e1x")
    for tag in awkward_tags:
        init = build_init_segment(language=tag, **shared)
        assert init[4:8] == b"ftyp"
def test_high_sampling_rate_does_not_overflow():
    """A 96 kHz rate, beyond the 16.16 integer field, still builds an mp4a entry."""
    # 96 kHz exceeds the 16.16 integer field; written as 0 like ffmpeg does.
    params = dict(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data="",
        timescale=10000000,
        sampling_rate=96000,
    )
    init = build_init_segment(**params)
    assert b"mp4a" in init
def test_read_track_id_truncated_tfhd_returns_none():
    """A tfhd too short to hold a track_ID yields None."""
    short_tfhd = full_box(b"tfhd", 0, 0, b"\x00\x00")  # too short for a track_ID
    traf = box(b"traf", short_tfhd)
    fragment = box(b"moof", traf)
    assert read_track_id(fragment) is None
def test_hvcc_profile_tier_level_is_nonzero():
    """hvcC must carry a real profile and level lifted from the de-emulated SPS."""
    # De-emulated PTL must yield real profile/level, not the off-by-one garbage.
    payload = build_hvcc(bytes.fromhex(VIDEO_HEVC_CPD))[8:]  # strip box header
    profile_idc = payload[1] & 0x1F
    level_idc = payload[12]
    for field in (profile_idc, level_idc):
        assert field != 0

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import base64
import hashlib
import html
import struct
import urllib.parse
from functools import partial
from pathlib import Path
@@ -18,6 +19,7 @@ from requests import Session
from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
from unshackle.core.drm import DRM_T, PlayReady, Widevine
from unshackle.core.events import events
from unshackle.core.manifests.ism_init import build_init_segment, read_per_sample_iv_size, read_track_id
from unshackle.core.session import RnetSession
from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video
from unshackle.core.utilities import log_event, try_ensure_utf8
@@ -85,6 +87,104 @@ class ISM:
drm.append(PlayReady(pssh=pr_pssh, pssh_b64=data))
return drm
@staticmethod
def _init_segment(
    track: AnyTrack, session_drm: Optional[DRM_T], first_segment: Optional[bytes] = None
) -> Optional[bytes]:
    """
    Synthesize the ftyp+moov init segment for an ISM track, or return None.

    Smooth fragments are moof+mdat only; the init box is rebuilt from the
    manifest CodecPrivateData (and KID, when encrypted) so the merged file is
    a valid MP4 that shaka/mp4decrypt can parse.

    Parameters:
        track: The track being merged; its ``data["ism"]`` entry supplies the
            ``stream_index``, ``quality_level`` and ``manifest`` objects.
        session_drm: DRM object for the track, if any. The first entry of its
            ``kids`` attribute becomes the default KID.
        first_segment: Bytes of the first downloaded fragment, used to read the
            track_ID and the per-sample IV size.

    Returns:
        The init segment bytes, or None when the track has no ISM metadata, is
        a non-fTTML subtitle, or the init cannot be built (a warning is logged
        and the caller falls back to raw concatenation).
    """
    ism = track.data.get("ism") if isinstance(getattr(track, "data", None), dict) else None
    if not ism:
        return None

    stream_index = ism.get("stream_index")
    quality_level = ism.get("quality_level")
    manifest = ism.get("manifest")
    if stream_index is None or quality_level is None:
        return None

    if isinstance(track, Subtitle) and track.codec != Subtitle.Codec.fTTML:
        return None  # plain-text subtitle formats concatenate fine

    # CodecPrivateData may legitimately be empty (AAC config is synthesized,
    # EC-3 decoders sync from the frames); the builder handles each case.
    cpd = quality_level.get("CodecPrivateData") or ""
    fourcc = quality_level.get("FourCC") or ""

    # mdhd needs a 3-letter ISO-639-2 code; manifests often carry 2-letter tags.
    lang_attr = (stream_index.get("Language") or "").strip()
    language = "und"
    if lang_attr and tag_is_valid(lang_attr):
        try:
            language = Language.get(lang_attr).to_alpha3()
        except LookupError:
            language = "und"

    kid: Optional[bytes] = None
    if session_drm is not None:
        kid_uuid = next(iter(getattr(session_drm, "kids", None) or []), None)
        if kid_uuid is not None:
            kid = bytes.fromhex(kid_uuid.hex)

    # Everything that parses untrusted manifest attributes or fragment bytes
    # lives inside the soft-fail handler: a non-numeric attribute or a
    # malformed first fragment must degrade to raw concatenation, not abort
    # the download.
    try:
        root_timescale = manifest.get("TimeScale") if manifest is not None else None
        timescale = int(stream_index.get("TimeScale") or root_timescale or 10000000)
        duration = int((manifest.get("Duration") if manifest is not None else 0) or 0)
        # Match the moov track_ID to the fragment's tfhd, else the muxer drops samples.
        track_id = (read_track_id(first_segment) if first_segment else None) or 1
        # Per-sample IV size derived from the fragment senc/saiz (PIFF default 8).
        iv_size = (read_per_sample_iv_size(first_segment) if first_segment and kid else None) or 8

        common = {
            "timescale": timescale,
            "duration": duration,
            "language": language,
            "track_id": track_id,
        }

        if isinstance(track, Subtitle):
            return build_init_segment(stream_type="text", fourcc="TTML", codec_private_data="", **common)

        if isinstance(track, Video):
            # NALUnitLengthField: bytes per NAL length prefix, default 4.
            nal_length_size = int(
                quality_level.get("NALUnitLengthField") or stream_index.get("NALUnitLengthField") or 4
            )
            return build_init_segment(
                stream_type="video",
                fourcc=fourcc,
                codec_private_data=cpd,
                width=int(quality_level.get("MaxWidth") or stream_index.get("MaxWidth") or 0),
                height=int(quality_level.get("MaxHeight") or stream_index.get("MaxHeight") or 0),
                nal_length_size=nal_length_size,
                kid=kid,
                iv_size=iv_size,
                **common,
            )

        return build_init_segment(
            stream_type="audio",
            fourcc=fourcc,
            codec_private_data=cpd,
            channels=int(quality_level.get("Channels") or 2),
            bits_per_sample=int(quality_level.get("BitsPerSample") or 16),
            sampling_rate=int(quality_level.get("SamplingRate") or 48000),
            kid=kid,
            iv_size=iv_size,
            **common,
        )
    except (NotImplementedError, ValueError, IndexError, struct.error) as e:
        # Unsupported codec, malformed CodecPrivateData/fragment or out-of-range
        # field — fall back to raw concatenation rather than aborting the download.
        log_event(
            "manifest_ism_init_unsupported",
            level="WARNING",
            message=f"Could not synthesize ISM init segment ({fourcc}): {e}",
            context={"track_id": getattr(track, "id", None), "fourcc": fourcc},
        )
        return None
def to_tracks(self, language: Optional[Union[str, Language]] = None) -> Tracks:
tracks = Tracks()
base_url = self.url
@@ -383,8 +483,13 @@ class ISM:
raise FileNotFoundError(error_msg)
with open(save_path, "wb") as f:
for segment_file in segments_to_merge:
segment_data = segment_file.read_bytes()
first_segment = segments_to_merge[0].read_bytes() if segments_to_merge else None
init_segment = ISM._init_segment(track, session_drm, first_segment)
if init_segment:
f.write(init_segment)
for index, segment_file in enumerate(segments_to_merge):
# First segment was already read for the init synthesis — reuse it.
segment_data = first_segment if index == 0 and first_segment else segment_file.read_bytes()
if (
not session_drm
and isinstance(track, Subtitle)

View File

@@ -0,0 +1,622 @@
"""
Synthesize an ISO-BMFF initialization segment (ftyp + moov) for ISM / Smooth
Streaming tracks.
Smooth Streaming fragments are bare ``moof`` + ``mdat`` pairs; the server never
sends a ``moov``. The init box must be reconstructed from the manifest's
``CodecPrivateData`` (and, for protected content, the track KID) before a muxer
or decryptor such as shaka-packager can parse the stream. Ported from yt-dlp's
``write_piff_header`` and N_m3u8DL-RE's ``MSSMoovProcessor`` with HEVC, Dolby
Vision, EC-3, TTML and CENC (PIFF) support.
"""
from __future__ import annotations
import binascii
import struct
from typing import Iterator, Optional
# Big-endian field packers (named for the bit widths they encode).
# Precompiled once at import so box builders can call .pack() directly.
u8 = struct.Struct(">B")
u16 = struct.Struct(">H")
u32 = struct.Struct(">I")
u64 = struct.Struct(">Q")
s16 = struct.Struct(">h")
# The fixed-point packers take only the integer part; the trailing "x" pad
# bytes emit the fractional part as zero.
s88 = struct.Struct(">bx")  # 8.8 fixed-point
s1616 = struct.Struct(">hxx")  # 16.16 fixed-point
u1616 = struct.Struct(">Hxx")
s32 = struct.Struct(">i")
# 3x3 transformation matrix (identity), as stored in tkhd/mvhd. ISO-BMFF
# serializes it as exactly nine 32-bit values {a, b, u, c, d, v, x, y, w}:
# a and d are 1.0 in 16.16 fixed-point, w is 1.0 in 2.30 fixed-point. The
# field must be 36 bytes — extra entries shift every field written after it.
UNITY_MATRIX = struct.pack(
    ">9i",
    0x10000, 0, 0,
    0, 0x10000, 0,
    0, 0, 0x40000000,
)
# Single-bit flag values. NOTE(review): presumably the tkhd flags field bits —
# confirm against the moov builder, which is outside this view.
TRACK_ENABLED = 0x1
TRACK_IN_MOVIE = 0x2
TRACK_IN_PREVIEW = 0x4
# NOTE(review): presumably the data-reference "media is in this file" flag — confirm.
SELF_CONTAINED = 0x1
# Fixed creation/modification time — deterministic output (no wall clock).
EPOCH = 0
# Start code that delimits NAL units in CodecPrivateData.
NAL_START_CODE = b"\x00\x00\x00\x01"
# WAVEFORMATEXTENSIBLE SubFormat GUID for Dolby Digital Plus, as serialized
# (little-endian) inside Smooth EC-3 CodecPrivateData.
DOLBY_DIGITAL_PLUS_GUID = bytes.fromhex("AF87FBA7022DFB42A4D405CD93843BDD")
# PIFF SampleEncryptionBox usertype (the pre-CENC 'senc' carried as a uuid box).
PIFF_SENC_UUID = bytes.fromhex("A2394F525A9B4F14A2446C427C648DF4")
# NUL-terminated TTML namespace URI.
TTML_NAMESPACE = b"http://www.w3.org/ns/ttml\0"
# ISO/IEC 14496-3 samplingFrequencyIndex table for AudioSpecificConfig.
# Maps a sample rate in Hz to its 4-bit index.
AAC_SAMPLING_FREQUENCY_INDEX = {
    96000: 0x0,
    88200: 0x1,
    64000: 0x2,
    48000: 0x3,
    44100: 0x4,
    32000: 0x5,
    24000: 0x6,
    22050: 0x7,
    16000: 0x8,
    12000: 0x9,
    11025: 0xA,
    8000: 0xB,
    7350: 0xC,
}
def box(box_type: bytes, payload: bytes) -> bytes:
    """Serialize a plain ISO-BMFF box: 32-bit size, 4-byte type, then the payload."""
    total_size = 8 + len(payload)  # the size field counts its own 8-byte header
    header = u32.pack(total_size) + box_type
    return header + payload
def full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
    """Serialize a FullBox: a box whose payload starts with version(1) + flags(3)."""
    version_byte = u8.pack(version)
    flag_bytes = u32.pack(flags)[1:]  # keep only the low 24 bits
    return box(box_type, version_byte + flag_bytes + payload)
def split_nal_units(codec_private_data: bytes) -> list[bytes]:
    """Split CodecPrivateData into its NAL units (drops the start codes).

    Accepts both Annex B start-code forms: the 3-byte ``00 00 01`` and the
    4-byte ``00 00 00 01`` (plus any extra zero padding before it). Splitting
    happens on the 3-byte prefix; the zero bytes left at the end of each chunk
    are the remainder of a longer start code or padding and are stripped. This
    is safe because a NAL unit never contains ``00 00 01`` (emulation
    prevention) and never ends in a zero byte.

    Returns the non-empty NAL units in their original order.
    """
    short_start_code = b"\x00\x00\x01"
    units = []
    for chunk in codec_private_data.split(short_start_code):
        unit = chunk.rstrip(b"\x00")
        if unit:
            units.append(unit)
    return units
def remove_emulation_prevention(data: bytes) -> bytes:
    """Strip H.26x emulation-prevention bytes (the 0x03 in any 00 00 03 run).

    Each escape is consumed whole, and the search resumes after it, so the
    byte following an escape is always kept as data — even when it is another
    0x03. Re-examining it would over-strip consecutive escapes and shift every
    later bit position.
    """
    escape = b"\x00\x00\x03"
    pieces = []
    cursor = 0
    while True:
        hit = data.find(escape, cursor)
        if hit == -1:
            pieces.append(data[cursor:])
            break
        # Keep everything up to and including the two zero bytes; drop the 0x03.
        pieces.append(data[cursor : hit + 2])
        cursor = hit + 3
    return b"".join(pieces)
class BitReader:
    """MSB-first bit cursor over a byte string, with unsigned exp-Golomb decoding.

    ``pos`` is the index of the next bit to read. Reading past the end of the
    data raises IndexError.
    """

    def __init__(self, data: bytes) -> None:
        self.data = data
        self.pos = 0

    def read_bits(self, count: int) -> int:
        """Read *count* bits and return them as an unsigned integer."""
        accumulated = 0
        remaining = count
        while remaining > 0:
            byte_index, bit_index = divmod(self.pos, 8)
            bit = (self.data[byte_index] >> (7 - bit_index)) & 1
            accumulated = (accumulated << 1) | bit
            self.pos += 1
            remaining -= 1
        return accumulated

    def read_ue(self) -> int:
        """Decode one unsigned exp-Golomb value.

        Raises ValueError when more than 32 leading zero bits are seen.
        """
        leading_zeros = 0
        while not self.read_bits(1):
            leading_zeros += 1
            if leading_zeros > 32:
                raise ValueError("Invalid exp-Golomb code")
        suffix = self.read_bits(leading_zeros) if leading_zeros else 0
        return (1 << leading_zeros) - 1 + suffix
def parse_hevc_sps_format(sps_rbsp: bytes) -> tuple[int, int, int]:
    """
    Return (chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8)
    read from a de-emulated HEVC SPS RBSP that still includes its 2-byte NAL
    header. Every field ahead of the three wanted values is read and discarded
    so the bit cursor stays aligned.
    """
    bits = BitReader(sps_rbsp)
    bits.read_bits(16)  # NAL unit header
    bits.read_bits(4)  # sps_video_parameter_set_id
    sub_layer_count = bits.read_bits(3)  # sps_max_sub_layers_minus1
    bits.read_bits(1)  # sps_temporal_id_nesting_flag
    bits.read_bits(96)  # general profile_tier_level (12 bytes)

    # Per sub-layer presence flags: (profile_present, level_present).
    presence = [(bits.read_bits(1), bits.read_bits(1)) for _ in range(sub_layer_count)]
    if sub_layer_count > 0:
        bits.read_bits((8 - sub_layer_count) * 2)  # reserved_zero_2bits
    for has_profile, has_level in presence:
        if has_profile:
            bits.read_bits(88)  # sub_layer profile_tier
        if has_level:
            bits.read_bits(8)  # sub_layer_level_idc

    bits.read_ue()  # sps_seq_parameter_set_id
    chroma_format_idc = bits.read_ue()
    if chroma_format_idc == 3:
        bits.read_bits(1)  # separate_colour_plane_flag
    bits.read_ue()  # pic_width_in_luma_samples
    bits.read_ue()  # pic_height_in_luma_samples
    conformance_window = bits.read_bits(1)
    if conformance_window:
        for _ in range(4):  # four window offsets
            bits.read_ue()
    luma_minus8 = bits.read_ue()
    chroma_minus8 = bits.read_ue()
    return chroma_format_idc, luma_minus8, chroma_minus8
def iter_boxes(data: bytes, start: int, end: int) -> Iterator[tuple[bytes, Optional[bytes], int, int]]:
    """Yield (type, uuid_usertype, payload_start, box_end) for each child box.

    Walks the boxes laid out in ``data[start:end]``. A 32-bit size of 1 means a
    64-bit largesize follows the type; a size of 0 means the box runs to the
    end of the range. For ``uuid`` boxes the 16-byte usertype is returned and
    skipped, so ``payload_start`` points past it.

    Iteration stops quietly on a corrupt or truncated header, and ``box_end``
    is clamped to the enclosing range so callers never index past the data.
    """
    end = min(end, len(data))
    offset = start
    while offset + 8 <= end:
        size = struct.unpack_from(">I", data, offset)[0]
        box_type = data[offset + 4 : offset + 8]
        header = 8
        if size == 1:
            if offset + 16 > end:  # largesize field itself is truncated
                return
            size = struct.unpack_from(">Q", data, offset + 8)[0]
            header = 16
        elif size == 0:
            size = end - offset
        if size < header:  # corrupt box header; stop rather than loop forever
            return
        box_end = min(offset + size, end)
        usertype = None
        if box_type == b"uuid" and offset + header + 16 <= box_end:
            usertype = data[offset + header : offset + header + 16]
            header += 16
        yield box_type, usertype, offset + header, box_end
        offset += size
def find_box(data: bytes, start: int, end: int, target: bytes) -> Optional[tuple[int, int]]:
    """Return (payload_start, box_end) of the first child box of type *target*, else None."""
    matches = (
        (payload_start, box_end)
        for box_type, _, payload_start, box_end in iter_boxes(data, start, end)
        if box_type == target
    )
    return next(matches, None)
def read_track_id(fragment: bytes) -> Optional[int]:
    """Read the track_ID from a fragment's moof/traf/tfhd box, if present.

    Smooth fragments declare their own track_ID; the synthesized moov must use
    the same value or the muxer cannot associate samples with the track. The
    track_ID sits before any tfhd optional fields, so the flags don't matter.

    Returns None when the moof/traf/tfhd chain is missing, when a box header
    is malformed, or when the tfhd is too short to hold a track_ID.
    """
    span: Optional[tuple[int, int]] = (0, len(fragment))
    try:
        for box_type in (b"moof", b"traf", b"tfhd"):
            span = find_box(fragment, *span, box_type)
            if not span:
                return None
    except struct.error:  # truncated box header somewhere along the path
        return None
    body, tfhd_end = span
    # Bound the read by the tfhd box itself, not the whole fragment: a short
    # tfhd followed by more boxes (e.g. mdat) would otherwise yield bytes from
    # the next box as a bogus track_ID.
    if body + 8 > min(tfhd_end, len(fragment)):  # truncated tfhd
        return None
    # tfhd payload: version(1) + flags(3) + track_ID(4)
    return struct.unpack_from(">I", fragment, body + 4)[0]
def read_per_sample_iv_size(fragment: bytes) -> Optional[int]:
    """
    Derive the per-sample IV size (8 or 16) from a fragment's sample-encryption
    metadata, for the synthesized tenc default_Per_Sample_IV_Size.

    Checks, in order: the PIFF 'senc' uuid override flag (explicit IV size),
    the senc payload length (sample_count vs IV/subsample entries), and the
    saiz default_sample_info_size (only unambiguous without subsamples).

    Returns None when no size can be determined, including when the fragment
    is truncated or malformed — the caller then applies its own default.
    """
    try:
        return _derive_per_sample_iv_size(fragment)
    except (IndexError, struct.error):
        return None


def _derive_per_sample_iv_size(fragment: bytes) -> Optional[int]:
    """Scan moof/traf for senc/saiz and work out the IV size (see caller)."""
    moof = find_box(fragment, 0, len(fragment), b"moof")
    if not moof:
        return None
    traf = find_box(fragment, *moof, b"traf")
    if not traf:
        return None

    limit = len(fragment)
    senc: Optional[tuple[int, int]] = None
    saiz_default: Optional[int] = None
    for box_type, usertype, body, box_end in iter_boxes(fragment, *traf):
        # A box may declare a size larger than the data actually present.
        box_end = min(box_end, limit)
        if box_type == b"senc" or (box_type == b"uuid" and usertype == PIFF_SENC_UUID):
            senc = (body, box_end)
        elif box_type == b"saiz":
            flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
            pos = body + 4 + (8 if flags & 0x1 else 0)  # skip aux_info_type fields
            if pos < box_end:
                saiz_default = fragment[pos]

    senc_has_subsamples = False
    if senc:
        body, box_end = senc
        flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
        senc_has_subsamples = bool(flags & 0x2)
        pos = body + 4
        if flags & 0x1:  # PIFF override: AlgorithmID(3) + IV_size(1) + KID(16)
            override_end = pos + 20
            if override_end <= box_end:
                declared = fragment[pos + 3]
                if declared in (8, 16):
                    return declared
            # Invalid or truncated override: skip it and fall back to the
            # payload arithmetic (a truncated one overshoots box_end, which
            # disables the arithmetic below).
            pos = override_end
        if pos + 4 <= box_end:
            sample_count = struct.unpack_from(">I", fragment, pos)[0]
            pos += 4
            if sample_count:
                if not senc_has_subsamples:
                    size, rem = divmod(box_end - pos, sample_count)
                    if rem == 0 and size in (8, 16):
                        return size
                else:
                    # Walk the entries with each candidate IV size; the one that
                    # lands exactly on the box end is correct.
                    for iv_size in (8, 16):
                        cursor = pos
                        for _ in range(sample_count):
                            cursor += iv_size
                            if cursor + 2 > box_end:
                                cursor = -1
                                break
                            entries = struct.unpack_from(">H", fragment, cursor)[0]
                            cursor += 2 + 6 * entries
                            if cursor > box_end:
                                cursor = -1
                                break
                        if cursor == box_end:
                            return iv_size

    if not senc_has_subsamples and saiz_default in (8, 16):
        return saiz_default
    return None
def build_avcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """Build an avcC (AVC decoder config) box from SPS+PPS CodecPrivateData.

    Parameters:
        codec_private_data: Start-code delimited SPS and PPS NAL units.
        nal_length_size: Bytes per NAL length prefix in the samples (1, 2 or 4).

    Raises:
        ValueError: If ``nal_length_size`` is not 1, 2 or 4, if the SPS or PPS
            is missing, or if the SPS is too short to carry profile and level.
    """
    # lengthSizeMinusOne is a 2-bit field whose only legal values are 0, 1 and 3.
    if nal_length_size not in (1, 2, 4):
        raise ValueError(f"Unsupported NAL unit length size: {nal_length_size}")
    nals = split_nal_units(codec_private_data)
    # Pick parameter sets by H.264 NAL type (low 5 bits): 7 = SPS, 8 = PPS.
    # Manifests do not guarantee SPS-first ordering.
    sps = next((n for n in nals if n[0] & 0x1F == 7), None)
    pps = next((n for n in nals if n[0] & 0x1F == 8), None)
    if not sps or not pps:
        raise ValueError("AVC CodecPrivateData must contain SPS and PPS NAL units")
    # Bytes 1..3 of the SPS are copied into the config; a shorter SPS would
    # silently misalign every field that follows.
    if len(sps) < 4:
        raise ValueError("AVC SPS NAL unit is too short to carry profile and level")
    payload = b"".join(
        (
            u8.pack(1),  # configuration version
            sps[1:4],  # profile / compat / level (from SPS NAL body)
            u8.pack(0xFC | (nal_length_size - 1)),  # reserved + length size minus one
            u8.pack(0xE0 | 1),  # reserved + number of SPS (1)
            u16.pack(len(sps)),
            sps,
            u8.pack(1),  # number of PPS
            u16.pack(len(pps)),
            pps,
        )
    )
    return box(b"avcC", payload)
def build_hvcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """
    Build an hvcC (HEVC decoder config) box from VPS+SPS+PPS CodecPrivateData.

    The 12 profile/tier/level bytes are copied from the first SPS, and chroma
    format plus bit depths are parsed from it so 10-bit/HDR streams signal
    correctly. A missing or too-short SPS gives zeroed profile bytes; an SPS
    that cannot be parsed falls back to 8-bit 4:2:0.

    Raises ValueError when fewer than three NAL units are present.
    """
    nal_units = split_nal_units(codec_private_data)
    if len(nal_units) < 3:
        raise ValueError("HEVC CodecPrivateData must contain VPS, SPS and PPS NAL units")

    # Bucket NAL units by HEVC type: (first byte >> 1) & 0x3F.
    grouped: dict[int, list[bytes]] = {}
    for unit in nal_units:
        grouped.setdefault((unit[0] >> 1) & 0x3F, []).append(unit)

    first_sps = grouped[33][0] if 33 in grouped else b""
    # profile_tier_level is read from the de-emulated RBSP. It follows the
    # 2-byte NAL header and the 1 byte holding sps_video_parameter_set_id(4) +
    # sps_max_sub_layers_minus1(3) + sps_temporal_id_nesting_flag(1), and is 12
    # bytes: profile byte(1) + compat flags(4) + constraint flags(6) + level(1).
    rbsp = remove_emulation_prevention(first_sps)
    profile_tier_level = rbsp[3:15] if len(rbsp) >= 15 else bytes(12)

    try:
        chroma_idc, luma_minus8, chroma_minus8 = parse_hevc_sps_format(rbsp)
    except (IndexError, ValueError):
        chroma_idc, luma_minus8, chroma_minus8 = 1, 0, 0

    fixed_fields = b"".join(
        (
            u8.pack(1),  # configurationVersion
            profile_tier_level,  # profile space/tier/idc, compat, constraint, level
            u16.pack(0xF000),  # reserved(4) + min_spatial_segmentation_idc(12)
            u8.pack(0xFC),  # reserved(6) + parallelismType(2)
            u8.pack(0xFC | (chroma_idc & 0x03)),  # reserved(6) + chromaFormat(2)
            u8.pack(0xF8 | (luma_minus8 & 0x07)),  # reserved(5) + bitDepthLumaMinus8(3)
            u8.pack(0xF8 | (chroma_minus8 & 0x07)),  # reserved(5) + bitDepthChromaMinus8(3)
            u16.pack(0),  # avgFrameRate
            # constantFrameRate(2)+numTemporalLayers(3)+temporalIdNested(1)+lengthSizeMinusOne(2)
            u8.pack((nal_length_size - 1) & 0x03),
        )
    )

    array_blobs = []
    for nal_type in (32, 33, 34):  # VPS, SPS, PPS
        members = grouped.get(nal_type)
        if not members:
            continue
        # array_completeness(1)+reserved(1)+NAL type(6), then the unit count.
        blob = u8.pack(0x80 | nal_type) + u16.pack(len(members))
        blob += b"".join(u16.pack(len(member)) + member for member in members)
        array_blobs.append(blob)

    return box(b"hvcC", fixed_fields + u8.pack(len(array_blobs)) + b"".join(array_blobs))
def build_esds(codec_private_data: bytes) -> bytes:
    """Build an esds box wrapping the AAC AudioSpecificConfig.

    The box carries an ES_Descriptor (tag 0x03) that nests a
    DecoderConfigDescriptor (tag 0x04, which in turn holds the
    DecoderSpecificInfo, tag 0x05) followed by an SLConfigDescriptor
    (tag 0x06).

    Descriptor sizes are written with the MPEG-4 expandable-size encoding
    (7 bits per byte, high bit set on every byte but the last), so a config
    long enough to push any descriptor to 128 bytes or more is still framed
    correctly. For shorter configs the output is a single length byte per
    descriptor, as before.

    Args:
        codec_private_data: raw AudioSpecificConfig bytes.

    Returns:
        The serialized esds full box (version 0, flags 0).
    """

    def descriptor(tag: int, body: bytes) -> bytes:
        """Serialize one descriptor: tag byte, expandable size, body."""
        remaining = len(body)
        size_bytes = [remaining & 0x7F]  # last byte: continuation bit clear
        remaining >>= 7
        while remaining:
            # Earlier bytes carry the higher-order 7-bit groups with the
            # continuation bit set.
            size_bytes.insert(0, 0x80 | (remaining & 0x7F))
            remaining >>= 7
        return u8.pack(tag) + bytes(size_bytes) + body

    # DecoderSpecificInfo (tag 0x05)
    dsi = descriptor(0x05, codec_private_data)
    # DecoderConfigDescriptor (tag 0x04): objectType=0x40 (AAC), stream type audio
    dcd_box = descriptor(
        0x04,
        u8.pack(0x40)  # object type indication = MPEG-4 AAC
        + u8.pack(0x15)  # stream type (audio) << 2 | upstream | reserved
        + b"\x00\x00\x00"  # buffer size
        + u32.pack(0)  # max bitrate
        + u32.pack(0)  # avg bitrate
        + dsi,
    )
    # SLConfigDescriptor (tag 0x06)
    sl = descriptor(0x06, u8.pack(0x02))
    # ES_Descriptor (tag 0x03): 16-bit ES_ID = 0, one flags byte = 0, then children
    es = descriptor(0x03, u16.pack(0) + u8.pack(0) + dcd_box + sl)
    return full_box(b"esds", 0, 0, es)
def build_dec3(codec_private_data: bytes) -> Optional[bytes]:
    """Build a dec3 (EC-3 specific) box from Smooth EC-3 CodecPrivateData.

    Smooth EC-3 CodecPrivateData ([MS-SSTR] AudioTag 65534) serializes a
    WAVEFORMATEXTENSIBLE — sometimes the full structure, sometimes only its
    extension (samples-per-block + channel mask + DD+ SubFormat GUID) — with
    the raw dec3 payload (ETSI TS 102 366 F.6) after the GUID. The GUID is
    located by search rather than at a fixed offset, and everything after
    its 16 bytes becomes the box payload.

    Returns:
        The dec3 box, or None when the GUID is absent or nothing follows
        it — decoders still sync from EC-3 frames in mdat.
    """
    guid_offset = codec_private_data.find(DOLBY_DIGITAL_PLUS_GUID)
    if guid_offset == -1:
        return None
    dec3_payload = codec_private_data[guid_offset + 16 :]
    if not dec3_payload:
        # GUID present but no trailing bytes: there is no payload to wrap.
        return None
    return box(b"dec3", dec3_payload)
def synthesize_aac_codec_private_data(fourcc: str, sampling_rate: int, channels: int) -> bytes:
    """Generate the AAC AudioSpecificConfig when the manifest omits it.

    AACL -> 2-byte AAC-LC config; AACH -> 4-byte HE-AAC (SBR, AOT 5) config
    with the extension sampling frequency at twice the core rate.

    Args:
        fourcc: Smooth FourCC; only "AACH" selects the HE-AAC layout, any
            other value yields the AAC-LC layout.
        sampling_rate: core sampling rate in Hz, looked up in
            AAC_SAMPLING_FREQUENCY_INDEX (a rate missing from the table
            falls back to index 0).
        channels: channel count from the manifest.

    Returns:
        The synthesized AudioSpecificConfig bytes (2 or 4 bytes).
    """
    # channelConfiguration is a 4-bit code, not a raw channel count. Counts
    # 1..6 map to themselves, but 8 channels (7.1) is code 7 and 7 channels
    # (6.1) is code 11 in ISO/IEC 14496-3. Counts with no code fall back to 0
    # so the value can never spill into the neighbouring frequency-index bit
    # or overflow the byte.
    if 0 <= channels <= 6:
        channel_config = channels
    else:
        channel_config = {7: 11, 8: 7}.get(channels, 0)

    freq = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate, 0x0)
    if fourcc == "AACH":
        # Bit layout: objectType(5) | freqIndex(4) | channelConfig(4) |
        # extFreqIndex(4) | core objectType(5) | padding.
        ext_freq = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate * 2, 0x0)
        return bytes(
            (
                (0x05 << 3) | (freq >> 1),
                ((freq & 0x01) << 7) | (channel_config << 3) | (ext_freq >> 1),
                ((ext_freq & 0x01) << 7) | (0x02 << 2),  # core object type = AAC LC
                0x00,  # alignment bits
            )
        )
    # Bit layout: objectType(5) = 2 (AAC LC) | freqIndex(4) | channelConfig(4) | padding.
    return bytes(((0x02 << 3) | (freq >> 1), ((freq & 0x01) << 7) | (channel_config << 3)))
def build_sinf(
    original_format: bytes,
    kid: bytes,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
) -> bytes:
    """Build a sinf protection box (frma + schm cenc + schi/tenc) for CENC.

    Args:
        original_format: FourCC of the unencrypted sample entry, stored in frma.
        kid: default_KID written into tenc (expected to be 16 bytes; the
            length is not checked here).
        iv_size: tenc default_Per_Sample_IV_Size (8 or 16); ignored when a
            non-empty constant_iv is supplied.
        constant_iv: when given and non-empty, the per-sample IV size is
            written as 0 and the constant IV (length byte + bytes) is
            appended per ISO/IEC 23001-7 (cbcs-style constant-IV form).

    Returns:
        The serialized sinf box.
    """
    original_format_box = box(b"frma", original_format)
    # Scheme type 'cenc', scheme version 0x00010000.
    scheme_type_box = full_box(b"schm", 0, 0, b"cenc" + u32.pack(0x00010000))

    per_sample_iv_size = 0 if constant_iv else iv_size
    # reserved, default_crypt_byte_block / skip_byte_block (cenc),
    # default_isProtected, default_Per_Sample_IV_Size, default_KID
    tenc_body = u8.pack(0) + u8.pack(0) + u8.pack(1) + u8.pack(per_sample_iv_size) + kid
    if constant_iv:
        # default_constant_IV_size followed by the IV itself
        tenc_body = tenc_body + u8.pack(len(constant_iv)) + constant_iv

    track_encryption_box = full_box(b"tenc", 0, 0, tenc_body)
    scheme_info_box = box(b"schi", track_encryption_box)
    return box(b"sinf", original_format_box + scheme_type_box + scheme_info_box)
def build_init_segment(
    *,
    stream_type: str,
    fourcc: str,
    codec_private_data: str,
    timescale: int = 10000000,
    duration: int = 0,
    language: str = "und",
    width: int = 0,
    height: int = 0,
    channels: int = 2,
    bits_per_sample: int = 16,
    sampling_rate: int = 48000,
    track_id: int = 1,
    nal_length_size: int = 4,
    kid: Optional[bytes] = None,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
) -> bytes:
    """
    Assemble an ftyp + moov initialization segment for a single track.

    Args:
        stream_type: "video" | "audio" | "text".
        fourcc: Smooth FourCC, matched case-insensitively. Video:
            "H264"/"AVC1"/"DAVC", "HVC1"/"HEV1"/"HEVC"/"H265", "DVHE"/"DVH1".
            Audio: "AACL"/"AACH"/"AAC", "EC-3". Text: "TTML"/"STPP"/"DFXP".
        codec_private_data: hex string from the manifest QualityLevel; may
            be empty (AAC then gets a synthesized AudioSpecificConfig).
        timescale / duration: written to mvhd and mdhd; duration also goes
            to tkhd and mehd.
        language: three-letter a-z code for mdhd; anything else becomes "und".
        width / height: video dimensions (tkhd and the visual sample entry).
        channels / bits_per_sample / sampling_rate: audio sample entry fields.
        track_id: track_ID written to tkhd and trex.
        nal_length_size: manifest NALUnitLengthField (bytes per NAL length
            prefix), forwarded to the avcC/hvcC builders.
        kid: 16-byte default key id; when not None, video/audio sample
            entries are wrapped for CENC (encv/enca + sinf).
        iv_size / constant_iv: tenc IV form (see build_sinf).

    Returns:
        The serialized ftyp box immediately followed by the moov box.

    Raises:
        ValueError: stream_type is not one of the three supported values.
        NotImplementedError: fourcc is not supported for the stream type.
    """
    if stream_type not in ("video", "audio", "text"):
        raise ValueError(f"Unsupported stream type: {stream_type}")

    fourcc = fourcc.upper() if fourcc else ""
    cpd = b""
    if codec_private_data:
        cpd = binascii.unhexlify(codec_private_data)
    encrypted = kid is not None

    def finish_sample_entry(payload: bytes, plain_fourcc: bytes, protected_fourcc: bytes) -> bytes:
        """Box the sample entry, adding sinf and the protected FourCC when encrypted."""
        if encrypted:
            payload += build_sinf(plain_fourcc, kid, iv_size, constant_iv)
            return box(protected_fourcc, payload)
        return box(plain_fourcc, payload)

    # mdhd packs exactly three a-z letters; anything else (2-letter tags,
    # uppercase) would underflow the 5-bit fields, so fall back to "und".
    lang = language.lower() if language else ""
    if not (len(lang) == 3 and all("a" <= ch <= "z" for ch in lang)):
        lang = "und"

    # Frequently repeated constant fields.
    zero16 = u16.pack(0)
    zero32 = u32.pack(0)
    epoch = u64.pack(EPOCH)

    # --- ftyp: major brand, minor version, compatible brands ---
    ftyp = box(b"ftyp", b"".join((b"isml", u32.pack(1), b"iso5", b"iso6", b"piff", b"msdh")))

    # --- mvhd (version 1) ---
    mvhd_fields = (
        epoch,  # creation_time
        epoch,  # modification_time
        u32.pack(timescale),
        u64.pack(duration),
        s1616.pack(1),  # rate
        s88.pack(1),  # volume
        zero16,  # reserved
        zero32 * 2,  # reserved
        UNITY_MATRIX,
        zero32 * 6,  # pre_defined
        u32.pack(0xFFFFFFFF),  # next_track_ID
    )
    mvhd = full_box(b"mvhd", 1, 0, b"".join(mvhd_fields))

    # --- tkhd (version 1) ---
    tkhd_fields = (
        epoch,  # creation_time
        epoch,  # modification_time
        u32.pack(track_id),
        zero32,  # reserved
        u64.pack(duration),
        zero32 * 2,  # reserved
        s16.pack(0),  # layer
        s16.pack(0),  # alternate_group
        s88.pack(1 if stream_type == "audio" else 0),  # volume: only audio is audible
        zero16,  # reserved
        UNITY_MATRIX,
        u1616.pack(width),
        u1616.pack(height),
    )
    tkhd = full_box(b"tkhd", 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, b"".join(tkhd_fields))

    # --- mdhd + hdlr ---
    # Three 5-bit values, each the letter's offset from 0x60, first letter highest.
    packed_lang = 0
    for ch in lang:
        packed_lang = (packed_lang << 5) | (ord(ch) - 0x60)
    mdhd_fields = (
        epoch,  # creation_time
        epoch,  # modification_time
        u32.pack(timescale),
        u64.pack(duration),
        u16.pack(packed_lang),
        zero16,  # pre_defined
    )
    mdhd = full_box(b"mdhd", 1, 0, b"".join(mdhd_fields))

    if stream_type == "audio":
        handler_type, handler_name = b"soun", b"SoundHandler\0"
    elif stream_type == "text":
        handler_type, handler_name = b"subt", b"SubtitleHandler\0"
    else:
        handler_type, handler_name = b"vide", b"VideoHandler\0"
    hdlr = full_box(b"hdlr", 0, 0, zero32 + handler_type + zero32 * 3 + handler_name)

    if stream_type == "audio":
        media_header = full_box(b"smhd", 0, 0, s88.pack(0) + zero16)
    elif stream_type == "text":
        media_header = full_box(b"sthd", 0, 0, b"")
    else:
        media_header = full_box(b"vmhd", 0, 1, zero16 + zero16 * 3)

    # --- dinf: a single self-contained url entry ---
    url_entry = full_box(b"url ", 0, SELF_CONTAINED, b"")
    dinf = box(b"dinf", full_box(b"dref", 0, 0, u32.pack(1) + url_entry))

    # --- stsd sample entry ---
    sample_entry_payload = u8.pack(0) * 6 + u16.pack(1)  # reserved + data reference index
    if stream_type == "video":
        sample_entry_payload += b"".join(
            (
                zero16,  # pre_defined
                zero16,  # reserved
                zero32 * 3,  # pre_defined
                u16.pack(width),
                u16.pack(height),
                u1616.pack(0x48),  # horizresolution
                u1616.pack(0x48),  # vertresolution
                zero32,  # reserved
                u16.pack(1),  # frame_count
                u8.pack(0) * 32,  # compressorname
                u16.pack(0x18),  # depth
                s16.pack(-1),  # pre_defined
            )
        )
        if fourcc in ("H264", "AVC1", "DAVC"):
            codec_fourcc, build_config = b"avc1", build_avcc
        elif fourcc in ("HVC1", "HEV1", "HEVC", "H265"):
            codec_fourcc, build_config = b"hvc1", build_hvcc
        elif fourcc in ("DVHE", "DVH1"):
            # Dolby Vision over HEVC: same hvcC config, dvh1 sample entry.
            codec_fourcc, build_config = b"dvh1", build_hvcc
        else:
            raise NotImplementedError(f"Unsupported video FourCC: {fourcc}")
        sample_entry_payload += build_config(cpd, nal_length_size)
        sample_entry_box = finish_sample_entry(sample_entry_payload, codec_fourcc, b"encv")
    elif stream_type == "audio":
        # samplerate is 16.16 fixed-point; rates above 65535 Hz are written as 0
        # (decoders read the real rate from the codec config), matching ffmpeg.
        sample_entry_payload += b"".join(
            (
                zero32 * 2,  # reserved
                u16.pack(channels),
                u16.pack(bits_per_sample),
                zero16,  # pre_defined
                zero16,  # reserved
                u32.pack((sampling_rate if sampling_rate <= 0xFFFF else 0) << 16),
            )
        )
        if fourcc in ("AACL", "AACH", "AAC"):
            # No CodecPrivateData in the manifest: derive the config from
            # the declared rate and channel count instead.
            cpd = cpd or synthesize_aac_codec_private_data(fourcc, sampling_rate, channels)
            sample_entry_payload += build_esds(cpd)
            codec_fourcc = b"mp4a"
        elif fourcc == "EC-3":
            # dec3 is optional here: build_dec3 returns None when it cannot
            # extract a payload, and the entry is then written without it.
            sample_entry_payload += build_dec3(cpd) or b""
            codec_fourcc = b"ec-3"
        else:
            raise NotImplementedError(f"Unsupported audio FourCC: {fourcc}")
        sample_entry_box = finish_sample_entry(sample_entry_payload, codec_fourcc, b"enca")
    else:  # text
        if fourcc not in ("TTML", "STPP", "DFXP"):
            raise NotImplementedError(f"Unsupported text FourCC: {fourcc}")
        # XMLSubtitleSampleEntry strings: the TTML namespace followed by two
        # NUL bytes for the remaining (empty) string fields. Text entries are
        # never CENC-wrapped here, even when kid is set.
        sample_entry_payload += TTML_NAMESPACE + b"\0" + b"\0"
        sample_entry_box = box(b"stpp", sample_entry_payload)

    stsd = full_box(b"stsd", 0, 0, u32.pack(1) + sample_entry_box)

    # --- empty sample tables (fragmented: real samples live in moof/traf) ---
    empty_tables = (
        full_box(b"stts", 0, 0, zero32),
        full_box(b"stsc", 0, 0, zero32),
        full_box(b"stsz", 0, 0, zero32 + zero32),
        full_box(b"stco", 0, 0, zero32),
    )
    stbl = box(b"stbl", stsd + b"".join(empty_tables))

    minf = box(b"minf", media_header + dinf + stbl)
    mdia = box(b"mdia", mdhd + hdlr + minf)
    trak = box(b"trak", tkhd + mdia)

    # --- mvex (mehd + trex) signals a fragmented file ---
    mehd = full_box(b"mehd", 1, 0, u64.pack(duration))
    trex_fields = (
        u32.pack(track_id),
        u32.pack(1),  # default_sample_description_index
        zero32,  # default_sample_duration
        zero32,  # default_sample_size
        zero32,  # default_sample_flags
    )
    trex = full_box(b"trex", 0, 0, b"".join(trex_fields))
    mvex = box(b"mvex", mehd + trex)

    moov = box(b"moov", mvhd + trak + mvex)
    return ftyp + moov