fix(ism): rebuild moov init segment for Smooth Streaming decrypt

ISM (Smooth Streaming) tracks are raw concatenations of moof+mdat fragments with no ftyp/moov, so shaka-packager/mp4decrypt fail with PARSER_FAILURE (exit 2) on decrypt. The init box was previously built by n_m3u8dl_re, which was removed in the downloader consolidation.

Add ism_init.py, a dependency-free byte-level MP4 init-segment synthesizer that rebuilds ftyp+moov from the manifest CodecPrivateData, ported from yt-dlp's write_piff_header and N_m3u8DL-RE's MSSMoovProcessor with full codec parity:

- AVC (H264/AVC1/DAVC), with SPS/PPS picked by NAL type rather than position and NALUnitLengthField honored
- HEVC (HVC1/HEV1) with chroma format and bit depths parsed from the de-emulated SPS via exp-Golomb so 10-bit/HDR signals correctly, and profile/tier/level lifted from the SPS PTL
- Dolby Vision (DVHE/DVH1) as hvcC with a dvh1 sample entry
- AAC (AACL/AACH) with the AudioSpecificConfig synthesized from SamplingRate/Channels when the manifest omits CodecPrivateData
- EC-3 with a real dec3 box extracted from the WAVEFORMATEXTENSIBLE CodecPrivateData (Dolby GUID located by search, not fixed offset)
- TTML subtitles as stpp/sthd/subt, wired for fragmented-TTML tracks

CENC wrapping (encv/enca + sinf/tenc with default_KID) covers encrypted tracks: the per-sample IV size is derived from the fragment senc/saiz (PIFF override flag, payload arithmetic, saiz fallback) instead of assuming 8, and the constant-IV tenc form is supported. Read the track_ID from the first fragment's tfhd so the moov matches and the muxer does not drop samples.

Wire ISM.download_track to prepend the synthesized init before merging; unsupported codecs soft-fail to raw concatenation with a warning.

Harden against real-world inputs: 2-letter/uppercase manifest language tags normalize to ISO-639-2 (und fallback), >65535 Hz sample rates no longer overflow the 16.16 field, truncated tfhd returns None, struct.error joins the soft-fail handler, and the emulation-prevention scan no longer over-strips consecutive escapes.
Add regression tests (37) covering box structure, every supported FourCC, 10-bit SPS parsing, ASC synthesis, dec3 extraction, IV-size derivation and the crash fixes. Validated structurally per codec with ffmpeg-minted fragments: shaka-packager parses synth-init+fragments with exit 0 and ffprobe reports the expected codec, including a live run against a public Smooth Streaming server.
2026-06-22 17:07:23 +00:00 · 2026-06-11 13:41:58 -06:00
parent 466bf610cc
commit 39034f2bb5
3 changed files with 1139 additions and 2 deletions
--- a/tests/core/test_ism_init.py
+++ b/tests/core/test_ism_init.py
@@ -0,0 +1,410 @@
 """Regression tests for ISM init-segment synthesis (ftyp + moov).
 Smooth Streaming fragments carry no moov; the init box must be rebuilt from the
 manifest CodecPrivateData before shaka/mp4decrypt can parse the stream. These
 guard the byte-level box structure so a future downloader refactor cannot
 silently drop it again (the c323db9 regression).
 """
 from __future__ import annotations
 import struct
 import pytest
 from unshackle.core.manifests.ism_init import (NAL_START_CODE, PIFF_SENC_UUID, box, build_avcc, build_dec3,
                                               build_hvcc, build_init_segment, full_box, parse_hevc_sps_format,
                                               read_per_sample_iv_size, read_track_id, remove_emulation_prevention,
                                               split_nal_units, synthesize_aac_codec_private_data)
 # Real CodecPrivateData taken from a Smooth Streaming manifest.
 VIDEO_HEVC_CPD = (
    "0000000140010C01FFFF01600000030090000003000003009695980900000001420101016000000300900000"
    "030000030096A001E020064165959A4930BC05A80808082000007D20000BB801000000014401C172B66240"
 )
 # H.264 SPS+PPS (start-code delimited) for the AVC path.
 VIDEO_AVC_CPD = "00000001674d401e9a6602800b76020000003e90000bb800f18311200000000168ebccb22c"
 # 10-bit (Main 10) HEVC VPS+SPS+PPS minted with x265; ffprobe reads the
 # synthesized init as "Main 10 / yuv420p10le".
 VIDEO_HEVC10_CPD = (
    "0000000140010c01ffff02200000030090000003000003003c959809000000000142010102200000030090"
    "000003000003003ca00a080b9f6d96566924caf0168080000003008000000c8400000000014401c172b4624000"
 )
 AAC_LC_CPD = "1190"
 # Real Smooth EC-3 CodecPrivateData: WAVEFORMATEXTENSIBLE extension (samples
 # per block + channel mask + DD+ GUID) followed by the 5-byte dec3 payload.
 EC3_CPD = "00063F000000AF87FBA7022DFB42A4D405CD93843BDD0600200F00"
 KID = bytes.fromhex("09fd2bd778bb544785ed2322dc6a7d87")
 def top_level_boxes(data: bytes) -> list[tuple[str, int]]:
    """List the (type, size) of every top-level ISO-BMFF box in *data*.

    A 32-bit size of 1 means the real size follows as a 64-bit value; a size
    of 0 means the box runs to the end of the buffer.
    """
    total = len(data)
    found: list[tuple[str, int]] = []
    cursor = 0
    while cursor + 8 <= total:
        (length,) = struct.unpack(">I", data[cursor : cursor + 4])
        name = data[cursor + 4 : cursor + 8].decode("latin1")
        if length == 1:
            (length,) = struct.unpack(">Q", data[cursor + 8 : cursor + 16])
        if length == 0:
            length = total - cursor
        found.append((name, length))
        cursor += length
    return found
 def test_split_nal_units_drops_start_codes():
    """The HEVC parameter sets come back in order, without their start codes."""
    units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    # HEVC nal_unit_type = (first_byte >> 1) & 0x3F: VPS = 32, SPS = 33, PPS = 34.
    nal_types = [(unit[0] >> 1) & 0x3F for unit in units]
    assert nal_types == [32, 33, 34]
 def test_hevc_init_structure():
    """An unencrypted HEVC init is exactly ftyp + moov with an hvc1/hvcC entry."""
    segment = build_init_segment(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
    )
    layout = top_level_boxes(segment)
    assert [name for name, _ in layout] == ["ftyp", "moov"]
    # The two boxes must account for every byte of the segment.
    assert sum(size for _, size in layout) == len(segment)
    for marker in (b"hvcC", b"hvc1"):
        assert marker in segment
    # Unencrypted: no protection scheme boxes.
    for marker in (b"encv", b"sinf"):
        assert marker not in segment
 def test_avc_init_structure():
    """An H264 track yields an ftyp-led init carrying an avc1 entry with avcC."""
    segment = build_init_segment(
        stream_type="video",
        fourcc="H264",
        codec_private_data=VIDEO_AVC_CPD,
        timescale=10000000,
        width=1280,
        height=720,
    )
    assert segment[4:8] == b"ftyp"
    for marker in (b"avcC", b"avc1"):
        assert marker in segment
 def test_aac_audio_init_structure():
    """An AACL track yields an mp4a/esds entry under a sound media header."""
    segment = build_init_segment(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data=AAC_LC_CPD,
        timescale=10000000,
        channels=2,
        sampling_rate=48000,
    )
    for marker in (b"mp4a", b"esds"):
        assert marker in segment
    # Sound media header, not the video one.
    assert b"smhd" in segment
 def test_encrypted_init_has_cenc_boxes():
    """Passing a KID wraps the sample entry in the CENC protection boxes."""
    segment = build_init_segment(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
        kid=KID,
    )
    # Encrypted sample entry is wrapped: encv -> sinf(frma+schm+schi(tenc)),
    # with the "cenc" scheme type present.
    for marker in (b"encv", b"sinf", b"frma", b"tenc", b"cenc"):
        assert marker in segment
    # The 16-byte default_KID must be embedded verbatim for shaka to map the key.
    assert KID in segment
    # Original codec preserved inside frma for the muxer.
    assert b"hvc1" in segment
 def test_unsupported_codec_raises():
    """An unknown FourCC (e.g. VC-1) raises so the caller can soft-fail to raw concat."""
    request = {
        "stream_type": "video",
        "fourcc": "WVC1",
        "codec_private_data": "00063F00",
        "timescale": 10000000,
    }
    with pytest.raises(NotImplementedError):
        build_init_segment(**request)
 def test_ec3_init_embeds_dec3_from_codec_private_data():
    """The EC-3 entry carries a dec3 box lifted from the CodecPrivateData."""
    segment = build_init_segment(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data=EC3_CPD,
        timescale=10000000,
        channels=6,
        sampling_rate=48000,
    )
    assert b"ec-3" in segment
    # dec3 payload = CodecPrivateData past the 22-byte WAVEFORMATEXTENSIBLE header.
    expected_dec3 = box(b"dec3", bytes.fromhex(EC3_CPD)[22:])
    assert expected_dec3 in segment
    # No MPEG-4 descriptor inside an ec-3 entry.
    assert b"esds" not in segment
 def test_ec3_encrypted_wraps_enca_with_frma():
    """Encrypted EC-3 becomes enca with frma naming ec-3 and the KID embedded."""
    segment = build_init_segment(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data=EC3_CPD,
        timescale=10000000,
        channels=6,
        kid=KID,
    )
    for marker in (b"enca", b"sinf", b"tenc"):
        assert marker in segment
    original_format = box(b"frma", b"ec-3")
    assert original_format in segment
    assert KID in segment
 def test_ec3_dec3_found_in_full_waveformatextensible():
    """The dec3 payload is located after the DD+ GUID even behind a full header."""
    # Some services ship the full WAVEFORMATEX header (18 bytes) before the
    # extension; the dec3 payload still follows the DD+ GUID.
    extension = bytes.fromhex(EC3_CPD)
    waveformatex = b"\xfe\xff" + bytes(16)
    expected = box(b"dec3", extension[22:])
    assert build_dec3(waveformatex + extension) == expected
 def test_ec3_without_dolby_guid_builds_bare_entry():
    """Without the Dolby GUID there is no dec3, yet the ec-3 entry is still built."""
    guidless = b"\x00\x06\x3f\x00"
    assert build_dec3(guidless) is None
    segment = build_init_segment(
        stream_type="audio",
        fourcc="EC-3",
        codec_private_data="",
        timescale=10000000,
        channels=6,
    )
    assert b"ec-3" in segment
    assert b"dec3" not in segment
 def test_aac_codec_private_data_synthesis_matches_real_manifest():
    """48 kHz stereo AAC-LC synthesizes to 0x1190, the ASC real manifests carry."""
    synthesized = synthesize_aac_codec_private_data("AACL", 48000, 2)
    assert synthesized.hex() == "1190"
 def test_aach_synthesis_signals_sbr():
    """HE-AAC synthesis emits a 4-byte ASC with AOT 5 and the doubled SBR rate.

    AudioSpecificConfig bit layout: audioObjectType(5) + samplingFrequencyIndex(4)
    + channelConfiguration(4) + extensionSamplingFrequencyIndex(4) + ...
    """
    asc = synthesize_aac_codec_private_data("AACH", 24000, 2)
    assert len(asc) == 4
    assert asc[0] >> 3 == 0x05  # AOT 5 = SBR (HE-AAC)
    # Extension sampling frequency = core * 2 = 48 kHz (index 3). The 4-bit
    # index occupies the low 3 bits of byte 1 plus the top bit of byte 2, so all
    # four bits are compared; masking only one bit of byte 1 would also accept
    # indices 7, 11 and 15.
    extension_index = ((asc[1] & 0x07) << 1) | (asc[2] >> 7)
    assert extension_index == 0x03
 def test_aac_init_without_codec_private_data_synthesizes_asc():
    """An empty CodecPrivateData still yields mp4a/esds with the synthesized ASC."""
    segment = build_init_segment(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data="",
        timescale=10000000,
        channels=2,
        sampling_rate=48000,
    )
    for marker in (b"mp4a", b"esds"):
        assert marker in segment
    expected_asc = bytes.fromhex(AAC_LC_CPD)
    assert expected_asc in segment
 def test_dolby_vision_uses_dvh1_sample_entry():
    """DVH1 builds an hvcC configuration under a dvh1 (not hvc1) sample entry."""
    segment = build_init_segment(
        stream_type="video",
        fourcc="DVH1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        width=3840,
        height=1600,
    )
    for marker in (b"dvh1", b"hvcC"):
        assert marker in segment
    assert b"hvc1" not in segment
 def test_davc_maps_to_avc1():
    """The DAVC FourCC is built as a regular avc1 entry with avcC."""
    segment = build_init_segment(
        stream_type="video",
        fourcc="DAVC",
        codec_private_data=VIDEO_AVC_CPD,
        timescale=10000000,
    )
    for marker in (b"avc1", b"avcC"):
        assert marker in segment
 def test_lowercase_fourcc_normalized():
    """A lowercase FourCC, as real manifests ship it, is accepted."""
    manifest_fourcc = "hvc1"
    segment = build_init_segment(
        stream_type="video",
        fourcc=manifest_fourcc,
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
    )
    assert b"hvcC" in segment
 def test_avcc_selects_sps_pps_by_nal_type_not_position():
    """avcC takes profile/compat/level from the SPS even when the PPS comes first."""
    units = split_nal_units(bytes.fromhex(VIDEO_AVC_CPD))
    sps, pps = units[0], units[1]
    pps_first = NAL_START_CODE + pps + NAL_START_CODE + sps
    avcc = build_avcc(pps_first)
    # Profile/compat/level must still come from the SPS body.
    assert avcc[9:12] == sps[1:4]
 def test_nal_length_field_respected():
    """A 2-byte NAL length prefix is written as lengthSizeMinusOne = 1."""
    avcc = build_avcc(bytes.fromhex(VIDEO_AVC_CPD), nal_length_size=2)
    # avcC payload byte 4 (box byte 12) low 2 bits = lengthSizeMinusOne.
    length_size_minus_one = avcc[12] & 0x03
    assert length_size_minus_one == 1
 def test_parse_hevc_sps_format_8bit():
    """The 8-bit manifest SPS parses as 4:2:0 with both bit depths at 8."""
    units = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
    sps_rbsp = remove_emulation_prevention(units[1])
    assert parse_hevc_sps_format(sps_rbsp) == (1, 0, 0)  # 4:2:0, 8-bit
 def test_hvcc_signals_10bit_from_sps():
    """A Main 10 SPS is parsed as 10-bit and that is what hvcC signals."""
    raw = bytes.fromhex(VIDEO_HEVC10_CPD)
    # Pick the SPS by NAL type (33) rather than by position.
    sps = next(unit for unit in split_nal_units(raw) if (unit[0] >> 1) & 0x3F == 33)
    assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 2, 2)  # 4:2:0, 10-bit
    config = build_hvcc(raw)[8:]  # strip box header
    assert config[16] == 0xFC | 0x01  # chromaFormat 4:2:0
    assert config[17] == 0xF8 | 0x02  # bitDepthLumaMinus8 = 2
    assert config[18] == 0xF8 | 0x02  # bitDepthChromaMinus8 = 2
 def test_ttml_init_structure():
    """A TTML text track yields stpp under a subtitle media header and handler."""
    segment = build_init_segment(
        stream_type="text",
        fourcc="TTML",
        codec_private_data="",
        timescale=10000000,
        language="eng",
    )
    expected_markers = (
        b"stpp",
        b"sthd",  # subtitle media header
        b"subt",
        b"SubtitleHandler\0",
        b"http://www.w3.org/ns/ttml\0",
    )
    for marker in expected_markers:
        assert marker in segment
 def test_constant_iv_tenc_form():
    """A constant IV is written after the KID and the per-sample IV size is zeroed."""
    fixed_iv = bytes(range(16))
    segment = build_init_segment(
        stream_type="video",
        fourcc="HVC1",
        codec_private_data=VIDEO_HEVC_CPD,
        timescale=10000000,
        kid=KID,
        constant_iv=fixed_iv,
    )
    # Constant-IV form: default_Per_Sample_IV_Size = 0, then size + IV after the KID.
    kid_then_iv = KID + bytes([len(fixed_iv)]) + fixed_iv
    assert kid_then_iv in segment
    # Skip the fourcc (4) and version/flags (4); the IV size is payload byte 3.
    iv_size_offset = segment.index(b"tenc") + 4 + 4 + 3
    assert segment[iv_size_offset] == 0  # default_Per_Sample_IV_Size
 def make_fragment(senc: bytes = b"", saiz: bytes = b"") -> bytes:
    """Build a minimal moof(traf(tfhd + senc + saiz)) + mdat fragment for track 1."""
    track_header = full_box(b"tfhd", 0, 0, struct.pack(">I", 1) + b"\x00" * 4)
    movie_fragment = box(b"moof", box(b"traf", track_header + senc + saiz))
    media_data = box(b"mdat", b"\x00" * 4)
    return movie_fragment + media_data
 def test_iv_size_from_piff_senc_override_flag():
    """A PIFF senc uuid with the override flag states the IV size explicitly."""
    # PIFF senc uuid with flags&1: AlgorithmID(3) + IV_size(1) + KID(16) override.
    fields = (
        b"\x00\x00\x00\x01",  # version 0, flags = 1
        b"\x00\x00\x01",  # AlgorithmID
        bytes([16]),  # IV_size
        KID,
        struct.pack(">I", 0),  # sample_count
    )
    piff_senc = box(b"uuid", PIFF_SENC_UUID + b"".join(fields))
    fragment = make_fragment(senc=piff_senc)
    assert read_per_sample_iv_size(fragment) == 16
 def test_iv_size_from_senc_payload_length():
    """Without subsamples the IV size is the senc payload divided by sample count."""
    # Standard senc, no subsamples: 3 samples x 8-byte IVs.
    sample_count = struct.pack(">I", 3)
    ivs = b"\x11" * 24
    fragment = make_fragment(senc=full_box(b"senc", 0, 0, sample_count + ivs))
    assert read_per_sample_iv_size(fragment) == 8
 def test_iv_size_from_senc_with_subsamples():
    """With subsample entries present the 8-byte IV size is still recovered."""
    # senc flags&2: per sample IV(8) + entry_count(2) + 6 bytes per entry.
    one_sample = b"\x22" * 8 + struct.pack(">H", 1) + b"\x00" * 6
    payload = struct.pack(">I", 2) + one_sample + one_sample
    fragment = make_fragment(senc=full_box(b"senc", 0, 2, payload))
    assert read_per_sample_iv_size(fragment) == 8
 def test_iv_size_from_saiz_fallback():
    """With no senc, the saiz default_sample_info_size supplies the IV size."""
    default_size = bytes([16])
    sample_count = struct.pack(">I", 5)
    fragment = make_fragment(saiz=full_box(b"saiz", 0, 0, default_size + sample_count))
    assert read_per_sample_iv_size(fragment) == 16
 def test_iv_size_undetermined_returns_none():
    """A fragment with neither senc nor saiz gives no IV size."""
    bare_fragment = make_fragment()
    assert read_per_sample_iv_size(bare_fragment) is None
 def test_hvcc_embeds_vps_sps_pps():
    """Every NAL unit from the CodecPrivateData appears verbatim inside hvcC."""
    raw = bytes.fromhex(VIDEO_HEVC_CPD)
    hvcc = build_hvcc(raw)
    # Each original NAL unit (VPS/SPS/PPS) is embedded verbatim in the arrays.
    assert all(unit in hvcc for unit in split_nal_units(raw))
 def test_avcc_requires_sps_and_pps():
    """CodecPrivateData holding only an SPS is rejected with ValueError."""
    sps_only = b"\x00\x00\x00\x01\x67only_sps"
    with pytest.raises(ValueError):
        build_avcc(sps_only)
 def test_read_track_id_from_fragment():
    """The track_ID is read out of a minimal moof/traf/tfhd chain."""
    # Minimal moof/traf/tfhd carrying track_ID = 7.
    track_header = full_box("tfhd".encode(), 0, 0, struct.pack(">I", 7) + b"\x00" * 4)
    movie_fragment = box(b"moof", box(b"traf", track_header))
    media_data = box(b"mdat", b"\x00\x00")
    assert read_track_id(movie_fragment + media_data) == 7
 def test_read_track_id_missing_returns_none():
    """A buffer holding only an mdat has no track_ID to report."""
    mdat_only = box(b"mdat", b"\x00\x00")
    assert read_track_id(mdat_only) is None
 def test_remove_emulation_prevention():
    """Each 00 00 03 escape loses its 0x03; the byte after it is kept as data."""
    cases = (
        # 00 00 03 XX -> the 0x03 emulation byte is dropped.
        (b"\x00\x00\x03\x01", b"\x00\x00\x01"),
        (b"\x00\x00\x03\x00\x00\x03\x96", b"\x00\x00\x00\x00\x96"),
        # The byte after a consumed escape is data, even another 0x03.
        (b"\x00\x00\x03\x03", b"\x00\x00\x03"),
        (b"\x00\x00\x03\x03\x00\x00\x03\x01", b"\x00\x00\x03\x00\x00\x01"),
    )
    for escaped, expected in cases:
        assert remove_emulation_prevention(escaped) == expected
 def test_two_letter_or_uppercase_language_falls_back_to_und():
    """Language tags that are not three a-z letters must not crash the builder."""
    # mdhd packs three a-z letters; "en"/"ENG" must not crash struct.pack.
    awkward_tags = ("en", "ENG", "", "e1x")
    for tag in awkward_tags:
        segment = build_init_segment(
            stream_type="audio",
            fourcc="AACL",
            codec_private_data=AAC_LC_CPD,
            timescale=10000000,
            language=tag,
        )
        assert segment[4:8] == b"ftyp"
 def test_high_sampling_rate_does_not_overflow():
    """A 96 kHz track builds without overflowing the 16.16 sample-rate field."""
    # 96 kHz exceeds the 16.16 integer field; written as 0 like ffmpeg does.
    high_rate = 96000
    segment = build_init_segment(
        stream_type="audio",
        fourcc="AACL",
        codec_private_data="",
        timescale=10000000,
        sampling_rate=high_rate,
    )
    assert b"mp4a" in segment
 def test_read_track_id_truncated_tfhd_returns_none():
    """A tfhd too short to hold a track_ID yields None rather than raising."""
    short_tfhd = full_box(b"tfhd", 0, 0, b"\x00\x00")  # too short for a track_ID
    movie_fragment = box(b"moof", box(b"traf", short_tfhd))
    assert read_track_id(movie_fragment) is None
 def test_hvcc_profile_tier_level_is_nonzero():
    """hvcC carries a real (non-zero) profile and level taken from the SPS."""
    # De-emulated PTL must yield real profile/level, not the off-by-one garbage.
    config = build_hvcc(bytes.fromhex(VIDEO_HEVC_CPD))[8:]  # strip box header
    general_profile_idc = config[1] & 0x1F
    general_level_idc = config[12]
    assert general_profile_idc != 0
    assert general_level_idc != 0
--- a/unshackle/core/manifests/ism.py
+++ b/unshackle/core/manifests/ism.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import base64
 import hashlib
 import html
 import struct
 import urllib.parse
 from functools import partial
 from pathlib import Path
@@ -18,6 +19,7 @@ from requests import Session
 from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
 from unshackle.core.drm import DRM_T, PlayReady, Widevine
 from unshackle.core.events import events
 from unshackle.core.manifests.ism_init import build_init_segment, read_per_sample_iv_size, read_track_id
 from unshackle.core.session import RnetSession
 from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video
 from unshackle.core.utilities import log_event, try_ensure_utf8
@@ -85,6 +87,104 @@ class ISM:
                drm.append(PlayReady(pssh=pr_pssh, pssh_b64=data))
        return drm
    @staticmethod
    def _init_segment(
        track: AnyTrack, session_drm: Optional[DRM_T], first_segment: Optional[bytes] = None
    ) -> Optional[bytes]:
        """
        Build the ftyp+moov init segment for an ISM track, or None when not possible.

        Smooth fragments are moof+mdat only; the init box is rebuilt from the
        manifest CodecPrivateData (and KID, when encrypted) so the merged file is
        a valid MP4 that shaka/mp4decrypt can parse.

        Parameters:
            track: Track whose ``data["ism"]`` holds the ``stream_index``,
                ``quality_level`` and ``manifest`` entries read here.
            session_drm: DRM object for the track, if any; its first ``kids``
                entry becomes the default KID of the synthesized init.
            first_segment: Bytes of the first downloaded fragment, probed for the
                track_ID and the per-sample IV size.

        Returns:
            The init segment bytes, or None when the track has no ISM data, is a
            non-fTTML subtitle, or the builder rejects the codec/fields (logged
            as a warning so the caller falls back to raw concatenation).
        """
        ism = track.data.get("ism") if isinstance(getattr(track, "data", None), dict) else None
        if not ism:
            return None

        stream_index = ism.get("stream_index")
        quality_level = ism.get("quality_level")
        manifest = ism.get("manifest")
        if stream_index is None or quality_level is None:
            return None

        # CodecPrivateData may legitimately be empty (AAC config is synthesized,
        # EC-3 decoders sync from the frames); the builder handles each case.
        cpd = quality_level.get("CodecPrivateData") or ""
        fourcc = quality_level.get("FourCC") or ""
        root_timescale = manifest.get("TimeScale") if manifest is not None else None
        timescale = int(stream_index.get("TimeScale") or root_timescale or 10000000)
        duration = int((manifest.get("Duration") if manifest is not None else 0) or 0)

        # mdhd needs a 3-letter ISO-639-2 code; manifests often carry 2-letter tags.
        lang_attr = (stream_index.get("Language") or "").strip()
        language = "und"
        if lang_attr and tag_is_valid(lang_attr):
            try:
                language = Language.get(lang_attr).to_alpha3()
            except LookupError:
                language = "und"

        kid: Optional[bytes] = None
        if session_drm is not None:
            kid_uuid = next(iter(getattr(session_drm, "kids", None) or []), None)
            if kid_uuid is not None:
                kid = bytes.fromhex(kid_uuid.hex)

        # NALUnitLengthField: bytes per NAL length prefix, default 4.
        nal_length_size = int(quality_level.get("NALUnitLengthField") or stream_index.get("NALUnitLengthField") or 4)

        # Match the moov track_ID to the fragment's tfhd, else the muxer drops
        # samples; derive the per-sample IV size from the fragment senc/saiz
        # (PIFF default 8). Both probes parse downloaded bytes, so a malformed
        # fragment must degrade to the defaults instead of aborting the download.
        track_id, iv_size = 1, 8
        if first_segment:
            try:
                track_id = read_track_id(first_segment) or 1
                if kid:
                    iv_size = read_per_sample_iv_size(first_segment) or 8
            except (IndexError, ValueError, struct.error) as e:
                log_event(
                    "manifest_ism_init_unsupported",
                    level="WARNING",
                    message=f"Could not probe first ISM fragment, using defaults where needed ({fourcc}): {e}",
                    context={"track_id": getattr(track, "id", None), "fourcc": fourcc},
                )

        try:
            if isinstance(track, Subtitle):
                if track.codec != Subtitle.Codec.fTTML:
                    return None  # plain-text subtitle formats concatenate fine
                return build_init_segment(
                    stream_type="text",
                    fourcc="TTML",
                    codec_private_data="",
                    timescale=timescale,
                    duration=duration,
                    language=language,
                    track_id=track_id,
                )
            if isinstance(track, Video):
                return build_init_segment(
                    stream_type="video",
                    fourcc=fourcc,
                    codec_private_data=cpd,
                    timescale=timescale,
                    duration=duration,
                    language=language,
                    width=int(quality_level.get("MaxWidth") or stream_index.get("MaxWidth") or 0),
                    height=int(quality_level.get("MaxHeight") or stream_index.get("MaxHeight") or 0),
                    track_id=track_id,
                    nal_length_size=nal_length_size,
                    kid=kid,
                    iv_size=iv_size,
                )
            return build_init_segment(
                stream_type="audio",
                fourcc=fourcc,
                codec_private_data=cpd,
                timescale=timescale,
                duration=duration,
                language=language,
                channels=int(quality_level.get("Channels") or 2),
                bits_per_sample=int(quality_level.get("BitsPerSample") or 16),
                sampling_rate=int(quality_level.get("SamplingRate") or 48000),
                track_id=track_id,
                kid=kid,
                iv_size=iv_size,
            )
        except (NotImplementedError, ValueError, struct.error) as e:
            # Unsupported codec, malformed CodecPrivateData or out-of-range field —
            # fall back to raw concatenation rather than aborting the download.
            log_event(
                "manifest_ism_init_unsupported",
                level="WARNING",
                message=f"Could not synthesize ISM init segment ({fourcc}): {e}",
                context={"track_id": getattr(track, "id", None), "fourcc": fourcc},
            )
            return None
    def to_tracks(self, language: Optional[Union[str, Language]] = None) -> Tracks:
        tracks = Tracks()
        base_url = self.url
@@ -383,8 +483,13 @@ class ISM:
            raise FileNotFoundError(error_msg)
        with open(save_path, "wb") as f:
-            for segment_file in segments_to_merge:
+            first_segment = segments_to_merge[0].read_bytes() if segments_to_merge else None
-                segment_data = segment_file.read_bytes()
+            init_segment = ISM._init_segment(track, session_drm, first_segment)
            if init_segment:
                f.write(init_segment)
            for index, segment_file in enumerate(segments_to_merge):
                # First segment was already read for the init synthesis — reuse it.
                segment_data = first_segment if index == 0 and first_segment else segment_file.read_bytes()
                if (
                    not session_drm
                    and isinstance(track, Subtitle)
--- a/unshackle/core/manifests/ism_init.py
+++ b/unshackle/core/manifests/ism_init.py
@@ -0,0 +1,622 @@
 """
 Synthesize an ISO-BMFF initialization segment (ftyp + moov) for ISM / Smooth
 Streaming tracks.
 Smooth Streaming fragments are bare ``moof`` + ``mdat`` pairs; the server never
 sends a ``moov``. The init box must be reconstructed from the manifest's
 ``CodecPrivateData`` (and, for protected content, the track KID) before a muxer
 or decryptor such as shaka-packager can parse the stream. Ported from yt-dlp's
 ``write_piff_header`` and N_m3u8DL-RE's ``MSSMoovProcessor`` with HEVC, Dolby
 Vision, EC-3, TTML and CENC (PIFF) support.
 """
 from __future__ import annotations
 import binascii
 import struct
 from typing import Iterator, Optional
 # Big-endian field packers (named for the bit widths they encode).
 u8 = struct.Struct(">B")
 u16 = struct.Struct(">H")
 u32 = struct.Struct(">I")
 u64 = struct.Struct(">Q")
 s16 = struct.Struct(">h")
 s88 = struct.Struct(">bx")  # 8.8 fixed-point
 s1616 = struct.Struct(">hxx")  # 16.16 fixed-point
 u1616 = struct.Struct(">Hxx")
 s32 = struct.Struct(">i")
 # 3x3 transformation matrix (identity), as stored in tkhd/mvhd.
 UNITY_MATRIX = (
    s32.pack(0x10000) + s32.pack(0) * 3
    + s32.pack(0) + s32.pack(0x10000) + s32.pack(0) * 2
    + s32.pack(0) * 2 + s32.pack(0x40000000)
 )
 TRACK_ENABLED = 0x1
 TRACK_IN_MOVIE = 0x2
 TRACK_IN_PREVIEW = 0x4
 SELF_CONTAINED = 0x1
 # Fixed creation/modification time — deterministic output (no wall clock).
 EPOCH = 0
 NAL_START_CODE = b"\x00\x00\x00\x01"
 # WAVEFORMATEXTENSIBLE SubFormat GUID for Dolby Digital Plus, as serialized
 # (little-endian) inside Smooth EC-3 CodecPrivateData.
 DOLBY_DIGITAL_PLUS_GUID = bytes.fromhex("AF87FBA7022DFB42A4D405CD93843BDD")
 # PIFF SampleEncryptionBox usertype (the pre-CENC 'senc' carried as a uuid box).
 PIFF_SENC_UUID = bytes.fromhex("A2394F525A9B4F14A2446C427C648DF4")
 TTML_NAMESPACE = b"http://www.w3.org/ns/ttml\0"
 # ISO/IEC 14496-3 samplingFrequencyIndex table for AudioSpecificConfig.
 AAC_SAMPLING_FREQUENCY_INDEX = {
    96000: 0x0,
    88200: 0x1,
    64000: 0x2,
    48000: 0x3,
    44100: 0x4,
    32000: 0x5,
    24000: 0x6,
    22050: 0x7,
    16000: 0x8,
    12000: 0x9,
    11025: 0xA,
    8000: 0xB,
    7350: 0xC,
 }
 def box(box_type: bytes, payload: bytes) -> bytes:
    """Serialize a plain ISO-BMFF box: 32-bit size, 4-byte type, then the payload."""
    total_size = len(payload) + 8  # the size field counts the 8-byte header too
    header = u32.pack(total_size) + box_type
    return header + payload
 def full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
    """Serialize a FullBox: a box whose payload opens with version(1) + flags(3)."""
    version_byte = u8.pack(version)
    flag_bytes = u32.pack(flags)[1:]  # keep the low 24 bits of the packed value
    return box(box_type, version_byte + flag_bytes + payload)
 def split_nal_units(codec_private_data: bytes) -> list[bytes]:
    """Split Annex B CodecPrivateData into its NAL units (drops the start codes).

    Both start-code forms are recognized: the 3-byte ``00 00 01`` prefix and
    the 4-byte ``00 00 00 01`` form, along with any extra zero padding in front
    of a prefix. Splitting on the 4-byte form alone misses 3-byte prefixes and
    leaves surplus zero bytes attached to the end of the preceding NAL unit.
    A well-formed NAL unit never ends in 0x00 and never contains ``00 00 01``
    (emulation prevention), so the delimiter cannot match inside a unit.

    Returns the non-empty units in stream order; an empty input gives ``[]``.
    """
    import re  # function-scope: the module's import block does not pull in re

    return [unit for unit in re.split(rb"\x00\x00+\x01", codec_private_data) if unit]
 def remove_emulation_prevention(data: bytes) -> bytes:
    """Drop the 0x03 of every ``00 00 03`` escape in an H.26x NAL unit.

    The search resumes right after each removed 0x03, so the byte that follows
    an escape is always kept as payload, even when it is itself 0x03.
    """
    pieces = []
    cursor = 0
    while True:
        hit = data.find(b"\x00\x00\x03", cursor)
        if hit < 0:
            # No further escape: keep the rest untouched.
            pieces.append(data[cursor:])
            break
        # Keep everything up to and including the two zero bytes, skip the 0x03.
        pieces.append(data[cursor : hit + 2])
        cursor = hit + 3
    return b"".join(pieces)
 class _BitstreamExhausted(ValueError, IndexError):
    """Raised when a read runs past the end of the buffer.

    It is a ValueError so handlers for malformed codec data catch it, and an
    IndexError so code written against plain out-of-range indexing still does.
    """
 class BitReader:
    """MSB-first bit reader with the exp-Golomb decode H.26x headers need.

    ``pos`` is the absolute bit offset of the next unread bit in ``data``.
    """
    def __init__(self, data: bytes) -> None:
        self.data = data
        self.pos = 0
    def read_bits(self, count: int) -> int:
        """Read *count* bits as an unsigned integer and advance the cursor.

        A non-positive count reads nothing and returns 0. Raises an exception
        that is both ValueError and IndexError when fewer than *count* bits
        remain; the cursor is left untouched in that case.
        """
        if count <= 0:
            return 0
        end = self.pos + count
        if end > len(self.data) * 8:
            raise _BitstreamExhausted(
                f"Bitstream exhausted: need {count} bits at bit {self.pos}, have {len(self.data) * 8 - self.pos}"
            )
        first_byte = self.pos >> 3
        last_byte = (end + 7) >> 3
        # Load the covering bytes as one integer, then shift off the bits that
        # lie after the requested range and mask off those before it.
        window = int.from_bytes(self.data[first_byte:last_byte], "big")
        value = (window >> ((last_byte << 3) - end)) & ((1 << count) - 1)
        self.pos = end
        return value
    def read_ue(self) -> int:
        """Decode one unsigned exp-Golomb value, ue(v).

        Raises ValueError when the zero prefix exceeds 32 bits or the buffer
        ends inside the code.
        """
        zeros = 0
        while self.read_bits(1) == 0:
            zeros += 1
            if zeros > 32:
                raise ValueError("Invalid exp-Golomb code")
        return (1 << zeros) - 1 + (self.read_bits(zeros) if zeros else 0)
 def parse_hevc_sps_format(sps_rbsp: bytes) -> tuple[int, int, int]:
    """
    Read the chroma format and bit depths out of an HEVC SPS.

    Takes the de-emulated SPS RBSP, 2-byte NAL header included, and returns
    (chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8).
    Fields ahead of those three are read only to advance the cursor.
    """
    reader = BitReader(sps_rbsp)
    reader.read_bits(16)  # NAL unit header
    reader.read_bits(4)  # sps_video_parameter_set_id
    sub_layers = reader.read_bits(3)  # sps_max_sub_layers_minus1
    reader.read_bits(1)  # sps_temporal_id_nesting_flag
    reader.read_bits(96)  # general profile_tier_level (12 bytes)
    # Per sub-layer: profile_present_flag then level_present_flag, interleaved.
    presence = [(reader.read_bits(1), reader.read_bits(1)) for _ in range(sub_layers)]
    if sub_layers:
        reader.read_bits(2 * (8 - sub_layers))  # reserved_zero_2bits
    for has_profile, has_level in presence:
        if has_profile:
            reader.read_bits(88)  # sub_layer profile_tier
        if has_level:
            reader.read_bits(8)  # sub_layer_level_idc
    reader.read_ue()  # sps_seq_parameter_set_id
    chroma_format = reader.read_ue()
    if chroma_format == 3:
        reader.read_bits(1)  # separate_colour_plane_flag
    reader.read_ue()  # pic_width_in_luma_samples
    reader.read_ue()  # pic_height_in_luma_samples
    conformance_window = reader.read_bits(1)
    if conformance_window:
        for _ in range(4):  # left/right/top/bottom offsets
            reader.read_ue()
    luma_depth_minus8 = reader.read_ue()
    chroma_depth_minus8 = reader.read_ue()
    return chroma_format, luma_depth_minus8, chroma_depth_minus8
 def iter_boxes(data: bytes, start: int, end: int) -> Iterator[tuple[bytes, Optional[bytes], int, int]]:
    """Yield (type, uuid_usertype, payload_start, box_end) for each child box.

    Walks the boxes laid out in ``data[start:end]``. A 32-bit size of 1 means a
    64-bit size follows; a size of 0 means the box runs to ``end``. For ``uuid``
    boxes the 16-byte usertype is returned and skipped, so ``payload_start``
    points past it. ``box_end`` is the declared end and is not clamped.

    Iteration stops quietly instead of raising when the data is damaged:
    ``end`` is capped at the buffer length (a parent may claim more bytes than
    were downloaded), a 64-bit size that does not fit stops the walk, and so
    does a size smaller than the box's own header.
    """
    end = min(end, len(data))
    offset = start
    while offset + 8 <= end:
        size = struct.unpack(">I", data[offset : offset + 4])[0]
        box_type = data[offset + 4 : offset + 8]
        header = 8
        if size == 1:
            if offset + 16 > end:  # truncated 64-bit size field
                return
            size = struct.unpack(">Q", data[offset + 8 : offset + 16])[0]
            header = 16
        if size == 0:
            size = end - offset
        if size < header:  # corrupt box header; stop rather than loop forever
            return
        usertype = None
        if box_type == b"uuid" and offset + header + 16 <= end:
            usertype = data[offset + header : offset + header + 16]
            header += 16
        yield box_type, usertype, offset + header, offset + size
        offset += size
 def find_box(data: bytes, start: int, end: int, target: bytes) -> Optional[tuple[int, int]]:
    """Locate the first child box of type *target* within ``data[start:end]``.

    Returns its (payload_start, box_end) pair, or None when no child matches.
    """
    matches = (
        (payload_start, box_end)
        for kind, _, payload_start, box_end in iter_boxes(data, start, end)
        if kind == target
    )
    return next(matches, None)
 def read_track_id(fragment: bytes) -> Optional[int]:
    """Read the track_ID from a fragment's moof/traf/tfhd box, if present.

    Smooth fragments declare their own track_ID; the synthesized moov must use
    the same value or the muxer cannot associate samples with the track. The
    track_ID sits before any tfhd optional fields, so the flags don't matter.

    Returns None when any box in the moof/traf/tfhd chain is missing or the
    tfhd is too short to hold a track_ID.
    """
    span: Optional[tuple[int, int]] = (0, len(fragment))
    for container in (b"moof", b"traf", b"tfhd"):
        span = find_box(fragment, *span, container)
        if not span:
            return None
    body, box_end = span
    # Bound the read by the tfhd's own end as well as the buffer: a short tfhd
    # followed by another box would otherwise borrow that box's bytes and
    # return a garbage track_ID.
    if body + 8 > min(box_end, len(fragment)):  # truncated tfhd
        return None
    # tfhd payload: version(1) + flags(3) + track_ID(4)
    return struct.unpack(">I", fragment[body + 4 : body + 8])[0]
 def read_per_sample_iv_size(fragment: bytes) -> Optional[int]:
    """
    Derive the per-sample IV size (8 or 16) from a fragment's sample-encryption
    metadata, for the synthesized tenc default_Per_Sample_IV_Size.

    Checks, in order: the PIFF 'senc' uuid override flag (explicit IV size),
    the senc payload length (sample_count vs IV/subsample entries), and the
    saiz default_sample_info_size (only unambiguous without subsamples).

    Returns:
        8 or 16 when one of the checks yields an unambiguous size, otherwise
        None (including when moof/traf are missing or the boxes are truncated).
    """
    moof = find_box(fragment, 0, len(fragment), b"moof")
    if not moof:
        return None
    traf = find_box(fragment, *moof, b"traf")
    if not traf:
        return None

    limit = len(fragment)
    senc: Optional[tuple[int, int]] = None
    saiz_default: Optional[int] = None
    senc_has_subsamples = False
    for box_type, usertype, body, box_end in iter_boxes(fragment, *traf):
        # Never trust a declared box size past the end of the buffer; every
        # read below is bounded by box_end.
        box_end = min(box_end, limit)
        if box_type == b"senc" or (box_type == b"uuid" and usertype == PIFF_SENC_UUID):
            senc = (body, box_end)
        elif box_type == b"saiz" and body + 4 <= box_end:
            flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
            pos = body + 4 + (8 if flags & 0x1 else 0)  # skip aux_info_type fields
            if pos < box_end:
                saiz_default = fragment[pos]

    if senc and senc[0] + 4 <= senc[1]:
        body, box_end = senc
        flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
        senc_has_subsamples = bool(flags & 0x2)
        pos = body + 4
        if flags & 0x1:  # PIFF override: AlgorithmID(3) + IV_size(1) + KID(16)
            if pos + 4 <= box_end and fragment[pos + 3] in (8, 16):
                return fragment[pos + 3]
            # Truncated or nonsensical override: skip its 20 bytes and fall
            # back to measuring the sample entries instead of returning junk.
            pos += 20
        if pos + 4 <= box_end:
            sample_count = struct.unpack_from(">I", fragment, pos)[0]
            pos += 4
            if sample_count:
                if not senc_has_subsamples:
                    # Entries are bare IVs, so the payload divides evenly.
                    size, rem = divmod(box_end - pos, sample_count)
                    if rem == 0 and size in (8, 16):
                        return size
                else:
                    # Walk the entries with each candidate IV size; the one that
                    # lands exactly on the box end is correct.
                    for iv_size in (8, 16):
                        cursor = pos
                        for _ in range(sample_count):
                            cursor += iv_size
                            if cursor + 2 > box_end:
                                break
                            entries = struct.unpack_from(">H", fragment, cursor)[0]
                            cursor += 2 + 6 * entries  # subsample_count + 6 bytes per entry
                            if cursor > box_end:
                                break
                        else:
                            if cursor == box_end:
                                return iv_size

    # With subsamples the saiz default covers more than the IV, so it is only
    # usable when the senc did not signal subsample encryption.
    if not senc_has_subsamples and saiz_default in (8, 16):
        return saiz_default
    return None
 def build_avcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """Build an avcC (AVC decoder config) box from SPS+PPS CodecPrivateData.

    Args:
        codec_private_data: raw CodecPrivateData bytes, split into NAL units
            by ``split_nal_units``.
        nal_length_size: bytes per NAL length prefix; stored as
            lengthSizeMinusOne in the low two bits of the fifth payload byte.

    Returns:
        The avcC box carrying the first SPS and the first PPS found.

    Raises:
        ValueError: if no SPS or no PPS is present, or the SPS is too short to
            contain the profile / compatibility / level bytes.
    """
    nals = split_nal_units(codec_private_data)
    # Pick parameter sets by H.264 NAL type (low 5 bits): 7 = SPS, 8 = PPS.
    # Manifests do not guarantee SPS-first ordering. Empty units are skipped
    # so that indexing their first byte cannot raise IndexError.
    sps = next((n for n in nals if n and n[0] & 0x1F == 7), None)
    pps = next((n for n in nals if n and n[0] & 0x1F == 8), None)
    if not sps or not pps:
        raise ValueError("AVC CodecPrivateData must contain SPS and PPS NAL units")
    if len(sps) < 4:
        # sps[1:4] below would otherwise silently produce a short avcC header.
        raise ValueError("AVC SPS NAL unit is too short to carry profile/compat/level")
    payload = b"".join(
        (
            u8.pack(1),  # configuration version
            sps[1:4],  # profile / compat / level (from SPS NAL body)
            # reserved(6) + lengthSizeMinusOne(2); masked like build_hvcc so an
            # out-of-range value cannot spill into the reserved bits or make
            # the pack call fail.
            u8.pack(0xFC | ((nal_length_size - 1) & 0x03)),
            u8.pack(0xE0 | 1),  # reserved + number of SPS (1)
            u16.pack(len(sps)),
            sps,
            u8.pack(1),  # number of PPS
            u16.pack(len(pps)),
            pps,
        )
    )
    return box(b"avcC", payload)
 def build_hvcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """
    Assemble an hvcC (HEVC decoder config) box from VPS+SPS+PPS CodecPrivateData.

    The 12 profile/tier/level bytes are copied from the de-emulated SPS, and
    the chroma format and bit depths come from ``parse_hevc_sps_format`` so
    10-bit/HDR streams signal correctly. A short SPS yields all-zero
    profile/tier/level bytes, and a format parse failure falls back to
    8-bit 4:2:0.

    Raises:
        ValueError: if fewer than three NAL units are present.
    """
    nal_units = split_nal_units(codec_private_data)
    if len(nal_units) < 3:
        raise ValueError("HEVC CodecPrivateData must contain VPS, SPS and PPS NAL units")

    # Bucket the units by HEVC NAL type, i.e. (first byte >> 1) & 0x3F.
    grouped: dict[int, list[bytes]] = {}
    for unit in nal_units:
        grouped.setdefault((unit[0] >> 1) & 0x3F, []).append(unit)

    # Type 33 is the SPS; a missing SPS is treated as empty input.
    sps_units = grouped.get(33)
    rbsp = remove_emulation_prevention(sps_units[0] if sps_units else b"")

    # In the de-emulated SPS, profile_tier_level follows the 2-byte NAL header
    # and one byte holding sps_video_parameter_set_id(4),
    # sps_max_sub_layers_minus1(3) and sps_temporal_id_nesting_flag(1).
    # Layout: profile byte(1) + compat flags(4) + constraint flags(6) + level(1).
    if len(rbsp) >= 15:
        profile_tier_level = rbsp[3:15]
    else:
        profile_tier_level = b"\x00" * 12

    try:
        chroma_idc, luma_depth_m8, chroma_depth_m8 = parse_hevc_sps_format(rbsp)
    except (IndexError, ValueError):
        chroma_idc, luma_depth_m8, chroma_depth_m8 = 1, 0, 0

    header = b"".join(
        (
            u8.pack(1),  # configurationVersion
            profile_tier_level,
            u16.pack(0xF000),  # reserved(4) + min_spatial_segmentation_idc(12)
            u8.pack(0xFC),  # reserved(6) + parallelismType(2)
            u8.pack(0xFC | (chroma_idc & 0x03)),  # reserved(6) + chromaFormat(2)
            u8.pack(0xF8 | (luma_depth_m8 & 0x07)),  # reserved(5) + bitDepthLumaMinus8(3)
            u8.pack(0xF8 | (chroma_depth_m8 & 0x07)),  # reserved(5) + bitDepthChromaMinus8(3)
            u16.pack(0),  # avgFrameRate
            # constantFrameRate(2)+numTemporalLayers(3)+temporalIdNested(1)+lengthSizeMinusOne(2)
            u8.pack((nal_length_size - 1) & 0x03),
        )
    )

    # One array per parameter-set type that is present, in VPS, SPS, PPS order.
    array_blobs = []
    for nal_type in (32, 33, 34):
        members = grouped.get(nal_type)
        if not members:
            continue
        array_blobs.append(
            u8.pack(0x80 | nal_type)  # array_completeness(1)+reserved(1)+NAL type(6)
            + u16.pack(len(members))
            + b"".join(u16.pack(len(member)) + member for member in members)
        )

    return box(b"hvcC", header + u8.pack(len(array_blobs)) + b"".join(array_blobs))
 def build_esds(codec_private_data: bytes) -> bytes:
    """Build an esds box wrapping the AAC AudioSpecificConfig.

    The AudioSpecificConfig is nested as
    ES_Descriptor(0x03) > DecoderConfigDescriptor(0x04) >
    DecoderSpecificInfo(0x05), followed by an SLConfigDescriptor(0x06).

    Descriptor sizes use the MPEG-4 expandable encoding (7 bits per byte, high
    bit set on every byte except the last). Sizes below 128 still occupy a
    single byte, so output for ordinary 2-5 byte configs is unchanged, while
    larger configs no longer produce a mis-encoded or unpackable length.

    Args:
        codec_private_data: the raw AudioSpecificConfig bytes.

    Returns:
        The complete esds full box (version 0, flags 0).
    """

    def descriptor(tag: int, body: bytes) -> bytes:
        """Prefix ``body`` with its tag and expandable-size field."""
        size = len(body)
        size_bytes = [size & 0x7F]  # final byte: continuation bit clear
        size >>= 7
        while size:
            size_bytes.append(0x80 | (size & 0x7F))
            size >>= 7
        return bytes((tag, *reversed(size_bytes))) + body

    # DecoderSpecificInfo (tag 0x05)
    dsi = descriptor(0x05, codec_private_data)
    # DecoderConfigDescriptor (tag 0x04): objectType=0x40 (AAC), stream type audio
    dcd = descriptor(
        0x04,
        u8.pack(0x40)  # object type indication = MPEG-4 AAC
        + u8.pack(0x15)  # stream type (audio) << 2 | upstream | reserved
        + b"\x00\x00\x00"  # buffer size
        + u32.pack(0)  # max bitrate
        + u32.pack(0)  # avg bitrate
        + dsi,
    )
    # SLConfigDescriptor (tag 0x06) with predefined = 0x02
    sl = descriptor(0x06, u8.pack(0x02))
    # ES_Descriptor (tag 0x03): ES_ID(2) + flags(1) precede the nested descriptors
    es = descriptor(0x03, u16.pack(0) + u8.pack(0) + dcd + sl)
    return full_box(b"esds", 0, 0, es)
 def build_dec3(codec_private_data: bytes) -> Optional[bytes]:
    """Wrap the dec3 payload found in Smooth EC-3 CodecPrivateData in a dec3 box.

    Smooth EC-3 CodecPrivateData ([MS-SSTR] AudioTag 65534) serializes a
    WAVEFORMATEXTENSIBLE, either in full or as just its extension (samples
    per block + channel mask + DD+ SubFormat GUID), and the raw dec3 payload
    (ETSI TS 102 366 F.6) follows the GUID. The GUID is located by search, so
    both layouts are handled.

    Returns None when the GUID is absent or nothing follows it; decoders can
    still sync from the EC-3 frames in mdat.
    """
    guid_at = codec_private_data.find(DOLBY_DIGITAL_PLUS_GUID)
    if guid_at == -1:
        return None
    dec3_payload = codec_private_data[guid_at + 16 :]
    if not dec3_payload:
        return None
    return box(b"dec3", dec3_payload)
 def synthesize_aac_codec_private_data(fourcc: str, sampling_rate: int, channels: int) -> bytes:
    """Generate the AAC AudioSpecificConfig when the manifest omits it.

    AACH produces an HE-AAC (SBR, AOT 5) config whose extension sampling
    frequency is twice the core rate; any other FourCC produces an AAC-LC
    (AOT 2) config. For rates present in AAC_SAMPLING_FREQUENCY_INDEX these
    are the usual 4-byte and 2-byte forms.

    A rate missing from the table is written with the 0xF escape index
    followed by the explicit 24-bit frequency, instead of silently falling
    back to index 0 (which decoders read as 96 kHz). The channel count is
    converted to the 4-bit channelConfiguration code: 8 channels (7.1) is
    code 7, other counts are used as-is, and the value is masked so it cannot
    spill into the neighbouring frequency bits.

    Args:
        fourcc: Smooth FourCC; only "AACH" selects the HE-AAC layout.
        sampling_rate: core sampling rate in Hz.
        channels: number of audio channels.

    Returns:
        The AudioSpecificConfig, zero-padded to a whole number of bytes.
    """
    channel_config = (7 if channels == 8 else channels) & 0x0F

    # (value, bit width) pairs in stream order, most significant bit first.
    fields: list[tuple[int, int]] = []

    def put_frequency(rate: int) -> None:
        """Append samplingFrequencyIndex, plus the explicit rate when escaped."""
        index = AAC_SAMPLING_FREQUENCY_INDEX.get(rate)
        if index is None:
            fields.append((0xF, 4))
            fields.append((rate & 0xFFFFFF, 24))
        else:
            fields.append((index, 4))

    if fourcc == "AACH":
        fields.append((0x05, 5))  # audioObjectType = SBR
        put_frequency(sampling_rate)
        fields.append((channel_config, 4))
        put_frequency(sampling_rate * 2)  # extension sampling frequency
        fields.append((0x02, 5))  # core object type = AAC LC
    else:
        fields.append((0x02, 5))  # audioObjectType = AAC LC
        put_frequency(sampling_rate)
        fields.append((channel_config, 4))
    # GASpecificConfig: frameLengthFlag, dependsOnCoreCoder, extensionFlag
    fields.append((0, 3))

    packed = 0
    total_bits = 0
    for value, width in fields:
        packed = (packed << width) | value
        total_bits += width
    padding = -total_bits % 8  # alignment bits
    return (packed << padding).to_bytes((total_bits + padding) // 8, "big")
 def build_sinf(
    original_format: bytes,
    kid: bytes,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
 ) -> bytes:
    """Assemble the sinf protection box (frma + schm cenc + schi/tenc) for CENC.

    iv_size becomes the tenc default_Per_Sample_IV_Size (8 or 16). If a
    non-empty constant_iv is supplied, the per-sample IV size is written as 0
    and the constant IV, prefixed by its length, is appended after the KID
    per ISO/IEC 23001-7 (cbcs-style constant-IV form).
    """
    frma = box(b"frma", original_format)
    schm = full_box(b"schm", 0, 0, b"cenc" + u32.pack(0x00010000))

    use_constant_iv = bool(constant_iv)
    tenc_fields = [
        u8.pack(0),  # reserved
        u8.pack(0),  # default_crypt_byte_block / skip_byte_block (cenc)
        u8.pack(1),  # default_isProtected
        u8.pack(0 if use_constant_iv else iv_size),  # default_Per_Sample_IV_Size
        kid,  # default_KID (16 bytes)
    ]
    if use_constant_iv:
        tenc_fields.append(u8.pack(len(constant_iv)) + constant_iv)
    tenc = full_box(b"tenc", 0, 0, b"".join(tenc_fields))

    return box(b"sinf", frma + schm + box(b"schi", tenc))
 def build_init_segment(
    *,
    stream_type: str,
    fourcc: str,
    codec_private_data: str,
    timescale: int = 10000000,
    duration: int = 0,
    language: str = "und",
    width: int = 0,
    height: int = 0,
    channels: int = 2,
    bits_per_sample: int = 16,
    sampling_rate: int = 48000,
    track_id: int = 1,
    nal_length_size: int = 4,
    kid: Optional[bytes] = None,
    iv_size: int = 8,
    constant_iv: Optional[bytes] = None,
 ) -> bytes:
    """
    Assemble the ftyp + moov initialization segment for a single track.

    stream_type: "video" | "audio" | "text"; anything else raises ValueError.
    fourcc: Smooth FourCC, compared case-insensitively. Video accepts
            "H264"/"AVC1"/"DAVC", "HVC1"/"HEV1"/"HEVC"/"H265" and
            "DVHE"/"DVH1"; audio accepts "AACL"/"AACH"/"AAC" and "EC-3";
            text accepts "TTML"/"STPP"/"DFXP". Any other value raises
            NotImplementedError.
    codec_private_data: hex string from the manifest QualityLevel (may be
            empty; AAC then gets a synthesized AudioSpecificConfig).
    language: three lowercase-able a-z letters, otherwise "und" is used.
    nal_length_size: manifest NALUnitLengthField (bytes per NAL length prefix).
    kid: 16-byte default key id; when not None, video/audio sample entries are
            wrapped as encv/enca with a sinf box.
    iv_size / constant_iv: tenc IV form (see build_sinf).
    """
    if stream_type not in ("video", "audio", "text"):
        raise ValueError(f"Unsupported stream type: {stream_type}")

    fourcc = (fourcc or "").upper()
    cpd = binascii.unhexlify(codec_private_data) if codec_private_data else b""
    encrypted = kid is not None

    # mdhd stores exactly three a-z letters in 5-bit fields; any other tag
    # (2-letter, digits, ...) would underflow them, so it becomes "und".
    lang = (language or "").lower()
    if not (len(lang) == 3 and all("a" <= c <= "z" for c in lang)):
        lang = "und"

    def protect(codec_fourcc: bytes, protected_fourcc: bytes, payload: bytes) -> bytes:
        """Box a sample entry, adding sinf and the enc* type when encrypted."""
        if encrypted:
            payload += build_sinf(codec_fourcc, kid, iv_size, constant_iv)
            return box(protected_fourcc, payload)
        return box(codec_fourcc, payload)

    # --- ftyp ---
    ftyp = box(b"ftyp", b"".join((b"isml", u32.pack(1), b"iso5", b"iso6", b"piff", b"msdh")))

    # creation_time + modification_time, shared by mvhd, tkhd and mdhd
    timestamps = u64.pack(EPOCH) + u64.pack(EPOCH)

    # --- mvhd ---
    mvhd_body = b"".join(
        (
            timestamps,
            u32.pack(timescale),
            u64.pack(duration),
            s1616.pack(1),
            s88.pack(1),
            u16.pack(0),
            u32.pack(0) * 2,
            UNITY_MATRIX,
            u32.pack(0) * 6,
            u32.pack(0xFFFFFFFF),
        )
    )
    mvhd = full_box(b"mvhd", 1, 0, mvhd_body)

    # --- tkhd ---
    tkhd_body = b"".join(
        (
            timestamps,
            u32.pack(track_id),
            u32.pack(0),
            u64.pack(duration),
            u32.pack(0) * 2,
            s16.pack(0),
            s16.pack(0),
            s88.pack(1 if stream_type == "audio" else 0),
            u16.pack(0),
            UNITY_MATRIX,
            u1616.pack(width),
            u1616.pack(height),
        )
    )
    tkhd = full_box(b"tkhd", 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_body)

    # --- mdhd + hdlr ---
    # Three letters, each stored as (character code - 0x60) in 5 bits.
    packed_lang = 0
    for letter in lang:
        packed_lang = (packed_lang << 5) | (ord(letter) - 0x60)
    mdhd_body = b"".join(
        (
            timestamps,
            u32.pack(timescale),
            u64.pack(duration),
            u16.pack(packed_lang),
            u16.pack(0),
        )
    )
    mdhd = full_box(b"mdhd", 1, 0, mdhd_body)

    handler_type, handler_name = {
        "audio": (b"soun", b"SoundHandler\0"),
        "text": (b"subt", b"SubtitleHandler\0"),
        "video": (b"vide", b"VideoHandler\0"),
    }[stream_type]
    hdlr = full_box(b"hdlr", 0, 0, u32.pack(0) + handler_type + u32.pack(0) * 3 + handler_name)
    if stream_type == "audio":
        media_header = full_box(b"smhd", 0, 0, s88.pack(0) + u16.pack(0))
    elif stream_type == "text":
        media_header = full_box(b"sthd", 0, 0, b"")
    else:
        media_header = full_box(b"vmhd", 0, 1, u16.pack(0) + u16.pack(0) * 3)

    # --- dinf ---
    url_entry = full_box(b"url ", 0, SELF_CONTAINED, b"")
    dinf = box(b"dinf", full_box(b"dref", 0, 0, u32.pack(1) + url_entry))

    # --- stsd sample entry ---
    entry_prefix = u8.pack(0) * 6 + u16.pack(1)  # reserved + data reference index
    if stream_type == "video":
        visual_fields = b"".join(
            (
                u16.pack(0),
                u16.pack(0),
                u32.pack(0) * 3,
                u16.pack(width),
                u16.pack(height),
                u1616.pack(0x48),
                u1616.pack(0x48),
                u32.pack(0),
                u16.pack(1),
                u8.pack(0) * 32,
                u16.pack(0x18),
                s16.pack(-1),
            )
        )
        # Dolby Vision over HEVC reuses the hvcC config under a dvh1 entry.
        video_codecs = {
            "H264": (build_avcc, b"avc1"),
            "AVC1": (build_avcc, b"avc1"),
            "DAVC": (build_avcc, b"avc1"),
            "HVC1": (build_hvcc, b"hvc1"),
            "HEV1": (build_hvcc, b"hvc1"),
            "HEVC": (build_hvcc, b"hvc1"),
            "H265": (build_hvcc, b"hvc1"),
            "DVHE": (build_hvcc, b"dvh1"),
            "DVH1": (build_hvcc, b"dvh1"),
        }
        codec = video_codecs.get(fourcc)
        if codec is None:
            raise NotImplementedError(f"Unsupported video FourCC: {fourcc}")
        build_config, codec_fourcc = codec
        config_box = build_config(cpd, nal_length_size)
        sample_entry_box = protect(codec_fourcc, b"encv", entry_prefix + visual_fields + config_box)
    elif stream_type == "audio":
        # samplerate is 16.16 fixed-point; rates above 65535 Hz are written as 0
        # (decoders read the real rate from the codec config), matching ffmpeg.
        entry_rate = sampling_rate if sampling_rate <= 0xFFFF else 0
        audio_payload = entry_prefix + b"".join(
            (
                u32.pack(0) * 2,
                u16.pack(channels),
                u16.pack(bits_per_sample),
                u16.pack(0),
                u16.pack(0),
                u32.pack(entry_rate << 16),
            )
        )
        if fourcc in ("AACL", "AACH", "AAC"):
            if not cpd:
                cpd = synthesize_aac_codec_private_data(fourcc, sampling_rate, channels)
            audio_payload += build_esds(cpd)
            codec_fourcc = b"mp4a"
        elif fourcc == "EC-3":
            dec3 = build_dec3(cpd)
            if dec3:
                audio_payload += dec3
            codec_fourcc = b"ec-3"
        else:
            raise NotImplementedError(f"Unsupported audio FourCC: {fourcc}")
        sample_entry_box = protect(codec_fourcc, b"enca", audio_payload)
    else:  # text
        if fourcc not in ("TTML", "STPP", "DFXP"):
            raise NotImplementedError(f"Unsupported text FourCC: {fourcc}")
        # XMLSubtitleSampleEntry: namespace + schema_location + aux mime types.
        sample_entry_box = box(b"stpp", entry_prefix + TTML_NAMESPACE + b"\0" + b"\0")

    stsd = full_box(b"stsd", 0, 0, u32.pack(1) + sample_entry_box)

    # --- empty sample tables (fragmented: real samples live in moof/traf) ---
    empty_tables = (
        full_box(b"stts", 0, 0, u32.pack(0)),
        full_box(b"stsc", 0, 0, u32.pack(0)),
        full_box(b"stsz", 0, 0, u32.pack(0) + u32.pack(0)),
        full_box(b"stco", 0, 0, u32.pack(0)),
    )
    stbl = box(b"stbl", stsd + b"".join(empty_tables))
    minf = box(b"minf", media_header + dinf + stbl)
    mdia = box(b"mdia", mdhd + hdlr + minf)
    trak = box(b"trak", tkhd + mdia)

    # --- mvex (mehd + trex) signals a fragmented file ---
    mehd = full_box(b"mehd", 1, 0, u64.pack(duration))
    trex_body = b"".join(
        (
            u32.pack(track_id),
            u32.pack(1),
            u32.pack(0),
            u32.pack(0),
            u32.pack(0),
        )
    )
    trex = full_box(b"trex", 0, 0, trex_body)
    mvex = box(b"mvex", mehd + trex)

    return ftyp + box(b"moov", mvhd + trak + mvex)