fix(ism): rebuild moov init segment for Smooth Streaming decrypt

ISM (Smooth Streaming) tracks raw-concatenate moof+mdat fragments with no ftyp/moov, so shaka-packager/mp4decrypt fail with PARSER_FAILURE (exit 2) on decrypt. The init box was previously built by n_m3u8dl_re, removed in the downloader consolidation. Add ism_init.py, a dependency-free byte-level MP4 init-segment synthesizer that rebuilds ftyp+moov from the manifest CodecPrivateData, ported from yt-dlp's write_piff_header and N_m3u8DL-RE's MSSMoovProcessor with full codec parity:

- AVC (H264/AVC1/DAVC), with SPS/PPS picked by NAL type rather than position and NALUnitLengthField honored
- HEVC (HVC1/HEV1) with chroma format and bit depths parsed from the de-emulated SPS via exp-Golomb so 10-bit/HDR signals correctly, and profile/tier/level lifted from the SPS PTL
- Dolby Vision (DVHE/DVH1) as hvcC with a dvh1 sample entry
- AAC (AACL/AACH) with the AudioSpecificConfig synthesized from SamplingRate/Channels when the manifest omits CodecPrivateData
- EC-3 with a real dec3 box extracted from the WAVEFORMATEXTENSIBLE CodecPrivateData (Dolby GUID located by search, not fixed offset)
- TTML subtitles as stpp/sthd/subt, wired for fragmented-TTML tracks

CENC wrapping (encv/enca + sinf/tenc with default_KID) covers encrypted tracks: the per-sample IV size is derived from the fragment senc/saiz (PIFF override flag, payload arithmetic, saiz fallback) instead of assuming 8, and the constant-IV tenc form is supported. Read the track_ID from the first fragment's tfhd so the moov matches and the muxer does not drop samples. Wire ISM.download_track to prepend the synthesized init before merging; unsupported codecs soft-fail to raw concatenation with a warning. Harden against real-world inputs: 2-letter/uppercase manifest language tags normalize to ISO-639-2 (und fallback), >65535 Hz sample rates no longer overflow the 16.16 field, truncated tfhd returns None, struct.error joins the soft-fail handler, and the emulation-prevention scan no longer over-strips consecutive escapes.
Add regression tests (36) covering box structure, every supported FourCC, 10-bit SPS parsing, ASC synthesis, dec3 extraction, IV-size derivation and the crash fixes. Validated structurally per codec with ffmpeg-minted fragments: shaka-packager parses synth-init+fragments with exit 0 and ffprobe reports the expected codec, including a live run against a public Smooth Streaming server.
2026-06-22 17:07:23 +00:00 · 2026-06-11 13:41:58 -06:00
parent 466bf610cc
commit 39034f2bb5
3 changed files with 1139 additions and 2 deletions
--- a/tests/core/test_ism_init.py
+++ b/tests/core/test_ism_init.py
@@ -0,0 +1,410 @@
+"""Regression tests for ISM init-segment synthesis (ftyp + moov).
+
+Smooth Streaming fragments carry no moov; the init box must be rebuilt from the
+manifest CodecPrivateData before shaka/mp4decrypt can parse the stream. These
+guard the byte-level box structure so a future downloader refactor cannot
+silently drop it again (the c323db9 regression).
+"""
+
+from __future__ import annotations
+
+import struct
+
+import pytest
+
+from unshackle.core.manifests.ism_init import (NAL_START_CODE, PIFF_SENC_UUID, box, build_avcc, build_dec3,
+                                               build_hvcc, build_init_segment, full_box, parse_hevc_sps_format,
+                                               read_per_sample_iv_size, read_track_id, remove_emulation_prevention,
+                                               split_nal_units, synthesize_aac_codec_private_data)
+
+# Real CodecPrivateData taken from a Smooth Streaming manifest.
+VIDEO_HEVC_CPD = (
+    "0000000140010C01FFFF01600000030090000003000003009695980900000001420101016000000300900000"
+    "030000030096A001E020064165959A4930BC05A80808082000007D20000BB801000000014401C172B66240"
+)
+# H.264 SPS+PPS (start-code delimited) for the AVC path.
+VIDEO_AVC_CPD = "00000001674d401e9a6602800b76020000003e90000bb800f18311200000000168ebccb22c"
+# 10-bit (Main 10) HEVC VPS+SPS+PPS minted with x265; ffprobe reads the
+# synthesized init as "Main 10 / yuv420p10le".
+VIDEO_HEVC10_CPD = (
+    "0000000140010c01ffff02200000030090000003000003003c959809000000000142010102200000030090"
+    "000003000003003ca00a080b9f6d96566924caf0168080000003008000000c8400000000014401c172b4624000"
+)
+AAC_LC_CPD = "1190"
+# Real Smooth EC-3 CodecPrivateData: WAVEFORMATEXTENSIBLE extension (samples
+# per block + channel mask + DD+ GUID) followed by the 5-byte dec3 payload.
+EC3_CPD = "00063F000000AF87FBA7022DFB42A4D405CD93843BDD0600200F00"
+KID = bytes.fromhex("09fd2bd778bb544785ed2322dc6a7d87")
+
+
+def top_level_boxes(data: bytes) -> list[tuple[str, int]]:
+    """Walk the outermost ISO-BMFF boxes of *data* and list them as (type, size).
+
+    A 32-bit size of 1 means the real size is the 64-bit value after the type;
+    a size of 0 means the box runs to the end of the buffer.
+    """
+    total = len(data)
+    found = []
+    cursor = 0
+    while total - cursor >= 8:
+        (declared,) = struct.unpack_from(">I", data, cursor)
+        name = data[cursor + 4 : cursor + 8].decode("latin1")
+        if declared == 1:
+            (declared,) = struct.unpack_from(">Q", data, cursor + 8)
+        if declared == 0:
+            declared = total - cursor
+        found.append((name, declared))
+        cursor += declared
+    return found
+
+
+def test_split_nal_units_drops_start_codes():
+    nals = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
+    # VPS (32), SPS (33), PPS (34) by HEVC NAL type = (first_byte >> 1) & 0x3F.
+    assert [(n[0] >> 1) & 0x3F for n in nals] == [32, 33, 34]
+
+
+def test_hevc_init_structure():
+    init = build_init_segment(
+        stream_type="video",
+        fourcc="HVC1",
+        codec_private_data=VIDEO_HEVC_CPD,
+        timescale=10000000,
+        width=3840,
+        height=1600,
+    )
+    boxes = top_level_boxes(init)
+    assert [b[0] for b in boxes] == ["ftyp", "moov"]
+    assert boxes[0][1] + boxes[1][1] == len(init)
+    assert b"hvcC" in init
+    assert b"hvc1" in init
+    # Unencrypted: no protection scheme boxes.
+    assert b"encv" not in init and b"sinf" not in init
+
+
+def test_avc_init_structure():
+    init = build_init_segment(
+        stream_type="video",
+        fourcc="H264",
+        codec_private_data=VIDEO_AVC_CPD,
+        timescale=10000000,
+        width=1280,
+        height=720,
+    )
+    assert init[4:8] == b"ftyp"
+    assert b"avcC" in init and b"avc1" in init
+
+
+def test_aac_audio_init_structure():
+    init = build_init_segment(
+        stream_type="audio",
+        fourcc="AACL",
+        codec_private_data=AAC_LC_CPD,
+        timescale=10000000,
+        channels=2,
+        sampling_rate=48000,
+    )
+    assert b"mp4a" in init and b"esds" in init
+    assert b"smhd" in init  # sound media header, not video
+
+
+def test_encrypted_init_has_cenc_boxes():
+    init = build_init_segment(
+        stream_type="video",
+        fourcc="HVC1",
+        codec_private_data=VIDEO_HEVC_CPD,
+        timescale=10000000,
+        width=3840,
+        height=1600,
+        kid=KID,
+    )
+    # Encrypted sample entry is wrapped: encv -> sinf(frma+schm+schi(tenc)).
+    assert b"encv" in init
+    assert b"sinf" in init and b"frma" in init and b"tenc" in init
+    assert b"cenc" in init
+    # The 16-byte default_KID must be embedded verbatim for shaka to map the key.
+    assert KID in init
+    # Original codec preserved inside frma for the muxer.
+    assert b"hvc1" in init
+
+
+def test_unsupported_codec_raises():
+    # Unknown FourCC (e.g. VC-1); caller soft-fails to raw concat.
+    with pytest.raises(NotImplementedError):
+        build_init_segment(
+            stream_type="video",
+            fourcc="WVC1",
+            codec_private_data="00063F00",
+            timescale=10000000,
+        )
+
+
+def test_ec3_init_embeds_dec3_from_codec_private_data():
+    init = build_init_segment(
+        stream_type="audio",
+        fourcc="EC-3",
+        codec_private_data=EC3_CPD,
+        timescale=10000000,
+        channels=6,
+        sampling_rate=48000,
+    )
+    assert b"ec-3" in init
+    # dec3 payload = CodecPrivateData past the 22-byte WAVEFORMATEXTENSIBLE header.
+    assert box(b"dec3", bytes.fromhex(EC3_CPD)[22:]) in init
+    assert b"esds" not in init  # no MPEG-4 descriptor inside an ec-3 entry
+
+
+def test_ec3_encrypted_wraps_enca_with_frma():
+    init = build_init_segment(
+        stream_type="audio",
+        fourcc="EC-3",
+        codec_private_data=EC3_CPD,
+        timescale=10000000,
+        channels=6,
+        kid=KID,
+    )
+    assert b"enca" in init and b"sinf" in init and b"tenc" in init
+    assert box(b"frma", b"ec-3") in init
+    assert KID in init
+
+
+def test_ec3_dec3_found_in_full_waveformatextensible():
+    # Some services ship the full WAVEFORMATEX header (18 bytes) before the
+    # extension; the dec3 payload still follows the DD+ GUID.
+    full = b"\xfe\xff" + b"\x00" * 16 + bytes.fromhex(EC3_CPD)
+    payload = bytes.fromhex(EC3_CPD)[22:]
+    assert build_dec3(full) == box(b"dec3", payload)
+
+
+def test_ec3_without_dolby_guid_builds_bare_entry():
+    assert build_dec3(b"\x00\x06\x3f\x00") is None
+    init = build_init_segment(
+        stream_type="audio",
+        fourcc="EC-3",
+        codec_private_data="",
+        timescale=10000000,
+        channels=6,
+    )
+    assert b"ec-3" in init and b"dec3" not in init
+
+
+def test_aac_codec_private_data_synthesis_matches_real_manifest():
+    # 48 kHz stereo AAC-LC must produce 0x1190 — the exact ASC real manifests carry.
+    assert synthesize_aac_codec_private_data("AACL", 48000, 2).hex() == "1190"
+
+
+def test_aach_synthesis_signals_sbr():
+    asc = synthesize_aac_codec_private_data("AACH", 24000, 2)
+    assert len(asc) == 4
+    assert asc[0] >> 3 == 0x05  # AOT 5 = SBR (HE-AAC)
+    # Extension sampling frequency = core * 2 = 48 kHz (index 3).
+    assert ((asc[1] & 0x01) << 1) | (asc[2] >> 7) == 0x03
+
+
+def test_aac_init_without_codec_private_data_synthesizes_asc():
+    init = build_init_segment(
+        stream_type="audio",
+        fourcc="AACL",
+        codec_private_data="",
+        timescale=10000000,
+        channels=2,
+        sampling_rate=48000,
+    )
+    assert b"mp4a" in init and b"esds" in init
+    assert bytes.fromhex(AAC_LC_CPD) in init
+
+
+def test_dolby_vision_uses_dvh1_sample_entry():
+    init = build_init_segment(
+        stream_type="video",
+        fourcc="DVH1",
+        codec_private_data=VIDEO_HEVC_CPD,
+        timescale=10000000,
+        width=3840,
+        height=1600,
+    )
+    assert b"dvh1" in init and b"hvcC" in init
+    assert b"hvc1" not in init
+
+
+def test_davc_maps_to_avc1():
+    init = build_init_segment(
+        stream_type="video",
+        fourcc="DAVC",
+        codec_private_data=VIDEO_AVC_CPD,
+        timescale=10000000,
+    )
+    assert b"avc1" in init and b"avcC" in init
+
+
+def test_lowercase_fourcc_normalized():
+    # Real manifests ship FourCC="hvc1" in lowercase.
+    init = build_init_segment(
+        stream_type="video",
+        fourcc="hvc1",
+        codec_private_data=VIDEO_HEVC_CPD,
+        timescale=10000000,
+    )
+    assert b"hvcC" in init
+
+
+def test_avcc_selects_sps_pps_by_nal_type_not_position():
+    nals = split_nal_units(bytes.fromhex(VIDEO_AVC_CPD))
+    swapped = NAL_START_CODE + nals[1] + NAL_START_CODE + nals[0]  # PPS first
+    avcc = build_avcc(swapped)
+    # Profile/compat/level must still come from the SPS body.
+    assert avcc[9:12] == nals[0][1:4]
+
+
+def test_nal_length_field_respected():
+    avcc = build_avcc(bytes.fromhex(VIDEO_AVC_CPD), nal_length_size=2)
+    # avcC payload byte 4 low 2 bits = lengthSizeMinusOne.
+    assert avcc[12] & 0x03 == 1
+
+
+def test_parse_hevc_sps_format_8bit():
+    sps = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))[1]
+    assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 0, 0)  # 4:2:0, 8-bit
+
+
+def test_hvcc_signals_10bit_from_sps():
+    sps = next(n for n in split_nal_units(bytes.fromhex(VIDEO_HEVC10_CPD)) if (n[0] >> 1) & 0x3F == 33)
+    assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 2, 2)  # 4:2:0, 10-bit
+    payload = build_hvcc(bytes.fromhex(VIDEO_HEVC10_CPD))[8:]  # strip box header
+    assert payload[16] == 0xFC | 0x01  # chromaFormat 4:2:0
+    assert payload[17] == 0xF8 | 0x02  # bitDepthLumaMinus8 = 2
+    assert payload[18] == 0xF8 | 0x02  # bitDepthChromaMinus8 = 2
+
+
+def test_ttml_init_structure():
+    init = build_init_segment(
+        stream_type="text",
+        fourcc="TTML",
+        codec_private_data="",
+        timescale=10000000,
+        language="eng",
+    )
+    assert b"stpp" in init
+    assert b"sthd" in init  # subtitle media header
+    assert b"subt" in init and b"SubtitleHandler\0" in init
+    assert b"http://www.w3.org/ns/ttml\0" in init
+
+
+def test_constant_iv_tenc_form():
+    constant_iv = bytes(range(16))
+    init = build_init_segment(
+        stream_type="video",
+        fourcc="HVC1",
+        codec_private_data=VIDEO_HEVC_CPD,
+        timescale=10000000,
+        kid=KID,
+        constant_iv=constant_iv,
+    )
+    # Constant-IV form: default_Per_Sample_IV_Size = 0, then size + IV after the KID.
+    assert KID + bytes([len(constant_iv)]) + constant_iv in init
+    tenc_at = init.index(b"tenc")
+    assert init[tenc_at + 4 + 4 + 3] == 0  # default_Per_Sample_IV_Size
+
+
+def make_fragment(senc: bytes = b"", saiz: bytes = b"") -> bytes:
+    """Build a minimal moof+mdat fragment for the IV-size tests.
+
+    The moof holds one traf whose children are a version-0, flags-0 tfhd
+    (track_ID 1 followed by four zero bytes) and then the optional, already
+    serialized ``senc`` and ``saiz`` boxes, in that order. A 4-byte zero mdat
+    follows the moof.
+    """
+    tfhd = full_box(b"tfhd", 0, 0, struct.pack(">I", 1) + b"\x00" * 4)
+    traf = box(b"traf", tfhd + senc + saiz)
+    return box(b"moof", traf) + box(b"mdat", b"\x00" * 4)
+
+
+def test_iv_size_from_piff_senc_override_flag():
+    # PIFF senc uuid with flags&1: AlgorithmID(3) + IV_size(1) + KID(16) override.
+    payload = b"\x00\x00\x00\x01" + b"\x00\x00\x01" + bytes([16]) + KID + struct.pack(">I", 0)
+    senc = box(b"uuid", PIFF_SENC_UUID + payload)
+    assert read_per_sample_iv_size(make_fragment(senc=senc)) == 16
+
+
+def test_iv_size_from_senc_payload_length():
+    # Standard senc, no subsamples: 3 samples x 8-byte IVs.
+    senc = full_box(b"senc", 0, 0, struct.pack(">I", 3) + b"\x11" * 24)
+    assert read_per_sample_iv_size(make_fragment(senc=senc)) == 8
+
+
+def test_iv_size_from_senc_with_subsamples():
+    # senc flags&2: per sample IV(8) + entry_count(2) + 6 bytes per entry.
+    sample = b"\x22" * 8 + struct.pack(">H", 1) + b"\x00" * 6
+    senc = full_box(b"senc", 0, 2, struct.pack(">I", 2) + sample * 2)
+    assert read_per_sample_iv_size(make_fragment(senc=senc)) == 8
+
+
+def test_iv_size_from_saiz_fallback():
+    saiz = full_box(b"saiz", 0, 0, bytes([16]) + struct.pack(">I", 5))
+    assert read_per_sample_iv_size(make_fragment(saiz=saiz)) == 16
+
+
+def test_iv_size_undetermined_returns_none():
+    assert read_per_sample_iv_size(make_fragment()) is None
+
+
+def test_hvcc_embeds_vps_sps_pps():
+    hvcc = build_hvcc(bytes.fromhex(VIDEO_HEVC_CPD))
+    nals = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))
+    # Each original NAL unit (VPS/SPS/PPS) is embedded verbatim in the arrays.
+    for nal in nals:
+        assert nal in hvcc
+
+
+def test_avcc_requires_sps_and_pps():
+    with pytest.raises(ValueError):
+        build_avcc(b"\x00\x00\x00\x01\x67only_sps")
+
+
+def test_read_track_id_from_fragment():
+    # Minimal moof/traf/tfhd carrying track_ID = 7.
+    tfhd = full_box("tfhd".encode(), 0, 0, struct.pack(">I", 7) + b"\x00" * 4)
+    traf = box(b"traf", tfhd)
+    moof = box(b"moof", traf)
+    mdat = box(b"mdat", b"\x00\x00")
+    assert read_track_id(moof + mdat) == 7
+
+
+def test_read_track_id_missing_returns_none():
+    assert read_track_id(box(b"mdat", b"\x00\x00")) is None
+
+
+def test_remove_emulation_prevention():
+    # 00 00 03 XX -> the 0x03 emulation byte is dropped.
+    assert remove_emulation_prevention(b"\x00\x00\x03\x01") == b"\x00\x00\x01"
+    assert remove_emulation_prevention(b"\x00\x00\x03\x00\x00\x03\x96") == b"\x00\x00\x00\x00\x96"
+    # The byte after a consumed escape is data, even another 0x03.
+    assert remove_emulation_prevention(b"\x00\x00\x03\x03") == b"\x00\x00\x03"
+    assert remove_emulation_prevention(b"\x00\x00\x03\x03\x00\x00\x03\x01") == b"\x00\x00\x03\x00\x00\x01"
+
+
+def test_two_letter_or_uppercase_language_falls_back_to_und():
+    # mdhd packs three a-z letters; "en"/"ENG" must not crash struct.pack.
+    for lang in ("en", "ENG", "", "e1x"):
+        init = build_init_segment(
+            stream_type="audio",
+            fourcc="AACL",
+            codec_private_data=AAC_LC_CPD,
+            timescale=10000000,
+            language=lang,
+        )
+        assert init[4:8] == b"ftyp"
+
+
+def test_high_sampling_rate_does_not_overflow():
+    # 96 kHz exceeds the 16.16 integer field; written as 0 like ffmpeg does.
+    init = build_init_segment(
+        stream_type="audio",
+        fourcc="AACL",
+        codec_private_data="",
+        timescale=10000000,
+        sampling_rate=96000,
+    )
+    assert b"mp4a" in init
+
+
+def test_read_track_id_truncated_tfhd_returns_none():
+    tfhd = full_box(b"tfhd", 0, 0, b"\x00\x00")  # too short for a track_ID
+    fragment = box(b"moof", box(b"traf", tfhd))
+    assert read_track_id(fragment) is None
+
+
+def test_hvcc_profile_tier_level_is_nonzero():
+    # De-emulated PTL must yield real profile/level, not the off-by-one garbage.
+    hvcc = build_hvcc(bytes.fromhex(VIDEO_HEVC_CPD))
+    payload = hvcc[8:]  # strip box header
+    profile_idc = payload[1] & 0x1F
+    level_idc = payload[12]
+    assert profile_idc != 0
+    assert level_idc != 0
--- a/unshackle/core/manifests/ism.py
+++ b/unshackle/core/manifests/ism.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import base64
 import hashlib
 import html
+import struct
 import urllib.parse
 from functools import partial
 from pathlib import Path
@@ -18,6 +19,7 @@ from requests import Session
 from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
 from unshackle.core.drm import DRM_T, PlayReady, Widevine
 from unshackle.core.events import events
+from unshackle.core.manifests.ism_init import build_init_segment, read_per_sample_iv_size, read_track_id
 from unshackle.core.session import RnetSession
 from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video
 from unshackle.core.utilities import log_event, try_ensure_utf8
@@ -85,6 +87,104 @@ class ISM:
                drm.append(PlayReady(pssh=pr_pssh, pssh_b64=data))
        return drm

+    @staticmethod
+    def _init_segment(
+        track: AnyTrack, session_drm: Optional[DRM_T], first_segment: Optional[bytes] = None
+    ) -> Optional[bytes]:
+        """Build the ftyp+moov init segment for an ISM track, or return None.
+
+        Smooth fragments are moof+mdat only; the init box is rebuilt from the
+        manifest CodecPrivateData (and KID, when encrypted) so the merged file
+        is a valid MP4 that shaka/mp4decrypt can parse.
+
+        Parameters:
+            track: Track whose ``data["ism"]`` dict carries ``stream_index``,
+                ``quality_level`` and ``manifest``.
+            session_drm: DRM object for the track; the first entry of its
+                ``kids`` attribute, when present, is used as the KID.
+            first_segment: Bytes of the first fragment, used to read the
+                track_ID and the per-sample IV size.
+
+        Returns:
+            The init segment bytes, or None when the track has no ISM data, is
+            a subtitle other than fTTML, or synthesis failed (a warning is
+            logged). The caller only prepends a truthy result, so None means
+            the fragments are concatenated as-is.
+        """
+        ism = track.data.get("ism") if isinstance(getattr(track, "data", None), dict) else None
+        if not ism:
+            return None
+        stream_index = ism.get("stream_index")
+        quality_level = ism.get("quality_level")
+        manifest = ism.get("manifest")
+        if stream_index is None or quality_level is None:
+            return None
+        # CodecPrivateData may legitimately be empty (AAC config is synthesized,
+        # EC-3 decoders sync from the frames); the builder handles each case.
+        cpd = quality_level.get("CodecPrivateData") or ""
+        fourcc = quality_level.get("FourCC") or ""
+
+        # mdhd needs a 3-letter ISO-639-2 code; manifests often carry 2-letter tags.
+        lang_attr = (stream_index.get("Language") or "").strip()
+        language = "und"
+        if lang_attr and tag_is_valid(lang_attr):
+            try:
+                language = Language.get(lang_attr).to_alpha3()
+            except LookupError:
+                language = "und"
+
+        kid: Optional[bytes] = None
+        if session_drm is not None:
+            kid_uuid = next(iter(getattr(session_drm, "kids", None) or []), None)
+            if kid_uuid is not None:
+                kid = bytes.fromhex(kid_uuid.hex)
+
+        try:
+            # Numeric manifest attributes and the fragment probes are parsed inside
+            # the soft-fail block: a non-numeric attribute or a truncated/corrupt
+            # first fragment must degrade to raw concatenation, not abort the download.
+            root_timescale = manifest.get("TimeScale") if manifest is not None else None
+            timescale = int(stream_index.get("TimeScale") or root_timescale or 10000000)
+            duration = int((manifest.get("Duration") if manifest is not None else 0) or 0)
+
+            # Match the moov track_ID to the fragment's tfhd, else the muxer drops samples.
+            track_id = (read_track_id(first_segment) if first_segment else None) or 1
+            # NALUnitLengthField: bytes per NAL length prefix, default 4.
+            nal_length_size = int(
+                quality_level.get("NALUnitLengthField") or stream_index.get("NALUnitLengthField") or 4
+            )
+            # Per-sample IV size derived from the fragment senc/saiz (PIFF default 8).
+            iv_size = (read_per_sample_iv_size(first_segment) if first_segment and kid else None) or 8
+
+            if isinstance(track, Subtitle):
+                if track.codec != Subtitle.Codec.fTTML:
+                    return None  # plain-text subtitle formats concatenate fine
+                return build_init_segment(
+                    stream_type="text",
+                    fourcc="TTML",
+                    codec_private_data="",
+                    timescale=timescale,
+                    duration=duration,
+                    language=language,
+                    track_id=track_id,
+                )
+            if isinstance(track, Video):
+                return build_init_segment(
+                    stream_type="video",
+                    fourcc=fourcc,
+                    codec_private_data=cpd,
+                    timescale=timescale,
+                    duration=duration,
+                    language=language,
+                    width=int(quality_level.get("MaxWidth") or stream_index.get("MaxWidth") or 0),
+                    height=int(quality_level.get("MaxHeight") or stream_index.get("MaxHeight") or 0),
+                    track_id=track_id,
+                    nal_length_size=nal_length_size,
+                    kid=kid,
+                    iv_size=iv_size,
+                )
+            return build_init_segment(
+                stream_type="audio",
+                fourcc=fourcc,
+                codec_private_data=cpd,
+                timescale=timescale,
+                duration=duration,
+                language=language,
+                channels=int(quality_level.get("Channels") or 2),
+                bits_per_sample=int(quality_level.get("BitsPerSample") or 16),
+                sampling_rate=int(quality_level.get("SamplingRate") or 48000),
+                track_id=track_id,
+                kid=kid,
+                iv_size=iv_size,
+            )
+        except (NotImplementedError, ValueError, IndexError, struct.error) as e:
+            # Unsupported codec, malformed CodecPrivateData, truncated bitstream or
+            # fragment, or out-of-range field — fall back to raw concatenation
+            # rather than aborting the download.
+            log_event(
+                "manifest_ism_init_unsupported",
+                level="WARNING",
+                message=f"Could not synthesize ISM init segment ({fourcc}): {e}",
+                context={"track_id": getattr(track, "id", None), "fourcc": fourcc},
+            )
+            return None
+
    def to_tracks(self, language: Optional[Union[str, Language]] = None) -> Tracks:
        tracks = Tracks()
        base_url = self.url
@@ -383,8 +483,13 @@ class ISM:
            raise FileNotFoundError(error_msg)

        with open(save_path, "wb") as f:
-            for segment_file in segments_to_merge:
-                segment_data = segment_file.read_bytes()
+            first_segment = segments_to_merge[0].read_bytes() if segments_to_merge else None
+            init_segment = ISM._init_segment(track, session_drm, first_segment)
+            if init_segment:
+                f.write(init_segment)
+            for index, segment_file in enumerate(segments_to_merge):
+                # First segment was already read for the init synthesis — reuse it.
+                segment_data = first_segment if index == 0 and first_segment else segment_file.read_bytes()
                if (
                    not session_drm
                    and isinstance(track, Subtitle)
--- a/unshackle/core/manifests/ism_init.py
+++ b/unshackle/core/manifests/ism_init.py
@@ -0,0 +1,622 @@
+"""
+Synthesize an ISO-BMFF initialization segment (ftyp + moov) for ISM / Smooth
+Streaming tracks.
+
+Smooth Streaming fragments are bare ``moof`` + ``mdat`` pairs; the server never
+sends a ``moov``. The init box must be reconstructed from the manifest's
+``CodecPrivateData`` (and, for protected content, the track KID) before a muxer
+or decryptor such as shaka-packager can parse the stream. Ported from yt-dlp's
+``write_piff_header`` and N_m3u8DL-RE's ``MSSMoovProcessor`` with HEVC, Dolby
+Vision, EC-3, TTML and CENC (PIFF) support.
+"""
+
+from __future__ import annotations
+
+import binascii
+import struct
+from typing import Iterator, Optional
+
+# Big-endian field packers (named for the bit widths they encode).
+u8 = struct.Struct(">B")
+u16 = struct.Struct(">H")
+u32 = struct.Struct(">I")
+u64 = struct.Struct(">Q")
+s16 = struct.Struct(">h")
+s88 = struct.Struct(">bx")  # 8.8 fixed-point
+s1616 = struct.Struct(">hxx")  # 16.16 fixed-point
+u1616 = struct.Struct(">Hxx")
+s32 = struct.Struct(">i")
+
+# 3x3 transformation matrix (identity), as stored in tkhd/mvhd. The field is
+# exactly nine 32-bit values (36 bytes) in row order {a, b, u, c, d, v, x, y, w};
+# any extra entry shifts every field that follows the matrix in the box.
+UNITY_MATRIX = (
+    s32.pack(0x10000) + s32.pack(0) * 2  # a = 1.0 (16.16), b, u
+    + s32.pack(0) + s32.pack(0x10000) + s32.pack(0)  # c, d = 1.0 (16.16), v
+    + s32.pack(0) * 2 + s32.pack(0x40000000)  # x, y, w = 1.0 (2.30)
+)
+
+TRACK_ENABLED = 0x1
+TRACK_IN_MOVIE = 0x2
+TRACK_IN_PREVIEW = 0x4
+SELF_CONTAINED = 0x1
+
+# Fixed creation/modification time — deterministic output (no wall clock).
+EPOCH = 0
+
+NAL_START_CODE = b"\x00\x00\x00\x01"
+
+# WAVEFORMATEXTENSIBLE SubFormat GUID for Dolby Digital Plus, as serialized
+# (little-endian) inside Smooth EC-3 CodecPrivateData.
+DOLBY_DIGITAL_PLUS_GUID = bytes.fromhex("AF87FBA7022DFB42A4D405CD93843BDD")
+
+# PIFF SampleEncryptionBox usertype (the pre-CENC 'senc' carried as a uuid box).
+PIFF_SENC_UUID = bytes.fromhex("A2394F525A9B4F14A2446C427C648DF4")
+
+TTML_NAMESPACE = b"http://www.w3.org/ns/ttml\0"
+
+# ISO/IEC 14496-3 samplingFrequencyIndex table for AudioSpecificConfig.
+AAC_SAMPLING_FREQUENCY_INDEX = {
+    96000: 0x0,
+    88200: 0x1,
+    64000: 0x2,
+    48000: 0x3,
+    44100: 0x4,
+    32000: 0x5,
+    24000: 0x6,
+    22050: 0x7,
+    16000: 0x8,
+    12000: 0x9,
+    11025: 0xA,
+    8000: 0xB,
+    7350: 0xC,
+}
+
+
+def box(box_type: bytes, payload: bytes) -> bytes:
+    """Frame payload as a plain ISO-BMFF box: 32-bit size, fourcc, then the body."""
+    total_size = len(payload) + 8  # the 8-byte header counts towards the size
+    header = u32.pack(total_size) + box_type
+    return header + payload
+
+
+def full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
+    """Frame payload as a FullBox: a box whose body starts with version + 24-bit flags."""
+    version_byte = u8.pack(version)
+    flag_bytes = u32.pack(flags)[1:]  # keep only the low three bytes
+    return box(box_type, version_byte + flag_bytes + payload)
+
+
+def split_nal_units(codec_private_data: bytes) -> list[bytes]:
+    """Split Annex B CodecPrivateData into its NAL units (drops the start codes).
+
+    Accepts both the 4-byte (00 00 00 01) and 3-byte (00 00 01) start-code
+    forms, plus any zero padding around them. Splitting on the 3-byte prefix
+    is safe because emulation prevention keeps 00 00 01 out of NAL payloads,
+    and a NAL unit never ends in 0x00, so zero bytes left at the end of a
+    chunk are start-code/padding bytes rather than payload.
+
+    Returns the non-empty NAL units in their original order.
+    """
+    units = []
+    for chunk in codec_private_data.split(b"\x00\x00\x01"):
+        nal = chunk.rstrip(b"\x00")
+        if nal:
+            units.append(nal)
+    return units
+
+
+def remove_emulation_prevention(data: bytes) -> bytes:
+    """Drop the 0x03 of every 00 00 03 escape found in a left-to-right scan.
+
+    Each search resumes right after the removed 0x03, so the byte that follows
+    an escape is always kept as data (even if it is another 0x03) and the two
+    zero bytes of one escape are never reused as the prefix of the next.
+    """
+    escape = b"\x00\x00\x03"
+    cleaned = bytearray()
+    cursor = 0
+    while True:
+        hit = data.find(escape, cursor)
+        if hit < 0:
+            # No further escapes: the remainder is copied through untouched.
+            cleaned += data[cursor:]
+            return bytes(cleaned)
+        cleaned += data[cursor : hit + 2]  # everything up to and including 00 00
+        cursor = hit + 3  # step over the 0x03
+
+
+class _BitstreamExhausted(ValueError, IndexError):
+    """Raised when a read would run past the end of the buffer.
+
+    It is a ValueError so handlers that treat malformed CodecPrivateData as a
+    ValueError also cover truncated headers, and an IndexError so code written
+    against the earlier out-of-range failure keeps working.
+    """
+
+
+class BitReader:
+    """MSB-first bit reader with the exp-Golomb decode H.26x headers need."""
+
+    def __init__(self, data: bytes) -> None:
+        self.data = data
+        self.pos = 0  # absolute bit offset of the next unread bit
+
+    def read_bits(self, count: int) -> int:
+        """Read ``count`` bits as an unsigned integer and advance the cursor.
+
+        Raises a ValueError (that is also an IndexError) when fewer than
+        ``count`` bits remain; the cursor is left untouched in that case.
+        """
+        if self.pos + count > len(self.data) * 8:
+            raise _BitstreamExhausted("Bitstream truncated: not enough bits left to read")
+        value = 0
+        for _ in range(count):
+            byte = self.data[self.pos >> 3]
+            value = (value << 1) | ((byte >> (7 - (self.pos & 7))) & 1)
+            self.pos += 1
+        return value
+
+    def read_ue(self) -> int:
+        """Decode one unsigned exp-Golomb value: N zero bits, a one bit, then N suffix bits.
+
+        Raises ValueError when the zero prefix exceeds 32 bits or the data ends first.
+        """
+        zeros = 0
+        while self.read_bits(1) == 0:
+            zeros += 1
+            if zeros > 32:
+                raise ValueError("Invalid exp-Golomb code")
+        return (1 << zeros) - 1 + (self.read_bits(zeros) if zeros else 0)
+
+
+def parse_hevc_sps_format(sps_rbsp: bytes) -> tuple[int, int, int]:
+    """
+    Parse (chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8)
+    from a de-emulated HEVC SPS RBSP (including its 2-byte NAL header).
+
+    The fields are read strictly in bitstream order; everything ahead of the
+    three returned values is consumed only to reach them. Parsing stops once
+    bit_depth_chroma_minus8 has been read.
+
+    Errors raised by BitReader (input too short, or an invalid exp-Golomb
+    code) propagate to the caller unchanged.
+    """
+    r = BitReader(sps_rbsp)
+    r.read_bits(16)  # NAL unit header
+    r.read_bits(4)  # sps_video_parameter_set_id
+    max_sub_layers_minus1 = r.read_bits(3)
+    r.read_bits(1)  # sps_temporal_id_nesting_flag
+    r.read_bits(96)  # general profile_tier_level (12 bytes)
+    # One (profile_present, level_present) flag pair per sub-layer; they decide
+    # how many sub-layer bits must be skipped further down.
+    profile_present = []
+    level_present = []
+    for _ in range(max_sub_layers_minus1):
+        profile_present.append(r.read_bits(1))
+        level_present.append(r.read_bits(1))
+    if max_sub_layers_minus1 > 0:
+        # Two reserved bits for each of the remaining slots up to 8.
+        r.read_bits((8 - max_sub_layers_minus1) * 2)  # reserved_zero_2bits
+    for i in range(max_sub_layers_minus1):
+        if profile_present[i]:
+            r.read_bits(88)  # sub_layer profile_tier
+        if level_present[i]:
+            r.read_bits(8)  # sub_layer_level_idc
+    r.read_ue()  # sps_seq_parameter_set_id
+    chroma_format_idc = r.read_ue()
+    if chroma_format_idc == 3:
+        r.read_bits(1)  # separate_colour_plane_flag
+    r.read_ue()  # pic_width_in_luma_samples
+    r.read_ue()  # pic_height_in_luma_samples
+    if r.read_bits(1):  # conformance_window_flag
+        # Four exp-Golomb values follow the flag when it is set; skip them.
+        for _ in range(4):
+            r.read_ue()
+    bit_depth_luma_minus8 = r.read_ue()
+    bit_depth_chroma_minus8 = r.read_ue()
+    return chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8
+
+
+def iter_boxes(data: bytes, start: int, end: int) -> Iterator[tuple[bytes, Optional[bytes], int, int]]:
+    """Yield (type, uuid_usertype, payload_start, box_end) for each child box.
+
+    Walks the boxes laid out back to back in ``data[start:end]``. A 32-bit
+    size of 1 means a 64-bit size follows the type; a size of 0 means the box
+    runs to ``end``. For ``uuid`` boxes the 16-byte usertype is returned and
+    skipped, so payload_start points past it.
+
+    Iteration stops quietly on a corrupt or truncated header. The reported
+    box_end is clamped to ``end`` (and the buffer length), so callers can use
+    it as a read bound even when a box declares more bytes than are present.
+    """
+    end = min(end, len(data))  # a parent's declared size may exceed the buffer
+    offset = start
+    while offset + 8 <= end:
+        size = struct.unpack_from(">I", data, offset)[0]
+        box_type = data[offset + 4 : offset + 8]
+        header = 8
+        if size == 1:
+            if offset + 16 > end:  # 64-bit size field itself is truncated
+                return
+            size = struct.unpack_from(">Q", data, offset + 8)[0]
+            header = 16
+        if size == 0:
+            size = end - offset
+        if size < header:  # corrupt box header; stop rather than loop forever
+            return
+        box_end = min(offset + size, end)
+        usertype = None
+        if box_type == b"uuid" and offset + header + 16 <= box_end:
+            usertype = data[offset + header : offset + header + 16]
+            header += 16
+        yield box_type, usertype, offset + header, box_end
+        offset += size
+
+
+def find_box(data: bytes, start: int, end: int, target: bytes) -> Optional[tuple[int, int]]:
+    """Return (payload_start, end) of the first child box of type ``target``, else None."""
+    matches = (
+        (body, box_end)
+        for kind, _usertype, body, box_end in iter_boxes(data, start, end)
+        if kind == target
+    )
+    return next(matches, None)
+
+
+def read_track_id(fragment: bytes) -> Optional[int]:
+    """Read the track_ID from a fragment's moof/traf/tfhd box, if present.
+
+    Smooth fragments declare their own track_ID; the synthesized moov must use
+    the same value or the muxer cannot associate samples with the track. The
+    track_ID sits before any tfhd optional fields, so the flags don't matter.
+
+    Returns None when the moof, traf or tfhd box is missing, or when the tfhd
+    is too short to hold a track_ID.
+    """
+    moof = find_box(fragment, 0, len(fragment), b"moof")
+    if not moof:
+        return None
+    traf = find_box(fragment, *moof, b"traf")
+    if not traf:
+        return None
+    tfhd = find_box(fragment, *traf, b"tfhd")
+    if not tfhd:
+        return None
+    body, box_end = tfhd
+    # Bound the read by the tfhd box itself, not just the fragment: a short
+    # tfhd followed by other boxes would otherwise return the next box's
+    # header bytes as the track_ID.
+    if box_end - body < 8 or body + 8 > len(fragment):  # truncated tfhd
+        return None
+    # tfhd payload: version(1) + flags(3) + track_ID(4)
+    return struct.unpack(">I", fragment[body + 4 : body + 8])[0]
+
+
+def read_per_sample_iv_size(fragment: bytes) -> Optional[int]:
+    """
+    Derive the per-sample IV size (8 or 16) from a fragment's sample-encryption
+    metadata, for the synthesized tenc default_Per_Sample_IV_Size.
+
+    Checks, in order: the PIFF 'senc' uuid override flag (explicit IV size,
+    returned as stored), the senc payload length (sample_count vs IV/subsample
+    entries), and the saiz default_sample_info_size (only unambiguous without
+    subsamples).
+
+    Returns None when no moof/traf is found or none of the checks is
+    conclusive. Every read is bounded by the real buffer length, so a
+    truncated fragment yields None (or the saiz fallback) instead of raising.
+    """
+    moof = find_box(fragment, 0, len(fragment), b"moof")
+    if not moof:
+        return None
+    traf = find_box(fragment, *moof, b"traf")
+    if not traf:
+        return None
+
+    available = len(fragment)
+    senc: Optional[tuple[int, int]] = None
+    saiz_default: Optional[int] = None
+    senc_has_subsamples = False
+    for box_type, usertype, body, box_end in iter_boxes(fragment, *traf):
+        if box_type == b"senc" or (box_type == b"uuid" and usertype == PIFF_SENC_UUID):
+            senc = (body, box_end)
+        elif box_type == b"saiz":
+            flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
+            pos = body + 4 + (8 if flags & 0x1 else 0)  # skip aux_info_type fields
+            # box_end is the declared end and may exceed the buffer.
+            if pos < min(box_end, available):
+                saiz_default = fragment[pos]
+
+    if senc:
+        body, box_end = senc
+        readable_end = min(box_end, available)  # last offset that can really be read
+        flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
+        senc_has_subsamples = bool(flags & 0x2)
+        pos = body + 4
+        if flags & 0x1:  # PIFF override: AlgorithmID(3) + IV_size(1) + KID(16)
+            if pos + 4 <= readable_end:
+                return fragment[pos + 3]
+            # Override header truncated: fall through to the saiz fallback.
+        elif pos + 4 <= readable_end:
+            sample_count = struct.unpack(">I", fragment[pos : pos + 4])[0]
+            pos += 4
+            if sample_count:
+                if not senc_has_subsamples:
+                    # Pure IV table: the declared payload divides evenly by the IV size.
+                    size, rem = divmod(box_end - pos, sample_count)
+                    if rem == 0 and size in (8, 16):
+                        return size
+                else:
+                    # Walk the entries with each candidate IV size; the one that
+                    # lands exactly on the box end is correct.
+                    for iv_size in (8, 16):
+                        cursor = pos
+                        for _ in range(sample_count):
+                            cursor += iv_size
+                            if cursor + 2 > readable_end:
+                                cursor = -1
+                                break
+                            entries = struct.unpack(">H", fragment[cursor : cursor + 2])[0]
+                            cursor += 2 + 6 * entries
+                            if cursor > box_end:
+                                cursor = -1
+                                break
+                        if cursor == box_end:
+                            return iv_size
+
+    if not senc_has_subsamples and saiz_default in (8, 16):
+        return saiz_default
+    return None
+
+
+def build_avcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
+    """Build an avcC (AVC decoder config) box from SPS+PPS CodecPrivateData.
+
+    codec_private_data is split with split_nal_units; every SPS (NAL type 7)
+    and PPS (NAL type 8) found is written to the record, and the profile /
+    compatibility / level bytes are copied from the first SPS.
+    nal_length_size is the number of bytes per NAL length prefix.
+
+    Raises ValueError when nal_length_size is outside 1..4, when no SPS or no
+    PPS is present, or when the first SPS is too short to carry the
+    profile/level bytes.
+    """
+    if not 1 <= nal_length_size <= 4:
+        # lengthSizeMinusOne is a 2-bit field; anything else would spill into
+        # the reserved bits or fail to pack.
+        raise ValueError(f"Unsupported AVC NAL length size: {nal_length_size}")
+    # Zero-length units have no header byte to classify, so drop them first.
+    nals = [nal for nal in split_nal_units(codec_private_data) if nal]
+    # Pick parameter sets by H.264 NAL type (low 5 bits): 7 = SPS, 8 = PPS.
+    # Manifests do not guarantee SPS-first ordering.
+    sps_units = [nal for nal in nals if nal[0] & 0x1F == 7][:31]  # 5-bit count field
+    pps_units = [nal for nal in nals if nal[0] & 0x1F == 8][:255]  # 8-bit count field
+    if not sps_units or not pps_units:
+        raise ValueError("AVC CodecPrivateData must contain SPS and PPS NAL units")
+    first_sps = sps_units[0]
+    if len(first_sps) < 4:
+        raise ValueError("AVC SPS NAL unit is too short to carry profile and level")
+
+    parts = [
+        u8.pack(1),  # configuration version
+        first_sps[1:4],  # profile / compat / level (from SPS NAL body)
+        u8.pack(0xFC | (nal_length_size - 1)),  # reserved + length size minus one
+        u8.pack(0xE0 | len(sps_units)),  # reserved + number of SPS
+    ]
+    parts.extend(u16.pack(len(unit)) + unit for unit in sps_units)
+    parts.append(u8.pack(len(pps_units)))  # number of PPS
+    parts.extend(u16.pack(len(unit)) + unit for unit in pps_units)
+    return box(b"avcC", b"".join(parts))
+
+
+def build_hvcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
+    """
+    Build an hvcC (HEVC decoder config) box from VPS+SPS+PPS CodecPrivateData.
+
+    Profile/tier/level bytes are lifted from the SPS profile_tier_level; chroma
+    format and bit depths are parsed from the SPS so 10-bit/HDR streams signal
+    correctly (falls back to 8-bit 4:2:0 on malformed SPS data). When no SPS
+    is present, or it is too short, the profile/tier/level bytes are written
+    as zeros.
+
+    Raises ValueError when fewer than three non-empty NAL units are found.
+    """
+    # Zero-length units have no NAL header to classify, so drop them first.
+    nals = [nal for nal in split_nal_units(codec_private_data) if nal]
+    if len(nals) < 3:
+        raise ValueError("HEVC CodecPrivateData must contain VPS, SPS and PPS NAL units")
+
+    # Group NAL units by type (HEVC NAL type = (first byte >> 1) & 0x3F).
+    by_type: dict[int, list[bytes]] = {}
+    for nal in nals:
+        by_type.setdefault((nal[0] >> 1) & 0x3F, []).append(nal)
+
+    sps = by_type.get(33, [b""])[0]
+    # profile_tier_level must be read from the de-emulated SPS RBSP, after the
+    # 2-byte NAL header + 1 byte (sps_video_parameter_set_id(4) +
+    # sps_max_sub_layers_minus1(3) + sps_temporal_id_nesting_flag(1)). PTL is 12
+    # bytes: profile byte(1) + compat flags(4) + constraint flags(6) + level(1),
+    # which is exactly the order hvcC stores them in, so it is copied whole.
+    sps_rbsp = remove_emulation_prevention(sps)
+    ptl = bytes(sps_rbsp[3:15]) if len(sps_rbsp) >= 15 else b"\x00" * 12
+
+    try:
+        chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8 = parse_hevc_sps_format(sps_rbsp)
+    except (IndexError, ValueError):
+        chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8 = 1, 0, 0
+
+    payload = u8.pack(1)  # configurationVersion
+    payload += ptl  # profile space/tier/idc + compat flags + constraint flags + level
+    payload += u16.pack(0xF000)  # reserved(4) + min_spatial_segmentation_idc(12)
+    payload += u8.pack(0xFC)  # reserved(6) + parallelismType(2)
+    payload += u8.pack(0xFC | (chroma_format_idc & 0x03))  # reserved(6) + chromaFormat(2)
+    payload += u8.pack(0xF8 | (bit_depth_luma_minus8 & 0x07))  # reserved(5) + bitDepthLumaMinus8(3)
+    payload += u8.pack(0xF8 | (bit_depth_chroma_minus8 & 0x07))  # reserved(5) + bitDepthChromaMinus8(3)
+    payload += u16.pack(0)  # avgFrameRate
+    # constantFrameRate(2)+numTemporalLayers(3)+temporalIdNested(1)+lengthSizeMinusOne(2)
+    payload += u8.pack((nal_length_size - 1) & 0x03)
+
+    arrays = bytearray()
+    num_arrays = 0
+    for nal_type in (32, 33, 34):  # VPS, SPS, PPS
+        units = by_type.get(nal_type)
+        if not units:
+            continue
+        num_arrays += 1
+        arrays += u8.pack(0x80 | nal_type)  # array_completeness(1)+reserved(1)+NAL type(6)
+        arrays += u16.pack(len(units))
+        for unit in units:
+            arrays += u16.pack(len(unit)) + unit
+    payload += u8.pack(num_arrays) + bytes(arrays)
+    return box(b"hvcC", payload)
+
+
+def _esds_descriptor(tag: int, payload: bytes) -> bytes:
+    """Serialize one MPEG-4 descriptor: tag byte, expandable length, payload.
+
+    The length is written 7 bits per byte, most significant group first, with
+    the high bit set on every byte except the last. Payloads under 128 bytes
+    therefore still get the single length byte the box used before.
+    """
+    size = len(payload)
+    if size >= 1 << 28:
+        raise ValueError("MPEG-4 descriptor payload too large")
+    length = bytearray((size & 0x7F,))
+    size >>= 7
+    while size:
+        length.insert(0, 0x80 | (size & 0x7F))
+        size >>= 7
+    return bytes((tag,)) + bytes(length) + payload
+
+
+def build_esds(codec_private_data: bytes) -> bytes:
+    """Build an esds box wrapping the AAC AudioSpecificConfig.
+
+    codec_private_data is embedded verbatim as the DecoderSpecificInfo. The
+    descriptor nesting is ES_Descriptor(DecoderConfigDescriptor(
+    DecoderSpecificInfo), SLConfigDescriptor), returned inside a version-0
+    esds full box.
+    """
+    # DecoderSpecificInfo (tag 0x05)
+    dsi = _esds_descriptor(0x05, codec_private_data)
+    # DecoderConfigDescriptor (tag 0x04): objectType=0x40 (AAC), stream type audio
+    dcd = _esds_descriptor(
+        0x04,
+        u8.pack(0x40)  # object type indication = MPEG-4 AAC
+        + u8.pack(0x15)  # stream type (audio) << 2 | upstream | reserved
+        + b"\x00\x00\x00"  # buffer size
+        + u32.pack(0)  # max bitrate
+        + u32.pack(0)  # avg bitrate
+        + dsi,
+    )
+    # SLConfigDescriptor (tag 0x06)
+    sl = _esds_descriptor(0x06, u8.pack(0x02))
+    # ES_Descriptor (tag 0x03): ES_ID(2) + flags(1) precede the nested descriptors
+    es = _esds_descriptor(0x03, u16.pack(0) + u8.pack(0) + dcd + sl)
+    return full_box(b"esds", 0, 0, es)
+
+
+def build_dec3(codec_private_data: bytes) -> Optional[bytes]:
+    """Wrap the EC-3 specific payload of Smooth CodecPrivateData in a dec3 box.
+
+    Smooth EC-3 CodecPrivateData ([MS-SSTR] AudioTag 65534) serializes a
+    WAVEFORMATEXTENSIBLE, either in full or only its extension part, with the
+    raw dec3 payload (ETSI TS 102 366 F.6) placed after the DD+ SubFormat
+    GUID. The GUID is located by search, and everything following its 16
+    bytes becomes the box payload.
+
+    Returns None when the GUID is missing or nothing follows it; decoders
+    still sync from the EC-3 frames in mdat in that case.
+    """
+    marker = codec_private_data.find(DOLBY_DIGITAL_PLUS_GUID)
+    if marker == -1:
+        return None
+    trailing = codec_private_data[marker + 16 :]
+    if not trailing:
+        return None
+    return box(b"dec3", trailing)
+
+
+def synthesize_aac_codec_private_data(fourcc: str, sampling_rate: int, channels: int) -> bytes:
+    """Generate the AAC AudioSpecificConfig when the manifest omits it.
+
+    AACL -> AAC-LC config (2 bytes for a table sampling rate); AACH -> HE-AAC
+    (SBR, AOT 5) config (4 bytes for table rates) with the extension sampling
+    frequency at twice the core rate. Any other fourcc is treated like AACL.
+
+    A sampling rate missing from AAC_SAMPLING_FREQUENCY_INDEX is written with
+    the escape index 0xF followed by the explicit 24-bit rate, which makes the
+    config longer; a rate that cannot be expressed that way keeps the legacy
+    index-0 fallback. A channel count of 8 is written as channelConfiguration
+    7 (the 7.1 layout); other counts are written as given.
+
+    Raises ValueError when the channel count does not fit the 4-bit field.
+    """
+    channel_config = 7 if channels == 8 else channels
+    if not 0 <= channel_config <= 0xF:
+        raise ValueError(f"Unsupported AAC channel count: {channels}")
+
+    # (value, bit width) pairs in bitstream order.
+    fields: list[tuple[int, int]] = []
+
+    def add_rate(rate: int) -> None:
+        index = AAC_SAMPLING_FREQUENCY_INDEX.get(rate)
+        if index is not None:
+            fields.append((index, 4))
+        elif 0 < rate < (1 << 24):
+            fields.append((0xF, 4))  # escape: explicit frequency follows
+            fields.append((rate, 24))
+        else:
+            fields.append((0x0, 4))  # legacy fallback for unusable rates
+
+    if fourcc == "AACH":
+        fields.append((0x05, 5))  # audio object type = SBR
+        add_rate(sampling_rate)
+        fields.append((channel_config, 4))
+        add_rate(sampling_rate * 2)  # extension sampling frequency
+        fields.append((0x02, 5))  # core object type = AAC LC
+    else:
+        fields.append((0x02, 5))  # audio object type = AAC LC
+        add_rate(sampling_rate)
+        fields.append((channel_config, 4))
+    fields.append((0, 3))  # trailing config flags, all zero
+
+    packed = 0
+    total_bits = 0
+    for value, width in fields:
+        packed = (packed << width) | value
+        total_bits += width
+    padding = -total_bits % 8  # alignment bits
+    packed <<= padding
+    total_bits += padding
+    return bytes((packed >> shift) & 0xFF for shift in range(total_bits - 8, -1, -8))
+
+
+def build_sinf(
+    original_format: bytes,
+    kid: bytes,
+    iv_size: int = 8,
+    constant_iv: Optional[bytes] = None,
+) -> bytes:
+    """Build a sinf protection box (frma + schm cenc + schi/tenc) for CENC.
+
+    original_format is the clear sample-entry type recorded in frma, and kid
+    is the 16-byte default_KID. iv_size is the tenc
+    default_Per_Sample_IV_Size (8 or 16). When constant_iv is given (and
+    non-empty), the per-sample IV size is 0 and the constant IV is appended
+    per ISO/IEC 23001-7 (cbcs-style constant-IV form).
+
+    Raises ValueError when kid is not exactly 16 bytes, because the tenc
+    layout has a fixed-width KID field and any other length would shift the
+    bytes that follow.
+    """
+    if len(kid) != 16:
+        raise ValueError(f"default_KID must be 16 bytes, got {len(kid)}")
+    frma = box(b"frma", original_format)
+    schm = full_box(b"schm", 0, 0, b"cenc" + u32.pack(0x00010000))
+    tenc_payload = (
+        u8.pack(0)  # reserved
+        + u8.pack(0)  # default_crypt_byte_block / skip_byte_block (cenc)
+        + u8.pack(1)  # default_isProtected
+        + u8.pack(0 if constant_iv else iv_size)  # default_Per_Sample_IV_Size
+        + kid  # default_KID (16 bytes)
+    )
+    if constant_iv:
+        tenc_payload += u8.pack(len(constant_iv)) + constant_iv
+    schi = box(b"schi", full_box(b"tenc", 0, 0, tenc_payload))
+    return box(b"sinf", frma + schm + schi)
+
+
+def build_init_segment(
+    *,
+    stream_type: str,
+    fourcc: str,
+    codec_private_data: str,
+    timescale: int = 10000000,
+    duration: int = 0,
+    language: str = "und",
+    width: int = 0,
+    height: int = 0,
+    channels: int = 2,
+    bits_per_sample: int = 16,
+    sampling_rate: int = 48000,
+    track_id: int = 1,
+    nal_length_size: int = 4,
+    kid: Optional[bytes] = None,
+    iv_size: int = 8,
+    constant_iv: Optional[bytes] = None,
+) -> bytes:
+    """
+    Assemble an ftyp + moov initialization segment for a single track.
+
+    stream_type: "video" | "audio" | "text".
+    fourcc: Smooth FourCC, compared case-insensitively ("H264"/"AVC1"/"DAVC",
+            "HVC1"/"HEV1"/"HEVC"/"H265", "DVHE"/"DVH1", "AACL"/"AACH"/"AAC",
+            "EC-3", "TTML"/"STPP"/"DFXP").
+    codec_private_data: hex string from the manifest QualityLevel (may be empty).
+    language: three a-z letters after lowercasing; anything else becomes "und".
+    nal_length_size: manifest NALUnitLengthField (bytes per NAL length prefix).
+    kid: 16-byte default key id; when set, video and audio sample entries are
+         wrapped for CENC (encv/enca + sinf). Text entries are never wrapped.
+    iv_size / constant_iv: tenc IV form (see build_sinf).
+
+    Raises ValueError for an unknown stream_type and NotImplementedError for
+    a FourCC that has no sample-entry mapping for that stream type.
+    """
+    if stream_type not in ("video", "audio", "text"):
+        raise ValueError(f"Unsupported stream type: {stream_type}")
+    fourcc = (fourcc or "").upper()
+    cpd = binascii.unhexlify(codec_private_data) if codec_private_data else b""
+    encrypted = kid is not None
+    # mdhd packs exactly three a-z letters; anything else (2-letter tags,
+    # uppercase) would underflow the 5-bit fields, so fall back to "und".
+    lang = (language or "").lower()
+    if len(lang) != 3 or any(not ("a" <= ch <= "z") for ch in lang):
+        lang = "und"
+
+    zero16 = u16.pack(0)
+    zero32 = u32.pack(0)
+    timestamps = u64.pack(EPOCH) + u64.pack(EPOCH)  # creation + modification time
+
+    # --- ftyp ---
+    brands = b"isml" + u32.pack(1) + b"iso5" + b"iso6" + b"piff" + b"msdh"
+    ftyp = box(b"ftyp", brands)
+
+    # --- mvhd ---
+    mvhd_body = b"".join(
+        (
+            timestamps,
+            u32.pack(timescale),
+            u64.pack(duration),
+            s1616.pack(1),  # rate
+            s88.pack(1),  # volume
+            zero16,
+            zero32 * 2,
+            UNITY_MATRIX,
+            zero32 * 6,
+            u32.pack(0xFFFFFFFF),  # next track id
+        )
+    )
+    mvhd = full_box(b"mvhd", 1, 0, mvhd_body)
+
+    # --- tkhd ---
+    tkhd_body = b"".join(
+        (
+            timestamps,
+            u32.pack(track_id),
+            zero32,
+            u64.pack(duration),
+            zero32 * 2,
+            s16.pack(0),  # layer
+            s16.pack(0),  # alternate group
+            s88.pack(1 if stream_type == "audio" else 0),  # volume
+            zero16,
+            UNITY_MATRIX,
+            u1616.pack(width),
+            u1616.pack(height),
+        )
+    )
+    tkhd = full_box(b"tkhd", 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_body)
+
+    # --- mdhd + hdlr ---
+    # Three 5-bit letters, each stored as its offset from 0x60.
+    packed_lang = 0
+    for ch in lang:
+        packed_lang = (packed_lang << 5) | (ord(ch) - 0x60)
+    mdhd = full_box(
+        b"mdhd", 1, 0,
+        timestamps + u32.pack(timescale) + u64.pack(duration) + u16.pack(packed_lang) + zero16,
+    )
+    handlers = {
+        "audio": (b"soun", b"SoundHandler\0"),
+        "text": (b"subt", b"SubtitleHandler\0"),
+        "video": (b"vide", b"VideoHandler\0"),
+    }
+    handler_type, handler_name = handlers[stream_type]
+    hdlr = full_box(b"hdlr", 0, 0, zero32 + handler_type + zero32 * 3 + handler_name)
+    if stream_type == "audio":
+        media_header = full_box(b"smhd", 0, 0, s88.pack(0) + zero16)
+    elif stream_type == "text":
+        media_header = full_box(b"sthd", 0, 0, b"")
+    else:
+        media_header = full_box(b"vmhd", 0, 1, zero16 + zero16 * 3)
+
+    # --- dinf ---
+    url_entry = full_box(b"url ", 0, SELF_CONTAINED, b"")
+    dinf = box(b"dinf", full_box(b"dref", 0, 0, u32.pack(1) + url_entry))
+
+    # --- stsd sample entry ---
+    entry_body = u8.pack(0) * 6 + u16.pack(1)  # reserved + data reference index
+    protected_type: Optional[bytes] = None  # encv/enca wrapper type, if the track can be wrapped
+    if stream_type == "video":
+        entry_body += b"".join(
+            (
+                zero16,
+                zero16,
+                zero32 * 3,
+                u16.pack(width),
+                u16.pack(height),
+                u1616.pack(0x48),
+                u1616.pack(0x48),
+                zero32,
+                u16.pack(1),
+                u8.pack(0) * 32,
+                u16.pack(0x18),
+                s16.pack(-1),
+            )
+        )
+        if fourcc in ("H264", "AVC1", "DAVC"):
+            build_config, codec_fourcc = build_avcc, b"avc1"
+        elif fourcc in ("HVC1", "HEV1", "HEVC", "H265"):
+            build_config, codec_fourcc = build_hvcc, b"hvc1"
+        elif fourcc in ("DVHE", "DVH1"):
+            # Dolby Vision over HEVC: same hvcC config, dvh1 sample entry.
+            build_config, codec_fourcc = build_hvcc, b"dvh1"
+        else:
+            raise NotImplementedError(f"Unsupported video FourCC: {fourcc}")
+        entry_body += build_config(cpd, nal_length_size)
+        protected_type = b"encv"
+    elif stream_type == "audio":
+        # samplerate is 16.16 fixed-point; rates above 65535 Hz are written as 0
+        # (decoders read the real rate from the codec config), matching ffmpeg.
+        rate_field = sampling_rate if sampling_rate <= 0xFFFF else 0
+        entry_body += b"".join(
+            (
+                zero32 * 2,
+                u16.pack(channels),
+                u16.pack(bits_per_sample),
+                zero16,
+                zero16,
+                u32.pack(rate_field << 16),
+            )
+        )
+        if fourcc in ("AACL", "AACH", "AAC"):
+            # Synthesize the AudioSpecificConfig only when the manifest gave none.
+            entry_body += build_esds(cpd or synthesize_aac_codec_private_data(fourcc, sampling_rate, channels))
+            codec_fourcc = b"mp4a"
+        elif fourcc == "EC-3":
+            # dec3 is optional here: build_dec3 returns None when it finds no payload.
+            entry_body += build_dec3(cpd) or b""
+            codec_fourcc = b"ec-3"
+        else:
+            raise NotImplementedError(f"Unsupported audio FourCC: {fourcc}")
+        protected_type = b"enca"
+    else:  # text
+        if fourcc not in ("TTML", "STPP", "DFXP"):
+            raise NotImplementedError(f"Unsupported text FourCC: {fourcc}")
+        # XMLSubtitleSampleEntry: namespace + schema_location + aux mime types.
+        entry_body += TTML_NAMESPACE + b"\0" + b"\0"
+        codec_fourcc = b"stpp"
+
+    if encrypted and protected_type is not None:
+        entry_body += build_sinf(codec_fourcc, kid, iv_size, constant_iv)
+        sample_entry_box = box(protected_type, entry_body)
+    else:
+        sample_entry_box = box(codec_fourcc, entry_body)
+
+    stsd = full_box(b"stsd", 0, 0, u32.pack(1) + sample_entry_box)
+
+    # --- empty sample tables (fragmented: real samples live in moof/traf) ---
+    empty_tables = (
+        full_box(b"stts", 0, 0, zero32),
+        full_box(b"stsc", 0, 0, zero32),
+        full_box(b"stsz", 0, 0, zero32 + zero32),
+        full_box(b"stco", 0, 0, zero32),
+    )
+    stbl = box(b"stbl", stsd + b"".join(empty_tables))
+
+    minf = box(b"minf", media_header + dinf + stbl)
+    mdia = box(b"mdia", mdhd + hdlr + minf)
+    trak = box(b"trak", tkhd + mdia)
+
+    # --- mvex (mehd + trex) signals a fragmented file ---
+    mehd = full_box(b"mehd", 1, 0, u64.pack(duration))
+    # trex: track id, default sample description index 1, then zero defaults.
+    trex = full_box(b"trex", 0, 0, u32.pack(track_id) + u32.pack(1) + zero32 * 3)
+    mvex = box(b"mvex", mehd + trex)
+
+    return ftyp + box(b"moov", mvhd + trak + mvex)