diff --git a/tests/core/test_ism_init.py b/tests/core/test_ism_init.py new file mode 100644 index 0000000..c2eb7d1 --- /dev/null +++ b/tests/core/test_ism_init.py @@ -0,0 +1,410 @@ +"""Regression tests for ISM init-segment synthesis (ftyp + moov). + +Smooth Streaming fragments carry no moov; the init box must be rebuilt from the +manifest CodecPrivateData before shaka/mp4decrypt can parse the stream. These +guard the byte-level box structure so a future downloader refactor cannot +silently drop it again (the c323db9 regression). +""" + +from __future__ import annotations + +import struct + +import pytest + +from unshackle.core.manifests.ism_init import (NAL_START_CODE, PIFF_SENC_UUID, box, build_avcc, build_dec3, + build_hvcc, build_init_segment, full_box, parse_hevc_sps_format, + read_per_sample_iv_size, read_track_id, remove_emulation_prevention, + split_nal_units, synthesize_aac_codec_private_data) + +# Real CodecPrivateData taken from a Smooth Streaming manifest. +VIDEO_HEVC_CPD = ( + "0000000140010C01FFFF01600000030090000003000003009695980900000001420101016000000300900000" + "030000030096A001E020064165959A4930BC05A80808082000007D20000BB801000000014401C172B66240" +) +# H.264 SPS+PPS (start-code delimited) for the AVC path. +VIDEO_AVC_CPD = "00000001674d401e9a6602800b76020000003e90000bb800f18311200000000168ebccb22c" +# 10-bit (Main 10) HEVC VPS+SPS+PPS minted with x265; ffprobe reads the +# synthesized init as "Main 10 / yuv420p10le". +VIDEO_HEVC10_CPD = ( + "0000000140010c01ffff02200000030090000003000003003c959809000000000142010102200000030090" + "000003000003003ca00a080b9f6d96566924caf0168080000003008000000c8400000000014401c172b4624000" +) +AAC_LC_CPD = "1190" +# Real Smooth EC-3 CodecPrivateData: WAVEFORMATEXTENSIBLE extension (samples +# per block + channel mask + DD+ GUID) followed by the 5-byte dec3 payload. 
+EC3_CPD = "00063F000000AF87FBA7022DFB42A4D405CD93843BDD0600200F00" +KID = bytes.fromhex("09fd2bd778bb544785ed2322dc6a7d87") + + +def top_level_boxes(data: bytes) -> list[tuple[str, int]]: + boxes, offset = [], 0 + while offset + 8 <= len(data): + size = struct.unpack(">I", data[offset : offset + 4])[0] + box_type = data[offset + 4 : offset + 8].decode("latin1") + if size == 1: + size = struct.unpack(">Q", data[offset + 8 : offset + 16])[0] + if size == 0: + size = len(data) - offset + boxes.append((box_type, size)) + offset += size + return boxes + + +def test_split_nal_units_drops_start_codes(): + nals = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD)) + # VPS (32), SPS (33), PPS (34) by HEVC NAL type = (first_byte >> 1) & 0x3F. + assert [(n[0] >> 1) & 0x3F for n in nals] == [32, 33, 34] + + +def test_hevc_init_structure(): + init = build_init_segment( + stream_type="video", + fourcc="HVC1", + codec_private_data=VIDEO_HEVC_CPD, + timescale=10000000, + width=3840, + height=1600, + ) + boxes = top_level_boxes(init) + assert [b[0] for b in boxes] == ["ftyp", "moov"] + assert boxes[0][1] + boxes[1][1] == len(init) + assert b"hvcC" in init + assert b"hvc1" in init + # Unencrypted: no protection scheme boxes. 
+ assert b"encv" not in init and b"sinf" not in init + + +def test_avc_init_structure(): + init = build_init_segment( + stream_type="video", + fourcc="H264", + codec_private_data=VIDEO_AVC_CPD, + timescale=10000000, + width=1280, + height=720, + ) + assert init[4:8] == b"ftyp" + assert b"avcC" in init and b"avc1" in init + + +def test_aac_audio_init_structure(): + init = build_init_segment( + stream_type="audio", + fourcc="AACL", + codec_private_data=AAC_LC_CPD, + timescale=10000000, + channels=2, + sampling_rate=48000, + ) + assert b"mp4a" in init and b"esds" in init + assert b"smhd" in init # sound media header, not video + + +def test_encrypted_init_has_cenc_boxes(): + init = build_init_segment( + stream_type="video", + fourcc="HVC1", + codec_private_data=VIDEO_HEVC_CPD, + timescale=10000000, + width=3840, + height=1600, + kid=KID, + ) + # Encrypted sample entry is wrapped: encv -> sinf(frma+schm+schi(tenc)). + assert b"encv" in init + assert b"sinf" in init and b"frma" in init and b"tenc" in init + assert b"cenc" in init + # The 16-byte default_KID must be embedded verbatim for shaka to map the key. + assert KID in init + # Original codec preserved inside frma for the muxer. + assert b"hvc1" in init + + +def test_unsupported_codec_raises(): + # Unknown FourCC (e.g. VC-1); caller soft-fails to raw concat. + with pytest.raises(NotImplementedError): + build_init_segment( + stream_type="video", + fourcc="WVC1", + codec_private_data="00063F00", + timescale=10000000, + ) + + +def test_ec3_init_embeds_dec3_from_codec_private_data(): + init = build_init_segment( + stream_type="audio", + fourcc="EC-3", + codec_private_data=EC3_CPD, + timescale=10000000, + channels=6, + sampling_rate=48000, + ) + assert b"ec-3" in init + # dec3 payload = CodecPrivateData past the 22-byte WAVEFORMATEXTENSIBLE header. 
+ assert box(b"dec3", bytes.fromhex(EC3_CPD)[22:]) in init + assert b"esds" not in init # no MPEG-4 descriptor inside an ec-3 entry + + +def test_ec3_encrypted_wraps_enca_with_frma(): + init = build_init_segment( + stream_type="audio", + fourcc="EC-3", + codec_private_data=EC3_CPD, + timescale=10000000, + channels=6, + kid=KID, + ) + assert b"enca" in init and b"sinf" in init and b"tenc" in init + assert box(b"frma", b"ec-3") in init + assert KID in init + + +def test_ec3_dec3_found_in_full_waveformatextensible(): + # Some services ship the full WAVEFORMATEX header (18 bytes) before the + # extension; the dec3 payload still follows the DD+ GUID. + full = b"\xfe\xff" + b"\x00" * 16 + bytes.fromhex(EC3_CPD) + payload = bytes.fromhex(EC3_CPD)[22:] + assert build_dec3(full) == box(b"dec3", payload) + + +def test_ec3_without_dolby_guid_builds_bare_entry(): + assert build_dec3(b"\x00\x06\x3f\x00") is None + init = build_init_segment( + stream_type="audio", + fourcc="EC-3", + codec_private_data="", + timescale=10000000, + channels=6, + ) + assert b"ec-3" in init and b"dec3" not in init + + +def test_aac_codec_private_data_synthesis_matches_real_manifest(): + # 48 kHz stereo AAC-LC must produce 0x1190 — the exact ASC real manifests carry. + assert synthesize_aac_codec_private_data("AACL", 48000, 2).hex() == "1190" + + +def test_aach_synthesis_signals_sbr(): + asc = synthesize_aac_codec_private_data("AACH", 24000, 2) + assert len(asc) == 4 + assert asc[0] >> 3 == 0x05 # AOT 5 = SBR (HE-AAC) + # Extension sampling frequency = core * 2 = 48 kHz (index 3). 
+ assert ((asc[1] & 0x01) << 1) | (asc[2] >> 7) == 0x03 + + +def test_aac_init_without_codec_private_data_synthesizes_asc(): + init = build_init_segment( + stream_type="audio", + fourcc="AACL", + codec_private_data="", + timescale=10000000, + channels=2, + sampling_rate=48000, + ) + assert b"mp4a" in init and b"esds" in init + assert bytes.fromhex(AAC_LC_CPD) in init + + +def test_dolby_vision_uses_dvh1_sample_entry(): + init = build_init_segment( + stream_type="video", + fourcc="DVH1", + codec_private_data=VIDEO_HEVC_CPD, + timescale=10000000, + width=3840, + height=1600, + ) + assert b"dvh1" in init and b"hvcC" in init + assert b"hvc1" not in init + + +def test_davc_maps_to_avc1(): + init = build_init_segment( + stream_type="video", + fourcc="DAVC", + codec_private_data=VIDEO_AVC_CPD, + timescale=10000000, + ) + assert b"avc1" in init and b"avcC" in init + + +def test_lowercase_fourcc_normalized(): + # Real manifests ship FourCC="hvc1" in lowercase. + init = build_init_segment( + stream_type="video", + fourcc="hvc1", + codec_private_data=VIDEO_HEVC_CPD, + timescale=10000000, + ) + assert b"hvcC" in init + + +def test_avcc_selects_sps_pps_by_nal_type_not_position(): + nals = split_nal_units(bytes.fromhex(VIDEO_AVC_CPD)) + swapped = NAL_START_CODE + nals[1] + NAL_START_CODE + nals[0] # PPS first + avcc = build_avcc(swapped) + # Profile/compat/level must still come from the SPS body. + assert avcc[9:12] == nals[0][1:4] + + +def test_nal_length_field_respected(): + avcc = build_avcc(bytes.fromhex(VIDEO_AVC_CPD), nal_length_size=2) + # avcC payload byte 4 low 2 bits = lengthSizeMinusOne. 
+ assert avcc[12] & 0x03 == 1 + + +def test_parse_hevc_sps_format_8bit(): + sps = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD))[1] + assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 0, 0) # 4:2:0, 8-bit + + +def test_hvcc_signals_10bit_from_sps(): + sps = next(n for n in split_nal_units(bytes.fromhex(VIDEO_HEVC10_CPD)) if (n[0] >> 1) & 0x3F == 33) + assert parse_hevc_sps_format(remove_emulation_prevention(sps)) == (1, 2, 2) # 4:2:0, 10-bit + payload = build_hvcc(bytes.fromhex(VIDEO_HEVC10_CPD))[8:] # strip box header + assert payload[16] == 0xFC | 0x01 # chromaFormat 4:2:0 + assert payload[17] == 0xF8 | 0x02 # bitDepthLumaMinus8 = 2 + assert payload[18] == 0xF8 | 0x02 # bitDepthChromaMinus8 = 2 + + +def test_ttml_init_structure(): + init = build_init_segment( + stream_type="text", + fourcc="TTML", + codec_private_data="", + timescale=10000000, + language="eng", + ) + assert b"stpp" in init + assert b"sthd" in init # subtitle media header + assert b"subt" in init and b"SubtitleHandler\0" in init + assert b"http://www.w3.org/ns/ttml\0" in init + + +def test_constant_iv_tenc_form(): + constant_iv = bytes(range(16)) + init = build_init_segment( + stream_type="video", + fourcc="HVC1", + codec_private_data=VIDEO_HEVC_CPD, + timescale=10000000, + kid=KID, + constant_iv=constant_iv, + ) + # Constant-IV form: default_Per_Sample_IV_Size = 0, then size + IV after the KID. + assert KID + bytes([len(constant_iv)]) + constant_iv in init + tenc_at = init.index(b"tenc") + assert init[tenc_at + 4 + 4 + 3] == 0 # default_Per_Sample_IV_Size + + +def make_fragment(senc: bytes = b"", saiz: bytes = b"") -> bytes: + tfhd = full_box(b"tfhd", 0, 0, struct.pack(">I", 1) + b"\x00" * 4) + traf = box(b"traf", tfhd + senc + saiz) + return box(b"moof", traf) + box(b"mdat", b"\x00" * 4) + + +def test_iv_size_from_piff_senc_override_flag(): + # PIFF senc uuid with flags&1: AlgorithmID(3) + IV_size(1) + KID(16) override. 
+ payload = b"\x00\x00\x00\x01" + b"\x00\x00\x01" + bytes([16]) + KID + struct.pack(">I", 0) + senc = box(b"uuid", PIFF_SENC_UUID + payload) + assert read_per_sample_iv_size(make_fragment(senc=senc)) == 16 + + +def test_iv_size_from_senc_payload_length(): + # Standard senc, no subsamples: 3 samples x 8-byte IVs. + senc = full_box(b"senc", 0, 0, struct.pack(">I", 3) + b"\x11" * 24) + assert read_per_sample_iv_size(make_fragment(senc=senc)) == 8 + + +def test_iv_size_from_senc_with_subsamples(): + # senc flags&2: per sample IV(8) + entry_count(2) + 6 bytes per entry. + sample = b"\x22" * 8 + struct.pack(">H", 1) + b"\x00" * 6 + senc = full_box(b"senc", 0, 2, struct.pack(">I", 2) + sample * 2) + assert read_per_sample_iv_size(make_fragment(senc=senc)) == 8 + + +def test_iv_size_from_saiz_fallback(): + saiz = full_box(b"saiz", 0, 0, bytes([16]) + struct.pack(">I", 5)) + assert read_per_sample_iv_size(make_fragment(saiz=saiz)) == 16 + + +def test_iv_size_undetermined_returns_none(): + assert read_per_sample_iv_size(make_fragment()) is None + + +def test_hvcc_embeds_vps_sps_pps(): + hvcc = build_hvcc(bytes.fromhex(VIDEO_HEVC_CPD)) + nals = split_nal_units(bytes.fromhex(VIDEO_HEVC_CPD)) + # Each original NAL unit (VPS/SPS/PPS) is embedded verbatim in the arrays. + for nal in nals: + assert nal in hvcc + + +def test_avcc_requires_sps_and_pps(): + with pytest.raises(ValueError): + build_avcc(b"\x00\x00\x00\x01\x67only_sps") + + +def test_read_track_id_from_fragment(): + # Minimal moof/traf/tfhd carrying track_ID = 7. + tfhd = full_box("tfhd".encode(), 0, 0, struct.pack(">I", 7) + b"\x00" * 4) + traf = box(b"traf", tfhd) + moof = box(b"moof", traf) + mdat = box(b"mdat", b"\x00\x00") + assert read_track_id(moof + mdat) == 7 + + +def test_read_track_id_missing_returns_none(): + assert read_track_id(box(b"mdat", b"\x00\x00")) is None + + +def test_remove_emulation_prevention(): + # 00 00 03 XX -> the 0x03 emulation byte is dropped. 
+ assert remove_emulation_prevention(b"\x00\x00\x03\x01") == b"\x00\x00\x01" + assert remove_emulation_prevention(b"\x00\x00\x03\x00\x00\x03\x96") == b"\x00\x00\x00\x00\x96" + # The byte after a consumed escape is data, even another 0x03. + assert remove_emulation_prevention(b"\x00\x00\x03\x03") == b"\x00\x00\x03" + assert remove_emulation_prevention(b"\x00\x00\x03\x03\x00\x00\x03\x01") == b"\x00\x00\x03\x00\x00\x01" + + +def test_two_letter_or_uppercase_language_falls_back_to_und(): + # mdhd packs three a-z letters; "en"/"ENG" must not crash struct.pack. + for lang in ("en", "ENG", "", "e1x"): + init = build_init_segment( + stream_type="audio", + fourcc="AACL", + codec_private_data=AAC_LC_CPD, + timescale=10000000, + language=lang, + ) + assert init[4:8] == b"ftyp" + + +def test_high_sampling_rate_does_not_overflow(): + # 96 kHz exceeds the 16.16 integer field; written as 0 like ffmpeg does. + init = build_init_segment( + stream_type="audio", + fourcc="AACL", + codec_private_data="", + timescale=10000000, + sampling_rate=96000, + ) + assert b"mp4a" in init + + +def test_read_track_id_truncated_tfhd_returns_none(): + tfhd = full_box(b"tfhd", 0, 0, b"\x00\x00") # too short for a track_ID + fragment = box(b"moof", box(b"traf", tfhd)) + assert read_track_id(fragment) is None + + +def test_hvcc_profile_tier_level_is_nonzero(): + # De-emulated PTL must yield real profile/level, not the off-by-one garbage. 
@staticmethod
def _init_segment(
    track: AnyTrack, session_drm: Optional[DRM_T], first_segment: Optional[bytes] = None
) -> Optional[bytes]:
    """Synthesize the ftyp+moov init segment for an ISM track.

    Smooth fragments are moof+mdat only; the init box is rebuilt from the
    manifest CodecPrivateData (and KID, when encrypted) so the merged file is
    a valid MP4 that shaka/mp4decrypt can parse.

    Args:
        track: Track whose ``data["ism"]`` dict carries ``stream_index``,
            ``quality_level`` and ``manifest``.
        session_drm: DRM object for the session; the first entry of its
            ``kids`` attribute (if any) becomes the tenc default_KID.
        first_segment: Bytes of the first downloaded fragment, probed for the
            track_ID and the per-sample IV size.

    Returns:
        The init segment bytes, or None when the track has no ISM metadata,
        is a plain-text subtitle, or the init segment cannot be synthesized
        (the caller then falls back to raw concatenation).
    """
    ism = track.data.get("ism") if isinstance(getattr(track, "data", None), dict) else None
    if not ism:
        return None
    stream_index = ism.get("stream_index")
    quality_level = ism.get("quality_level")
    manifest = ism.get("manifest")
    if stream_index is None or quality_level is None:
        return None
    if isinstance(track, Subtitle) and track.codec != Subtitle.Codec.fTTML:
        return None  # plain-text subtitle formats concatenate fine

    # CodecPrivateData may legitimately be empty (AAC config is synthesized,
    # EC-3 decoders sync from the frames); the builder handles each case.
    cpd = quality_level.get("CodecPrivateData") or ""
    fourcc = quality_level.get("FourCC") or ""

    # mdhd needs a 3-letter ISO-639-2 code; manifests often carry 2-letter tags.
    lang_attr = (stream_index.get("Language") or "").strip()
    language = "und"
    if lang_attr and tag_is_valid(lang_attr):
        try:
            language = Language.get(lang_attr).to_alpha3()
        except LookupError:
            language = "und"

    kid: Optional[bytes] = None
    if session_drm is not None:
        kid_uuid = next(iter(getattr(session_drm, "kids", None) or []), None)
        if kid_uuid is not None:
            kid = bytes.fromhex(kid_uuid.hex)

    try:
        # Everything that parses manifest attributes or fragment bytes lives
        # inside the try: a malformed value must degrade to raw concatenation,
        # not abort the download.
        root_timescale = manifest.get("TimeScale") if manifest is not None else None
        timescale = int(stream_index.get("TimeScale") or root_timescale or 10000000)
        duration = int((manifest.get("Duration") if manifest is not None else 0) or 0)

        # Match the moov track_ID to the fragment's tfhd, else the muxer drops samples.
        track_id = (read_track_id(first_segment) if first_segment else None) or 1
        # NALUnitLengthField: bytes per NAL length prefix, default 4.
        nal_length_size = int(
            quality_level.get("NALUnitLengthField") or stream_index.get("NALUnitLengthField") or 4
        )
        # Per-sample IV size derived from the fragment senc/saiz (PIFF default 8).
        iv_size = (read_per_sample_iv_size(first_segment) if first_segment and kid else None) or 8

        if isinstance(track, Subtitle):
            return build_init_segment(
                stream_type="text",
                fourcc="TTML",
                codec_private_data="",
                timescale=timescale,
                duration=duration,
                language=language,
                track_id=track_id,
            )
        if isinstance(track, Video):
            return build_init_segment(
                stream_type="video",
                fourcc=fourcc,
                codec_private_data=cpd,
                timescale=timescale,
                duration=duration,
                language=language,
                width=int(quality_level.get("MaxWidth") or stream_index.get("MaxWidth") or 0),
                height=int(quality_level.get("MaxHeight") or stream_index.get("MaxHeight") or 0),
                track_id=track_id,
                nal_length_size=nal_length_size,
                kid=kid,
                iv_size=iv_size,
            )
        return build_init_segment(
            stream_type="audio",
            fourcc=fourcc,
            codec_private_data=cpd,
            timescale=timescale,
            duration=duration,
            language=language,
            channels=int(quality_level.get("Channels") or 2),
            bits_per_sample=int(quality_level.get("BitsPerSample") or 16),
            sampling_rate=int(quality_level.get("SamplingRate") or 48000),
            track_id=track_id,
            kid=kid,
            iv_size=iv_size,
        )
    except (NotImplementedError, ValueError, struct.error, IndexError) as e:
        # Unsupported codec, malformed CodecPrivateData/fragment or out-of-range
        # field — fall back to raw concatenation rather than aborting the download.
        log_event(
            "manifest_ism_init_unsupported",
            level="WARNING",
            message=f"Could not synthesize ISM init segment ({fourcc}): {e}",
            context={"track_id": getattr(track, "id", None), "fourcc": fourcc},
        )
        return None
u8 = struct.Struct(">B")
u16 = struct.Struct(">H")
u32 = struct.Struct(">I")
u64 = struct.Struct(">Q")
s16 = struct.Struct(">h")
# The fixed-point packers take the integer part only; the "x" pad bytes emit
# a zero fractional part.
s88 = struct.Struct(">bx")  # 8.8 fixed-point
s1616 = struct.Struct(">hxx")  # 16.16 fixed-point
u1616 = struct.Struct(">Hxx")
s32 = struct.Struct(">i")

# 3x3 transformation matrix (identity), as stored in tkhd/mvhd: exactly nine
# 32-bit values in row order (36 bytes). Emitting more than nine values would
# shift every field that follows the matrix in those boxes.
UNITY_MATRIX = struct.pack(
    ">9i",
    0x10000, 0, 0,
    0, 0x10000, 0,
    0, 0, 0x40000000,
)

TRACK_ENABLED = 0x1
TRACK_IN_MOVIE = 0x2
TRACK_IN_PREVIEW = 0x4
SELF_CONTAINED = 0x1

# Fixed creation/modification time — deterministic output (no wall clock).
EPOCH = 0

NAL_START_CODE = b"\x00\x00\x00\x01"

# WAVEFORMATEXTENSIBLE SubFormat GUID for Dolby Digital Plus, as serialized
# (little-endian) inside Smooth EC-3 CodecPrivateData.
DOLBY_DIGITAL_PLUS_GUID = bytes.fromhex("AF87FBA7022DFB42A4D405CD93843BDD")

# PIFF SampleEncryptionBox usertype (the pre-CENC 'senc' carried as a uuid box).
PIFF_SENC_UUID = bytes.fromhex("A2394F525A9B4F14A2446C427C648DF4")

TTML_NAMESPACE = b"http://www.w3.org/ns/ttml\0"

# ISO/IEC 14496-3 samplingFrequencyIndex table for AudioSpecificConfig.
AAC_SAMPLING_FREQUENCY_INDEX = {
    96000: 0x0,
    88200: 0x1,
    64000: 0x2,
    48000: 0x3,
    44100: 0x4,
    32000: 0x5,
    24000: 0x6,
    22050: 0x7,
    16000: 0x8,
    12000: 0x9,
    11025: 0xA,
    8000: 0xB,
    7350: 0xC,
}


def box(box_type: bytes, payload: bytes) -> bytes:
    """Wrap payload in a basic ISO-BMFF box (32-bit size + fourcc + payload)."""
    return u32.pack(8 + len(payload)) + box_type + payload


def full_box(box_type: bytes, version: int, flags: int, payload: bytes) -> bytes:
    """Wrap payload in a FullBox (adds 1-byte version + 3-byte flags)."""
    # u32.pack(flags)[1:] keeps the low 24 bits of flags, big-endian.
    return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload)


def split_nal_units(codec_private_data: bytes) -> list[bytes]:
    """Split start-code delimited CodecPrivateData into its NAL units.

    Both the 4-byte (00 00 00 01) and 3-byte (00 00 01) start codes are
    recognised, and any zero padding before a start code or at the end of the
    data is dropped with it, so padding never ends up glued to a NAL unit.
    Emulation prevention guarantees 00 00 01 cannot occur inside a NAL unit,
    which makes splitting on the short form safe.

    Returns the NAL units without start codes; empty units are discarded.
    """
    import re  # local import: this is the only user of re in the block

    pieces = re.split(rb"\x00{2,}\x01", codec_private_data)
    # A NAL unit never ends in 0x00, so trailing zeros are byte-stream padding.
    units = (piece.rstrip(b"\x00") for piece in pieces)
    return [unit for unit in units if unit]


def remove_emulation_prevention(data: bytes) -> bytes:
    """Strip H.26x emulation-prevention bytes (the 0x03 in any 00 00 03 run).

    The byte after a consumed escape is data — even another 0x03 — so the scan
    must skip past it rather than re-examine (a naive trailing-window check
    over-strips consecutive escapes and shifts every later bit position).
    """
    out = bytearray()
    i = 0
    size = len(data)
    while i < size:
        if i + 2 < size and data[i] == 0 and data[i + 1] == 0 and data[i + 2] == 3:
            out += b"\x00\x00"
            i += 3  # skip the two zeros and the escape byte itself
        else:
            out.append(data[i])
            i += 1
    return bytes(out)


class BitReader:
    """MSB-first bit reader with the exp-Golomb decode H.26x headers need."""

    def __init__(self, data: bytes) -> None:
        self.data = data
        self.pos = 0  # absolute bit position from the start of data

    def read_bits(self, count: int) -> int:
        """Read ``count`` bits as an unsigned integer.

        Raises IndexError when the read runs past the end of the data.
        """
        value = 0
        for _ in range(count):
            byte = self.data[self.pos >> 3]
            value = (value << 1) | ((byte >> (7 - (self.pos & 7))) & 1)
            self.pos += 1
        return value

    def read_ue(self) -> int:
        """Read one unsigned exp-Golomb code.

        Raises ValueError when the zero prefix exceeds 32 bits.
        """
        zeros = 0
        while self.read_bits(1) == 0:
            zeros += 1
            if zeros > 32:
                raise ValueError("Invalid exp-Golomb code")
        return (1 << zeros) - 1 + (self.read_bits(zeros) if zeros else 0)
def parse_hevc_sps_format(sps_rbsp: bytes) -> tuple[int, int, int]:
    """
    Parse (chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8)
    from a de-emulated HEVC SPS RBSP (including its 2-byte NAL header).

    Raises IndexError or ValueError (from the bit reader) on truncated or
    malformed data.
    """
    r = BitReader(sps_rbsp)
    r.read_bits(16)  # NAL unit header
    r.read_bits(4)  # sps_video_parameter_set_id
    max_sub_layers_minus1 = r.read_bits(3)
    r.read_bits(1)  # sps_temporal_id_nesting_flag
    r.read_bits(96)  # general profile_tier_level (12 bytes)
    profile_present = []
    level_present = []
    for _ in range(max_sub_layers_minus1):
        profile_present.append(r.read_bits(1))
        level_present.append(r.read_bits(1))
    if max_sub_layers_minus1 > 0:
        r.read_bits((8 - max_sub_layers_minus1) * 2)  # reserved_zero_2bits
    for i in range(max_sub_layers_minus1):
        if profile_present[i]:
            r.read_bits(88)  # sub_layer profile_tier
        if level_present[i]:
            r.read_bits(8)  # sub_layer_level_idc
    r.read_ue()  # sps_seq_parameter_set_id
    chroma_format_idc = r.read_ue()
    if chroma_format_idc == 3:
        r.read_bits(1)  # separate_colour_plane_flag
    r.read_ue()  # pic_width_in_luma_samples
    r.read_ue()  # pic_height_in_luma_samples
    if r.read_bits(1):  # conformance_window_flag
        for _ in range(4):
            r.read_ue()
    bit_depth_luma_minus8 = r.read_ue()
    bit_depth_chroma_minus8 = r.read_ue()
    return chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8


def iter_boxes(data: bytes, start: int, end: int) -> Iterator[tuple[bytes, Optional[bytes], int, int]]:
    """Yield (type, uuid_usertype, payload_start, box_end) for each child box.

    Iteration stops quietly at the first box that is corrupt or truncated: a
    header that does not fit, a size smaller than its own header, or a size
    that runs past ``end``. Every yielded ``box_end`` is therefore within both
    ``end`` and ``len(data)``, so callers can index inside it safely.
    """
    end = min(end, len(data))
    offset = start
    while offset + 8 <= end:
        size, box_type = struct.unpack_from(">I4s", data, offset)
        header = 8
        if size == 1:  # 64-bit largesize follows the type
            if offset + 16 > end:
                return
            size = struct.unpack_from(">Q", data, offset + 8)[0]
            header = 16
        if size == 0:  # box extends to the end of the enclosing range
            size = end - offset
        box_end = offset + size
        if size < header or box_end > end:  # corrupt or truncated; never loop forever
            return
        usertype = None
        if box_type == b"uuid" and offset + header + 16 <= box_end:
            usertype = data[offset + header : offset + header + 16]
            header += 16
        yield box_type, usertype, offset + header, box_end
        offset = box_end


def find_box(data: bytes, start: int, end: int, target: bytes) -> Optional[tuple[int, int]]:
    """Find the first child box of the given type; return (payload_start, end)."""
    for box_type, _, body, box_end in iter_boxes(data, start, end):
        if box_type == target:
            return body, box_end
    return None


def read_track_id(fragment: bytes) -> Optional[int]:
    """Read the track_ID from a fragment's moof/traf/tfhd box, if present.

    Smooth fragments declare their own track_ID; the synthesized moov must use
    the same value or the muxer cannot associate samples with the track. The
    track_ID sits before any tfhd optional fields, so the flags don't matter.
    Returns None when any box on the path is missing or the tfhd is too short.
    """
    moof = find_box(fragment, 0, len(fragment), b"moof")
    if not moof:
        return None
    traf = find_box(fragment, *moof, b"traf")
    if not traf:
        return None
    tfhd = find_box(fragment, *traf, b"tfhd")
    if not tfhd:
        return None
    body, box_end = tfhd
    # tfhd payload: version(1) + flags(3) + track_ID(4); all must lie inside the box.
    if body + 8 > box_end:
        return None
    return struct.unpack_from(">I", fragment, body + 4)[0]


def read_per_sample_iv_size(fragment: bytes) -> Optional[int]:
    """
    Derive the per-sample IV size (8 or 16) from a fragment's sample-encryption
    metadata, for the synthesized tenc default_Per_Sample_IV_Size.

    Checks, in order: the PIFF 'senc' uuid override flag (explicit IV size),
    the senc payload length (sample_count vs IV/subsample entries), and the
    saiz default_sample_info_size (only unambiguous without subsamples).
    Returns None when none of them yields 8 or 16; truncated boxes are treated
    as "cannot tell" rather than raising.
    """
    moof = find_box(fragment, 0, len(fragment), b"moof")
    if not moof:
        return None
    traf = find_box(fragment, *moof, b"traf")
    if not traf:
        return None

    senc: Optional[tuple[int, int]] = None
    saiz_default: Optional[int] = None
    for box_type, usertype, body, box_end in iter_boxes(fragment, *traf):
        if box_type == b"senc" or (box_type == b"uuid" and usertype == PIFF_SENC_UUID):
            senc = (body, box_end)
        elif box_type == b"saiz" and body + 4 <= box_end:
            flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
            pos = body + 4 + (8 if flags & 0x1 else 0)  # skip aux_info_type fields
            if pos < box_end:
                saiz_default = fragment[pos]

    senc_has_subsamples = False
    if senc and senc[0] + 4 <= senc[1]:
        body, box_end = senc
        flags = int.from_bytes(fragment[body + 1 : body + 4], "big")
        senc_has_subsamples = bool(flags & 0x2)
        pos = body + 4
        if flags & 0x1:  # PIFF override: AlgorithmID(3) + IV_size(1) + KID(16)
            if pos + 20 > box_end:
                pos = box_end  # truncated override; nothing left to inspect
            else:
                declared = fragment[pos + 3]
                if declared in (8, 16):
                    return declared
                pos += 20  # implausible declared size; infer from the entries
        if pos + 4 <= box_end:
            sample_count = struct.unpack_from(">I", fragment, pos)[0]
            pos += 4
            if sample_count:
                if not senc_has_subsamples:
                    size, rem = divmod(box_end - pos, sample_count)
                    if rem == 0 and size in (8, 16):
                        return size
                else:
                    # Walk the entries with each candidate IV size; the one that
                    # lands exactly on the box end is correct.
                    for iv_size in (8, 16):
                        cursor = pos
                        for _ in range(sample_count):
                            cursor += iv_size
                            if cursor + 2 > box_end:
                                break
                            entries = struct.unpack_from(">H", fragment, cursor)[0]
                            cursor += 2 + 6 * entries
                            if cursor > box_end:
                                break
                        else:
                            if cursor == box_end:
                                return iv_size

    if not senc_has_subsamples and saiz_default in (8, 16):
        return saiz_default
    return None
def build_avcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """Build an avcC (AVC decoder config) box from SPS+PPS CodecPrivateData.

    Args:
        codec_private_data: Start-code delimited SPS and PPS NAL units.
        nal_length_size: Bytes per NAL length prefix in the samples (1, 2 or 4).

    Raises:
        ValueError: If nal_length_size is not 1, 2 or 4, if the SPS or PPS is
            missing, or if the SPS is too short to carry profile/level bytes.
    """
    if nal_length_size not in (1, 2, 4):
        raise ValueError(f"Unsupported NAL length size: {nal_length_size}")
    nals = split_nal_units(codec_private_data)
    # Pick parameter sets by H.264 NAL type (low 5 bits): 7 = SPS, 8 = PPS.
    # Manifests do not guarantee SPS-first ordering.
    sps = next((n for n in nals if n[0] & 0x1F == 7), None)
    pps = next((n for n in nals if n[0] & 0x1F == 8), None)
    if not sps or not pps:
        raise ValueError("AVC CodecPrivateData must contain SPS and PPS NAL units")
    if len(sps) < 4:
        raise ValueError("AVC SPS is too short to carry profile/level bytes")
    payload = b"".join(
        (
            u8.pack(1),  # configuration version
            sps[1:4],  # profile / compat / level (from SPS NAL body)
            u8.pack(0xFC | (nal_length_size - 1)),  # reserved + length size minus one
            u8.pack(0xE0 | 1),  # reserved + number of SPS (1)
            u16.pack(len(sps)),
            sps,
            u8.pack(1),  # number of PPS
            u16.pack(len(pps)),
            pps,
        )
    )
    return box(b"avcC", payload)


def build_hvcc(codec_private_data: bytes, nal_length_size: int = 4) -> bytes:
    """
    Build an hvcC (HEVC decoder config) box from VPS+SPS+PPS CodecPrivateData.

    Profile/tier/level bytes are lifted from the SPS profile_tier_level; chroma
    format and bit depths are parsed from the SPS so 10-bit/HDR streams signal
    correctly (falls back to 8-bit 4:2:0 on malformed SPS data).

    Raises:
        ValueError: If nal_length_size is not 1, 2 or 4, or if any of the VPS,
            SPS or PPS NAL units is missing.
    """
    if nal_length_size not in (1, 2, 4):
        raise ValueError(f"Unsupported NAL length size: {nal_length_size}")

    # Group NAL units by type (HEVC NAL type = (first byte >> 1) & 0x3F).
    by_type: dict[int, list[bytes]] = {}
    for nal in split_nal_units(codec_private_data):
        by_type.setdefault((nal[0] >> 1) & 0x3F, []).append(nal)
    # Check by type, not by count: three units of the wrong kinds are not enough.
    if any(nal_type not in by_type for nal_type in (32, 33, 34)):
        raise ValueError("HEVC CodecPrivateData must contain VPS, SPS and PPS NAL units")

    sps = by_type[33][0]
    # profile_tier_level must be read from the de-emulated SPS RBSP, after the
    # 2-byte NAL header + 1 byte (sps_video_parameter_set_id(4) +
    # sps_max_sub_layers_minus1(3) + sps_temporal_id_nesting_flag(1)). PTL is 12
    # bytes: profile byte(1) + compat flags(4) + constraint flags(6) + level(1).
    sps_rbsp = remove_emulation_prevention(sps)
    ptl = sps_rbsp[3:15] if len(sps_rbsp) >= 15 else b"\x00" * 12

    try:
        chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8 = parse_hevc_sps_format(sps_rbsp)
    except (IndexError, ValueError):
        chroma_format_idc, bit_depth_luma_minus8, bit_depth_chroma_minus8 = 1, 0, 0

    payload = bytearray()
    payload += u8.pack(1)  # configurationVersion
    payload += ptl  # profile space/tier/idc(1) + compat(4) + constraint(6) + level(1)
    payload += u16.pack(0xF000)  # reserved(4) + min_spatial_segmentation_idc(12)
    payload += u8.pack(0xFC)  # reserved(6) + parallelismType(2)
    payload += u8.pack(0xFC | (chroma_format_idc & 0x03))  # reserved(6) + chromaFormat(2)
    payload += u8.pack(0xF8 | (bit_depth_luma_minus8 & 0x07))  # reserved(5) + bitDepthLumaMinus8(3)
    payload += u8.pack(0xF8 | (bit_depth_chroma_minus8 & 0x07))  # reserved(5) + bitDepthChromaMinus8(3)
    payload += u16.pack(0)  # avgFrameRate
    # constantFrameRate(2)+numTemporalLayers(3)+temporalIdNested(1)+lengthSizeMinusOne(2)
    payload += u8.pack((nal_length_size - 1) & 0x03)

    payload += u8.pack(3)  # numOfArrays: VPS, SPS and PPS are all present
    for nal_type in (32, 33, 34):  # VPS, SPS, PPS
        units = by_type[nal_type]
        payload += u8.pack(0x80 | nal_type)  # array_completeness(1)+reserved(1)+NAL type(6)
        payload += u16.pack(len(units))
        for unit in units:
            payload += u16.pack(len(unit)) + unit
    return box(b"hvcC", bytes(payload))
asc + # DecoderConfigDescriptor (tag 0x04): objectType=0x40 (AAC), stream type audio + dcd = ( + u8.pack(0x40) # object type indication = MPEG-4 AAC + + u8.pack(0x15) # stream type (audio) << 2 | upstream | reserved + + b"\x00\x00\x00" # buffer size + + u32.pack(0) # max bitrate + + u32.pack(0) # avg bitrate + + dsi + ) + dcd_box = u8.pack(0x04) + u8.pack(len(dcd)) + dcd + # SLConfigDescriptor (tag 0x06) + sl = u8.pack(0x06) + u8.pack(1) + u8.pack(0x02) + # ES_Descriptor (tag 0x03) + es = u8.pack(0x03) + u8.pack(len(dcd_box) + len(sl) + 3) + u16.pack(0) + u8.pack(0) + dcd_box + sl + return full_box(b"esds", 0, 0, es) + + +def build_dec3(codec_private_data: bytes) -> Optional[bytes]: + """Build a dec3 (EC-3 specific) box from Smooth EC-3 CodecPrivateData. + + Smooth EC-3 CodecPrivateData ([MS-SSTR] AudioTag 65534) serializes a + WAVEFORMATEXTENSIBLE — sometimes the full structure, sometimes only its + extension (samples-per-block + channel mask + DD+ SubFormat GUID) — with + the raw dec3 payload (ETSI TS 102 366 F.6) after the GUID. Returns None + when the GUID is absent — decoders still sync from EC-3 frames in mdat. + """ + guid_at = codec_private_data.find(DOLBY_DIGITAL_PLUS_GUID) + if guid_at != -1 and len(codec_private_data) > guid_at + 16: + return box(b"dec3", codec_private_data[guid_at + 16 :]) + return None + + +def synthesize_aac_codec_private_data(fourcc: str, sampling_rate: int, channels: int) -> bytes: + """Generate the AAC AudioSpecificConfig when the manifest omits it. + + AACL -> 2-byte AAC-LC config; AACH -> 4-byte HE-AAC (SBR, AOT 5) config + with the extension sampling frequency at twice the core rate. 
+ """ + freq = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate, 0x0) + if fourcc == "AACH": + ext_freq = AAC_SAMPLING_FREQUENCY_INDEX.get(sampling_rate * 2, 0x0) + return bytes( + ( + (0x05 << 3) | (freq >> 1), + ((freq & 0x01) << 7) | (channels << 3) | (ext_freq >> 1), + ((ext_freq & 0x01) << 7) | (0x02 << 2), # core object type = AAC LC + 0x00, # alignment bits + ) + ) + return bytes(((0x02 << 3) | (freq >> 1), ((freq & 0x01) << 7) | (channels << 3))) + + +def build_sinf( + original_format: bytes, + kid: bytes, + iv_size: int = 8, + constant_iv: Optional[bytes] = None, +) -> bytes: + """Build a sinf protection box (frma + schm cenc + schi/tenc) for CENC. + + iv_size is the tenc default_Per_Sample_IV_Size (8 or 16). When constant_iv + is given, the per-sample IV size is 0 and the constant IV is appended per + ISO/IEC 23001-7 (cbcs-style constant-IV form). + """ + frma = box(b"frma", original_format) + schm = full_box(b"schm", 0, 0, b"cenc" + u32.pack(0x00010000)) + tenc_payload = ( + u8.pack(0) # reserved + + u8.pack(0) # default_crypt_byte_block / skip_byte_block (cenc) + + u8.pack(1) # default_isProtected + + u8.pack(0 if constant_iv else iv_size) # default_Per_Sample_IV_Size + + kid # default_KID (16 bytes) + ) + if constant_iv: + tenc_payload += u8.pack(len(constant_iv)) + constant_iv + schi = box(b"schi", full_box(b"tenc", 0, 0, tenc_payload)) + return box(b"sinf", frma + schm + schi) + + +def build_init_segment( + *, + stream_type: str, + fourcc: str, + codec_private_data: str, + timescale: int = 10000000, + duration: int = 0, + language: str = "und", + width: int = 0, + height: int = 0, + channels: int = 2, + bits_per_sample: int = 16, + sampling_rate: int = 48000, + track_id: int = 1, + nal_length_size: int = 4, + kid: Optional[bytes] = None, + iv_size: int = 8, + constant_iv: Optional[bytes] = None, +) -> bytes: + """ + Build a complete ftyp + moov initialization segment. + + stream_type: "video" | "audio" | "text". 
+ fourcc: Smooth FourCC ("H264"/"AVC1"/"DAVC", "HVC1"/"HEV1", "DVHE"/"DVH1", + "AACL"/"AACH", "EC-3", "TTML"). + codec_private_data: hex string from the manifest QualityLevel. + nal_length_size: manifest NALUnitLengthField (bytes per NAL length prefix). + kid: 16-byte default key id; when set, the sample entry is wrapped for CENC. + iv_size / constant_iv: tenc IV form (see build_sinf). + """ + if stream_type not in ("video", "audio", "text"): + raise ValueError(f"Unsupported stream type: {stream_type}") + fourcc = (fourcc or "").upper() + cpd = binascii.unhexlify(codec_private_data) if codec_private_data else b"" + encrypted = kid is not None + # mdhd packs exactly three a-z letters; anything else (2-letter tags, + # uppercase) would underflow the 5-bit fields, so fall back to "und". + lang = (language or "").lower() + if len(lang) != 3 or not all("a" <= c <= "z" for c in lang): + lang = "und" + + # --- ftyp --- + ftyp = box(b"ftyp", b"isml" + u32.pack(1) + b"iso5" + b"iso6" + b"piff" + b"msdh") + + # --- mvhd --- + mvhd = full_box( + b"mvhd", 1, 0, + u64.pack(EPOCH) + u64.pack(EPOCH) + u32.pack(timescale) + u64.pack(duration) + + s1616.pack(1) + s88.pack(1) + u16.pack(0) + u32.pack(0) * 2 + + UNITY_MATRIX + u32.pack(0) * 6 + u32.pack(0xFFFFFFFF), + ) + + # --- tkhd --- + tkhd = full_box( + b"tkhd", 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, + u64.pack(EPOCH) + u64.pack(EPOCH) + u32.pack(track_id) + u32.pack(0) + + u64.pack(duration) + u32.pack(0) * 2 + s16.pack(0) + s16.pack(0) + + s88.pack(1 if stream_type == "audio" else 0) + u16.pack(0) + UNITY_MATRIX + + u1616.pack(width) + u1616.pack(height), + ) + + # --- mdhd + hdlr --- + packed_lang = ((ord(lang[0]) - 0x60) << 10) | ((ord(lang[1]) - 0x60) << 5) | (ord(lang[2]) - 0x60) + mdhd = full_box( + b"mdhd", 1, 0, + u64.pack(EPOCH) + u64.pack(EPOCH) + u32.pack(timescale) + u64.pack(duration) + + u16.pack(packed_lang) + u16.pack(0), + ) + if stream_type == "audio": + hdlr = full_box(b"hdlr", 0, 0, 
u32.pack(0) + b"soun" + u32.pack(0) * 3 + b"SoundHandler\0") + media_header = full_box(b"smhd", 0, 0, s88.pack(0) + u16.pack(0)) + elif stream_type == "text": + hdlr = full_box(b"hdlr", 0, 0, u32.pack(0) + b"subt" + u32.pack(0) * 3 + b"SubtitleHandler\0") + media_header = full_box(b"sthd", 0, 0, b"") + else: + hdlr = full_box(b"hdlr", 0, 0, u32.pack(0) + b"vide" + u32.pack(0) * 3 + b"VideoHandler\0") + media_header = full_box(b"vmhd", 0, 1, u16.pack(0) + u16.pack(0) * 3) + + # --- dinf --- + dref = full_box(b"dref", 0, 0, u32.pack(1) + full_box(b"url ", 0, SELF_CONTAINED, b"")) + dinf = box(b"dinf", dref) + + # --- stsd sample entry --- + sample_entry_payload = u8.pack(0) * 6 + u16.pack(1) # reserved + data reference index + if stream_type == "video": + sample_entry_payload += ( + u16.pack(0) + u16.pack(0) + u32.pack(0) * 3 + + u16.pack(width) + u16.pack(height) + + u1616.pack(0x48) + u1616.pack(0x48) + u32.pack(0) + u16.pack(1) + + u8.pack(0) * 32 + u16.pack(0x18) + s16.pack(-1) + ) + if fourcc in ("H264", "AVC1", "DAVC"): + config_box = build_avcc(cpd, nal_length_size) + codec_fourcc = b"avc1" + elif fourcc in ("HVC1", "HEV1", "HEVC", "H265"): + config_box = build_hvcc(cpd, nal_length_size) + codec_fourcc = b"hvc1" + elif fourcc in ("DVHE", "DVH1"): + # Dolby Vision over HEVC: same hvcC config, dvh1 sample entry. + config_box = build_hvcc(cpd, nal_length_size) + codec_fourcc = b"dvh1" + else: + raise NotImplementedError(f"Unsupported video FourCC: {fourcc}") + sample_entry_payload += config_box + if encrypted: + sample_entry_payload += build_sinf(codec_fourcc, kid, iv_size, constant_iv) + sample_entry_box = box(b"encv", sample_entry_payload) + else: + sample_entry_box = box(codec_fourcc, sample_entry_payload) + elif stream_type == "audio": + # samplerate is 16.16 fixed-point; rates above 65535 Hz are written as 0 + # (decoders read the real rate from the codec config), matching ffmpeg. 
+ sample_entry_payload += ( + u32.pack(0) * 2 + u16.pack(channels) + u16.pack(bits_per_sample) + + u16.pack(0) + u16.pack(0) + u32.pack((sampling_rate if sampling_rate <= 0xFFFF else 0) << 16) + ) + if fourcc in ("AACL", "AACH", "AAC"): + if not cpd: + cpd = synthesize_aac_codec_private_data(fourcc, sampling_rate, channels) + sample_entry_payload += build_esds(cpd) + codec_fourcc = b"mp4a" + elif fourcc == "EC-3": + dec3 = build_dec3(cpd) + if dec3: + sample_entry_payload += dec3 + codec_fourcc = b"ec-3" + else: + raise NotImplementedError(f"Unsupported audio FourCC: {fourcc}") + if encrypted: + sample_entry_payload += build_sinf(codec_fourcc, kid, iv_size, constant_iv) + sample_entry_box = box(b"enca", sample_entry_payload) + else: + sample_entry_box = box(codec_fourcc, sample_entry_payload) + else: # text + if fourcc in ("TTML", "STPP", "DFXP"): + # XMLSubtitleSampleEntry: namespace + schema_location + aux mime types. + sample_entry_payload += TTML_NAMESPACE + b"\0" + b"\0" + sample_entry_box = box(b"stpp", sample_entry_payload) + else: + raise NotImplementedError(f"Unsupported text FourCC: {fourcc}") + + stsd = full_box(b"stsd", 0, 0, u32.pack(1) + sample_entry_box) + + # --- empty sample tables (fragmented: real samples live in moof/traf) --- + stbl = box( + b"stbl", + stsd + + full_box(b"stts", 0, 0, u32.pack(0)) + + full_box(b"stsc", 0, 0, u32.pack(0)) + + full_box(b"stsz", 0, 0, u32.pack(0) + u32.pack(0)) + + full_box(b"stco", 0, 0, u32.pack(0)), + ) + + minf = box(b"minf", media_header + dinf + stbl) + mdia = box(b"mdia", mdhd + hdlr + minf) + trak = box(b"trak", tkhd + mdia) + + # --- mvex (mehd + trex) signals a fragmented file --- + mehd = full_box(b"mehd", 1, 0, u64.pack(duration)) + trex = full_box( + b"trex", 0, 0, + u32.pack(track_id) + u32.pack(1) + u32.pack(0) + u32.pack(0) + u32.pack(0), + ) + mvex = box(b"mvex", mehd + trex) + + moov = box(b"moov", mvhd + trak + mvex) + return ftyp + moov