Initial Commit

This commit is contained in:
Andy
2025-07-18 00:46:05 +00:00
commit d37014f53f
94 changed files with 17458 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
from .dash import DASH
from .hls import HLS
from .ism import ISM
__all__ = ("DASH", "HLS", "ISM")

View File

@@ -0,0 +1,800 @@
from __future__ import annotations
import base64
import html
import logging
import math
import re
import sys
from copy import copy
from functools import partial
from pathlib import Path
from typing import Any, Callable, Optional, Union
from urllib.parse import urljoin, urlparse
from uuid import UUID
from zlib import crc32
import requests
from langcodes import Language, tag_is_valid
from lxml.etree import Element, ElementTree
from pyplayready.system.pssh import PSSH as PR_PSSH
from pywidevine.cdm import Cdm as WidevineCdm
from pywidevine.pssh import PSSH
from requests import Session
from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
from unshackle.core.downloaders import requests as requests_downloader
from unshackle.core.drm import DRM_T, PlayReady, Widevine
from unshackle.core.events import events
from unshackle.core.tracks import Audio, Subtitle, Tracks, Video
from unshackle.core.utilities import is_close_match, try_ensure_utf8
from unshackle.core.utils.xml import load_xml
class DASH:
def __init__(self, manifest, url: str):
if manifest is None:
raise ValueError("DASH manifest must be provided.")
if manifest.tag != "MPD":
raise TypeError(f"Expected 'MPD' document, but received a '{manifest.tag}' document instead.")
if not url:
raise requests.URLRequired("DASH manifest URL must be provided for relative path computations.")
if not isinstance(url, str):
raise TypeError(f"Expected url to be a {str}, not {url!r}")
self.manifest = manifest
self.url = url
@classmethod
def from_url(cls, url: str, session: Optional[Session] = None, **args: Any) -> DASH:
    """
    Request an MPD document from a URL and build an instance from it.

    Parameters:
        url: URL of the MPD document.
        session: requests Session used for the request. A new one is created
            when none is given.
        **args: Extra keyword arguments forwarded to `session.get()`.

    Returns:
        An instance of the class this was called on (so subclasses get an
        instance of themselves), bound to the final URL of the response.

    Raises:
        requests.URLRequired: If url is empty.
        TypeError: If url is not a string or session is not a Session.
        requests.ConnectionError: If the response status is not OK.
    """
    if not url:
        raise requests.URLRequired("DASH manifest URL must be provided for relative path computations.")
    if not isinstance(url, str):
        raise TypeError(f"Expected url to be a {str}, not {url!r}")
    if not session:
        session = Session()
    elif not isinstance(session, Session):
        raise TypeError(f"Expected session to be a {Session}, not {session!r}")
    res = session.get(url, **args)
    # the response URL differs from the requested one after e.g. a redirect;
    # relative paths must then be computed against the response URL
    if res.url != url:
        url = res.url
    if not res.ok:
        raise requests.ConnectionError("Failed to request the MPD document.", response=res)
    # go through `cls` rather than the hard-coded base class so that
    # subclasses calling from_url() are not silently downgraded to DASH
    return cls.from_text(res.text, url)
@classmethod
def from_text(cls, text: str, url: str) -> DASH:
if not text:
raise ValueError("DASH manifest Text must be provided.")
if not isinstance(text, str):
raise TypeError(f"Expected text to be a {str}, not {text!r}")
if not url:
raise requests.URLRequired("DASH manifest URL must be provided for relative path computations.")
if not isinstance(url, str):
raise TypeError(f"Expected url to be a {str}, not {url!r}")
manifest = load_xml(text)
return cls(manifest, url)
def to_tracks(
self, language: Optional[Union[str, Language]] = None, period_filter: Optional[Callable] = None
) -> Tracks:
"""
Convert an MPEG-DASH document to Video, Audio and Subtitle Track objects.
Parameters:
language: The Title's Original Recorded Language. It will also be used as a fallback
track language value if the manifest does not list language information.
period_filter: Optional callable that receives a Period element. Periods for which
it returns a truthy value are skipped.
Returns:
A Tracks collection with one Track per usable Representation.
Raises:
ValueError: If a Representation has neither a contentType nor a mimeType, has an
unsupported or unknown content type, or no valid language could be derived for it.
Every Track's url is set to the manifest URL. The manifest, Period, AdaptationSet and
Representation elements are stored under `data["dash"]` for use when downloading.
"""
tracks = Tracks()
for period in self.manifest.findall("Period"):
if callable(period_filter) and period_filter(period):
continue
# skip Periods that declare a SegmentType value other than "content";
# Periods without one are treated as "content"
if next(iter(period.xpath("SegmentType/@value")), "content") != "content":
continue
for adaptation_set in period.findall("AdaptationSet"):
if self.is_trick_mode(adaptation_set):
# we don't want trick mode streams (they are only used for fast-forward/rewind)
continue
for rep in adaptation_set.findall("Representation"):
# attribute/element lookups that prefer the Representation and fall back to
# (or, for findall, are combined with) the AdaptationSet
get = partial(self._get, adaptation_set=adaptation_set, representation=rep)
findall = partial(self._findall, adaptation_set=adaptation_set, representation=rep, both=True)
segment_base = rep.find("SegmentBase")
codecs = get("codecs")
content_type = get("contentType")
mime_type = get("mimeType")
# without an explicit contentType, use the major type of the mimeType
if not content_type and mime_type:
content_type = mime_type.split("/")[0]
if not content_type and not mime_type:
raise ValueError("Unable to determine the format of a Representation, cannot continue...")
if mime_type == "application/mp4" or content_type == "application":
# likely mp4-boxed subtitles
# TODO: It may not actually be subtitles
try:
real_codec = Subtitle.Codec.from_mime(codecs)
content_type = "text"
mime_type = f"application/mp4; codecs='{real_codec.value.lower()}'"
except ValueError:
raise ValueError(f"Unsupported content type '{content_type}' with codecs of '{codecs}'")
if content_type == "text" and mime_type and "/mp4" not in mime_type:
# mimeType likely specifies the subtitle codec better than `codecs`
codecs = mime_type.split("/")[1]
if content_type == "video":
track_type = Video
track_codec = Video.Codec.from_codecs(codecs)
track_fps = get("frameRate")
# no frameRate attribute: fall back to the SegmentBase timescale attribute
if not track_fps and segment_base is not None:
track_fps = segment_base.get("timescale")
track_args = dict(
range_=self.get_video_range(
codecs, findall("SupplementalProperty"), findall("EssentialProperty")
),
bitrate=get("bandwidth") or None,
width=get("width") or 0,
height=get("height") or 0,
fps=track_fps or None,
)
elif content_type == "audio":
track_type = Audio
track_codec = Audio.Codec.from_codecs(codecs)
track_args = dict(
bitrate=get("bandwidth") or None,
# first AudioChannelConfiguration value, preferring the Representation's
channels=next(
iter(
rep.xpath("AudioChannelConfiguration/@value")
or adaptation_set.xpath("AudioChannelConfiguration/@value")
),
None,
),
joc=self.get_ddp_complexity_index(adaptation_set, rep),
descriptive=self.is_descriptive(adaptation_set),
)
elif content_type == "text":
track_type = Subtitle
# "vtt" is assumed when no codec information is available
track_codec = Subtitle.Codec.from_codecs(codecs or "vtt")
track_args = dict(
cc=self.is_closed_caption(adaptation_set),
sdh=self.is_sdh(adaptation_set),
forced=self.is_forced(adaptation_set),
)
elif content_type == "image":
# we don't want what's likely thumbnails for the seekbar
continue
else:
raise ValueError(f"Unknown Track Type '{content_type}'")
track_lang = self.get_language(adaptation_set, rep, fallback=language)
if not track_lang:
# build an error message that explains why the fallback did not help either
msg = "Language information could not be derived from a Representation."
if language is None:
msg += " No fallback language was provided when calling DASH.to_tracks()."
elif not tag_is_valid((str(language) or "").strip()) or str(language).startswith("und"):
msg += f" The fallback language provided is also invalid: {language}"
raise ValueError(msg)
# for some reason it's incredibly common for services to not provide
# a good and actually unique track ID, sometimes because of the lang
# dialect not being represented in the id, or the bitrate, or such.
# this combines all of them as one and hashes it to keep it small(ish).
# NOTE(review): `get("bitrate")` below reads an attribute literally named "bitrate",
# while the track_args above read "bandwidth" - confirm which is intended
# (changing it would change every generated track ID).
track_id = hex(
crc32(
"{codec}-{lang}-{bitrate}-{base_url}-{ids}-{track_args}".format(
codec=codecs,
lang=track_lang,
bitrate=get("bitrate"),
base_url=(rep.findtext("BaseURL") or "").split("?")[0],
ids=[get("audioTrackId"), get("id"), period.get("id")],
track_args=track_args,
).encode()
)
)[2:]
tracks.add(
track_type(
id_=track_id,
url=self.url,
codec=track_codec,
language=track_lang,
is_original_lang=bool(language and is_close_match(track_lang, [language])),
descriptor=Video.Descriptor.DASH,
data={
"dash": {
"manifest": self.manifest,
"period": period,
"adaptation_set": adaptation_set,
"representation": rep,
}
},
**track_args,
)
)
# only get tracks from the first main-content period
break
return tracks
@staticmethod
def download_track(
track: AnyTrack,
save_path: Path,
save_dir: Path,
progress: partial,
session: Optional[Session] = None,
proxy: Optional[str] = None,
max_workers: Optional[int] = None,
license_widevine: Optional[Callable] = None,
*,
cdm: Optional[object] = None,
):
"""
Download every segment of a DASH track, merge them into one file and decrypt it if needed.
Parameters:
track: Track to download. Its `data["dash"]` must hold the "manifest", "period",
"adaptation_set" and "representation" elements.
save_path: File the init data and all merged segments are written to.
save_dir: Directory the individual segments are downloaded into. Segment files are
deleted as they are merged and the directory is removed at the end.
progress: Callable invoked with keyword arguments to report progress.
session: requests Session used for init-data requests and whose headers and cookies
are handed to the downloader. A new one is created when none is given.
proxy: Proxy URI. Set on the session under the "all" key and passed to the downloader.
max_workers: Passed through to the downloader.
license_widevine: Callable invoked as `license_widevine(drm, track_kid=track_kid)`
to obtain content keys. Required when the selected DRM is Widevine or PlayReady.
cdm: Passed to `track.get_drm_for_cdm()` to select the DRM system to license.
Raises:
TypeError: If session is not a requests Session.
ValueError: If a segment URL cannot be made absolute, the Period duration cannot be
determined, or DRM is present but no license_widevine callable was supplied.
Exits the process with status 1 if no way of locating segments is found.
"""
if not session:
session = Session()
elif not isinstance(session, Session):
raise TypeError(f"Expected session to be a {Session}, not {session!r}")
if proxy:
session.proxies.update({"all": proxy})
log = logging.getLogger("DASH")
manifest: ElementTree = track.data["dash"]["manifest"]
period: Element = track.data["dash"]["period"]
adaptation_set: Element = track.data["dash"]["adaptation_set"]
representation: Element = track.data["dash"]["representation"]
# Preserve existing DRM if it was set by the service, especially when service set Widevine
# but manifest only contains PlayReady protection (common scenario for some services)
existing_drm = track.drm
manifest_drm = DASH.get_drm(
representation.findall("ContentProtection") + adaptation_set.findall("ContentProtection")
)
# Only override existing DRM if:
# 1. No existing DRM was set, OR
# 2. Existing DRM contains same type as manifest DRM, OR
# 3. Existing DRM is not Widevine (preserve Widevine when service explicitly set it)
should_override_drm = (
not existing_drm
or (
existing_drm
and manifest_drm
and any(isinstance(existing, type(manifest)) for existing in existing_drm for manifest in manifest_drm)
)
or (existing_drm and not any(isinstance(drm, Widevine) for drm in existing_drm))
)
if should_override_drm:
track.drm = manifest_drm
else:
track.drm = existing_drm
# resolve the base URL chain: manifest -> Period -> Representation
manifest_base_url = manifest.findtext("BaseURL")
if not manifest_base_url:
manifest_base_url = track.url
elif not re.match("^https?://", manifest_base_url, re.IGNORECASE):
manifest_base_url = urljoin(track.url, f"./{manifest_base_url}")
period_base_url = urljoin(manifest_base_url, period.findtext("BaseURL"))
rep_base_url = urljoin(period_base_url, representation.findtext("BaseURL"))
period_duration = period.get("duration") or manifest.get("mediaPresentationDuration")
init_data: Optional[bytes] = None
# each segment-addressing element is taken from the Representation,
# falling back to the AdaptationSet
segment_template = representation.find("SegmentTemplate")
if segment_template is None:
segment_template = adaptation_set.find("SegmentTemplate")
segment_list = representation.find("SegmentList")
if segment_list is None:
segment_list = adaptation_set.find("SegmentList")
segment_base = representation.find("SegmentBase")
if segment_base is None:
segment_base = adaptation_set.find("SegmentBase")
# list of (url, optional byte range) pairs, in download order
segments: list[tuple[str, Optional[str]]] = []
segment_timescale: float = 0
segment_durations: list[int] = []
track_kid: Optional[UUID] = None
if segment_template is not None:
# work on a copy as the template attributes are rewritten to absolute URLs below
segment_template = copy(segment_template)
start_number = int(segment_template.get("startNumber") or 1)
end_number = int(segment_template.get("endNumber") or 0) or None
segment_timeline = segment_template.find("SegmentTimeline")
segment_timescale = float(segment_template.get("timescale") or 1)
for item in ("initialization", "media"):
value = segment_template.get(item)
if not value:
continue
if not re.match("^https?://", value, re.IGNORECASE):
if not rep_base_url:
raise ValueError("Resolved Segment URL is not absolute, and no Base URL is available.")
value = urljoin(rep_base_url, value)
# carry over the manifest URL's query string when the segment URL has none
if not urlparse(value).query:
manifest_url_query = urlparse(track.url).query
if manifest_url_query:
value += f"?{manifest_url_query}"
segment_template.set(item, value)
init_url = segment_template.get("initialization")
if init_url:
res = session.get(
DASH.replace_fields(
init_url, Bandwidth=representation.get("bandwidth"), RepresentationID=representation.get("id")
)
)
res.raise_for_status()
init_data = res.content
track_kid = track.get_key_id(init_data)
if segment_timeline is not None:
current_time = 0
# NOTE(review): despite its name, `segment_durations` is filled here with the running
# start time of each segment (the value later substituted for $Time$), not its "d"
# duration - confirm consumers of track.data["dash"]["segment_durations"] expect this
for s in segment_timeline.findall("S"):
if s.get("t"):
current_time = int(s.get("t"))
for _ in range(1 + (int(s.get("r") or 0))):
segment_durations.append(current_time)
current_time += int(s.get("d"))
if not end_number:
end_number = len(segment_durations)
for t, n in zip(segment_durations, range(start_number, end_number + 1)):
segments.append(
(
DASH.replace_fields(
segment_template.get("media"),
Bandwidth=representation.get("bandwidth"),
Number=n,
RepresentationID=representation.get("id"),
Time=t,
),
None,
)
)
else:
if not period_duration:
raise ValueError("Duration of the Period was unable to be determined.")
period_duration = DASH.pt_to_sec(period_duration)
# NOTE(review): float() is applied before the `or 1` fallback, so a template without
# a "duration" attribute raises TypeError here instead of defaulting to 1 - confirm
segment_duration = float(segment_template.get("duration")) or 1
if not end_number:
end_number = math.ceil(period_duration / (segment_duration / segment_timescale))
for s in range(start_number, end_number + 1):
segments.append(
(
DASH.replace_fields(
segment_template.get("media"),
Bandwidth=representation.get("bandwidth"),
Number=s,
RepresentationID=representation.get("id"),
Time=s,
),
None,
)
)
# TODO: Should we floor/ceil/round, or is int() ok?
segment_durations.append(int(segment_duration))
elif segment_list is not None:
segment_timescale = float(segment_list.get("timescale") or 1)
init_data = None
initialization = segment_list.find("Initialization")
if initialization is not None:
source_url = initialization.get("sourceURL")
if not source_url:
source_url = rep_base_url
elif not re.match("^https?://", source_url, re.IGNORECASE):
source_url = urljoin(rep_base_url, f"./{source_url}")
if initialization.get("range"):
init_range_header = {"Range": f"bytes={initialization.get('range')}"}
else:
init_range_header = None
res = session.get(url=source_url, headers=init_range_header)
res.raise_for_status()
init_data = res.content
track_kid = track.get_key_id(init_data)
segment_urls = segment_list.findall("SegmentURL")
for segment_url in segment_urls:
media_url = segment_url.get("media")
if not media_url:
media_url = rep_base_url
elif not re.match("^https?://", media_url, re.IGNORECASE):
media_url = urljoin(rep_base_url, f"./{media_url}")
segments.append((media_url, segment_url.get("mediaRange")))
segment_durations.append(int(segment_url.get("duration") or 1))
elif segment_base is not None:
# single-file Representation: fetch the init range, then the remainder as one segment
media_range = None
init_data = None
initialization = segment_base.find("Initialization")
if initialization is not None:
if initialization.get("range"):
init_range_header = {"Range": f"bytes={initialization.get('range')}"}
else:
init_range_header = None
res = session.get(url=rep_base_url, headers=init_range_header)
res.raise_for_status()
init_data = res.content
track_kid = track.get_key_id(init_data)
# the total size is read from the part after "/" in the Content-Range response header
total_size = res.headers.get("Content-Range", "").split("/")[-1]
if total_size:
media_range = f"{len(init_data)}-{total_size}"
segments.append((rep_base_url, media_range))
elif rep_base_url:
# no segment information at all: treat the Representation base URL as one whole file
segments.append((rep_base_url, None))
else:
log.error("Could not find a way to get segments from this MPD manifest.")
log.debug(track.url)
sys.exit(1)
# TODO: Should we floor/ceil/round, or is int() ok?
track.data["dash"]["timescale"] = int(segment_timescale)
track.data["dash"]["segment_durations"] = segment_durations
if not track.drm and isinstance(track, (Video, Audio)):
# NOTE(review): init_data can still be None at this point (e.g. no initialization
# was fetched) - confirm Widevine.from_init_data() copes with that
try:
track.drm = [Widevine.from_init_data(init_data)]
except Widevine.Exceptions.PSSHNotFound:
# it might not have Widevine DRM, or might not have found the PSSH
log.warning("No Widevine PSSH was found for this track, is it DRM free?")
if track.drm:
# fall back to deriving the key ID from the first segment when the init data gave none
track_kid = track_kid or track.get_key_id(url=segments[0][0], session=session)
drm = track.get_drm_for_cdm(cdm)
if isinstance(drm, (Widevine, PlayReady)):
# license and grab content keys
try:
if not license_widevine:
raise ValueError("license_widevine func must be supplied to use DRM")
progress(downloaded="LICENSING")
license_widevine(drm, track_kid=track_kid)
progress(downloaded="[yellow]LICENSED")
except Exception: # noqa
DOWNLOAD_CANCELLED.set() # skip pending track downloads
progress(downloaded="[red]FAILED")
raise
else:
drm = None
if DOWNLOAD_LICENCE_ONLY.is_set():
progress(downloaded="[yellow]SKIPPED")
return
progress(total=len(segments))
downloader = track.downloader
if downloader.__name__ == "aria2c" and any(bytes_range is not None for url, bytes_range in segments):
# aria2(c) is shit and doesn't support the Range header, fallback to the requests downloader
downloader = requests_downloader
log.warning("Falling back to the requests downloader as aria2(c) doesn't support the Range header")
downloader_args = dict(
urls=[
{"url": url, "headers": {"Range": f"bytes={bytes_range}"} if bytes_range else {}}
for url, bytes_range in segments
],
output_dir=save_dir,
# zero-padded index template, padded to the digit count of the number of segments
filename="{i:0%d}.mp4" % (len(str(len(segments)))),
headers=session.headers,
cookies=session.cookies,
proxy=proxy,
max_workers=max_workers,
)
if downloader.__name__ == "n_m3u8dl_re":
downloader_args.update({"filename": track.id, "track": track})
for status_update in downloader(**downloader_args):
file_downloaded = status_update.get("file_downloaded")
if file_downloaded:
events.emit(events.Types.SEGMENT_DOWNLOADED, track=track, segment=file_downloaded)
else:
downloaded = status_update.get("downloaded")
# prefix speed readouts (values ending in "/s") with the protocol name
if downloaded and downloaded.endswith("/s"):
status_update["downloaded"] = f"DASH {downloaded}"
progress(**status_update)
# see https://github.com/devine-dl/devine/issues/71
for control_file in save_dir.glob("*.aria2__temp"):
control_file.unlink()
# merge in sorted path order; the zero-padded segment filenames rely on this
segments_to_merge = [x for x in sorted(save_dir.iterdir()) if x.is_file()]
with open(save_path, "wb") as f:
if init_data:
f.write(init_data)
if len(segments_to_merge) > 1:
progress(downloaded="Merging", completed=0, total=len(segments_to_merge))
for segment_file in segments_to_merge:
segment_data = segment_file.read_bytes()
# TODO: fix encoding after decryption?
if (
not drm
and isinstance(track, Subtitle)
and track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML)
):
segment_data = try_ensure_utf8(segment_data)
# NOTE(review): as the two literals below appear here they hold raw directional mark
# characters, which makes each replace() a no-op - presumably they were HTML entity
# strings in the original file; verify against the upstream source
segment_data = (
segment_data.decode("utf8")
.replace("‎", html.unescape("‎"))
.replace("‏", html.unescape("‏"))
.encode("utf8")
)
f.write(segment_data)
f.flush()
segment_file.unlink()
progress(advance=1)
track.path = save_path
events.emit(events.Types.TRACK_DOWNLOADED, track=track)
if drm:
progress(downloaded="Decrypting", completed=0, total=100)
drm.decrypt(save_path)
track.drm = None
events.emit(events.Types.TRACK_DECRYPTED, track=track, drm=drm, segment=None)
progress(downloaded="Decrypting", advance=100)
# rmdir() only succeeds on an empty directory, i.e. once every segment file was merged
save_dir.rmdir()
progress(downloaded="Downloaded")
@staticmethod
def _get(item: str, adaptation_set: Element, representation: Optional[Element] = None) -> Optional[Any]:
"""Helper to get a requested item from the Representation, otherwise from the AdaptationSet."""
adaptation_set_item = adaptation_set.get(item)
if representation is None:
return adaptation_set_item
representation_item = representation.get(item)
if representation_item is not None:
return representation_item
return adaptation_set_item
@staticmethod
def _findall(
item: str, adaptation_set: Element, representation: Optional[Element] = None, both: bool = False
) -> list[Any]:
"""
Helper to get all requested items from the Representation, otherwise from the AdaptationSet.
Optionally, you may pass both=True to keep both values (where available).
"""
adaptation_set_items = adaptation_set.findall(item)
if representation is None:
return adaptation_set_items
representation_items = representation.findall(item)
if both:
return representation_items + adaptation_set_items
if representation_items:
return representation_items
return adaptation_set_items
@staticmethod
def get_language(
    adaptation_set: Element,
    representation: Optional[Element] = None,
    fallback: Optional[Union[str, Language]] = None,
) -> Optional[Language]:
    """
    Work out the Language of a Representation or AdaptationSet.

    Candidates are tried in this order: the Representation's `lang`, a language
    embedded in the Representation's `id`, the AdaptationSet's `lang`, and
    finally the fallback. The first candidate that is a valid tag and does not
    start with "und" is returned; None is returned when there is none.
    """
    candidates = []
    if representation is not None:
        candidates.append(representation.get("lang"))
        # derive language from somewhat common id string format
        # the format is typically "{rep_id}_{lang}={bitrate}" or similar
        rep_id = representation.get("id")
        if rep_id:
            id_match = re.match(r"\w+_(\w+)=\d+", rep_id)
            if id_match:
                candidates.append(id_match.group(1))
    candidates.append(adaptation_set.get("lang"))
    if fallback:
        candidates.append(fallback)

    for candidate in candidates:
        tag = str(candidate).strip()
        if tag_is_valid(tag) and not tag.startswith("und"):
            return Language.get(tag)
    return None
@staticmethod
def get_video_range(
    codecs: str, all_supplemental_props: list[Element], all_essential_props: list[Element]
) -> Video.Range:
    """
    Determine the video range of a Representation.

    Codec strings starting with one of the "dva1", "dvav", "dvhe" or "dvh1"
    prefixes are reported as Video.Range.DV. Otherwise the range is derived
    from the CICP colour primaries, transfer characteristics and matrix
    coefficients found in the given properties, each defaulting to 0 when
    not present.
    """
    if codecs.startswith(("dva1", "dvav", "dvhe", "dvh1")):
        return Video.Range.DV

    props = all_supplemental_props + all_essential_props

    def cicp_value(scheme: str) -> int:
        # value of the first property using this scheme (supplemental ones first)
        for prop in props:
            if prop.get("schemeIdUri") == scheme:
                return int(prop.get("value"))
        return 0

    return Video.Range.from_cicp(
        primaries=cicp_value("urn:mpeg:mpegB:cicp:ColourPrimaries"),
        transfer=cicp_value("urn:mpeg:mpegB:cicp:TransferCharacteristics"),
        matrix=cicp_value("urn:mpeg:mpegB:cicp:MatrixCoefficients"),
    )
@staticmethod
def is_trick_mode(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is a Trick-Mode stream."""
essential_props = adaptation_set.findall("EssentialProperty")
supplemental_props = adaptation_set.findall("SupplementalProperty")
return any(
prop.get("schemeIdUri") == "http://dashif.org/guidelines/trickmode"
for prop in essential_props + supplemental_props
)
@staticmethod
def is_descriptive(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is Descriptive."""
return any(
(x.get("schemeIdUri"), x.get("value"))
in (("urn:mpeg:dash:role:2011", "descriptive"), ("urn:tva:metadata:cs:AudioPurposeCS:2007", "1"))
for x in adaptation_set.findall("Accessibility")
)
@staticmethod
def is_forced(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is a Forced Subtitle."""
return any(
x.get("schemeIdUri") == "urn:mpeg:dash:role:2011"
and x.get("value") in ("forced-subtitle", "forced_subtitle")
for x in adaptation_set.findall("Role")
)
@staticmethod
def is_sdh(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is for the Hearing Impaired."""
return any(
(x.get("schemeIdUri"), x.get("value")) == ("urn:tva:metadata:cs:AudioPurposeCS:2007", "2")
for x in adaptation_set.findall("Accessibility")
)
@staticmethod
def is_closed_caption(adaptation_set: Element) -> bool:
"""Check if contents of Adaptation Set is a Closed Caption Subtitle."""
return any(
(x.get("schemeIdUri"), x.get("value")) == ("urn:mpeg:dash:role:2011", "caption")
for x in adaptation_set.findall("Role")
)
@staticmethod
def get_ddp_complexity_index(adaptation_set: Element, representation: Optional[Element]) -> Optional[int]:
"""Get the DD+ Complexity Index (if any) from the AdaptationSet or Representation."""
return next(
(
int(x.get("value"))
for x in DASH._findall("SupplementalProperty", adaptation_set, representation, both=True)
if x.get("schemeIdUri") == "tag:dolby.com,2018:dash:EC3_ExtensionComplexityIndex:2018"
),
None,
)
@staticmethod
def get_drm(protections: list[Element]) -> list[DRM_T]:
    """
    Build DRM objects from a list of ContentProtection elements.

    Widevine and PlayReady protections are recognised by their schemeIdUri.
    Protections of either kind that carry no PSSH/PRO text are skipped, as are
    protections using any other scheme.

    Returns:
        The Widevine and PlayReady objects, in the order their protections appear.
    """
    playready_urns = ("urn:uuid:9a04f079-9840-4286-ab92-e65be0885f95", "urn:microsoft:playready")
    found: list[DRM_T] = []
    for node in protections:
        scheme = (node.get("schemeIdUri") or "").lower()
        if scheme == WidevineCdm.urn:
            encoded_pssh = node.findtext("pssh")
            if encoded_pssh:
                wv_pssh = PSSH(encoded_pssh)
                # a base64 `kid` attribute is read first, an explicit `default_KID` overrides it
                key_id = node.get("kid")
                if key_id:
                    key_id = UUID(bytes=base64.b64decode(key_id))
                own_default_kid = node.get("default_KID")
                if own_default_kid:
                    key_id = UUID(own_default_kid)
                if not wv_pssh.key_ids and not key_id:
                    # still nothing: borrow the first default_KID found on any protection
                    key_id = next((UUID(p.get("default_KID")) for p in protections if p.get("default_KID")), None)
                found.append(Widevine(pssh=wv_pssh, kid=key_id))
        elif scheme in playready_urns:
            encoded_pro = (
                node.findtext("pssh")
                or node.findtext("pro")
                or node.findtext("{urn:microsoft:playready}pro")
            )
            if encoded_pro:
                playready_pssh = PR_PSSH(encoded_pro)
                encoded_kid = node.findtext("kid")
                key_id = None
                if encoded_kid:
                    # an undecodable kid is ignored rather than failing the whole protection
                    try:
                        key_id = UUID(bytes=base64.b64decode(encoded_kid))
                    except Exception:
                        key_id = None
                found.append(PlayReady(pssh=playready_pssh, kid=key_id, pssh_b64=encoded_pro))
    return found
@staticmethod
def pt_to_sec(d: Union[str, float]) -> float:
if isinstance(d, float):
return d
has_ymd = d[0:8] == "P0Y0M0DT"
if d[0:2] != "PT" and not has_ymd:
raise ValueError("Input data is not a valid time string.")
if has_ymd:
d = d[6:].upper() # skip `P0Y0M0DT`
else:
d = d[2:].upper() # skip `PT`
m = re.findall(r"([\d.]+.)", d)
return sum(float(x[0:-1]) * {"H": 60 * 60, "M": 60, "S": 1}[x[-1].upper()] for x in m)
@staticmethod
def replace_fields(url: str, **kwargs: Any) -> str:
for field, value in kwargs.items():
url = url.replace(f"${field}$", str(value))
m = re.search(rf"\${re.escape(field)}%([a-z0-9]+)\$", url, flags=re.I)
if m:
url = url.replace(m.group(), f"{value:{m.group(1)}}")
return url
__all__ = ("DASH",)

View File

@@ -0,0 +1,832 @@
from __future__ import annotations
import base64
import html
import json
import logging
import shutil
import subprocess
import sys
from functools import partial
from pathlib import Path
from typing import Any, Callable, Optional, Union
from urllib.parse import urljoin
from zlib import crc32
import httpx
import m3u8
import requests
from langcodes import Language, tag_is_valid
from m3u8 import M3U8
from pyplayready.cdm import Cdm as PlayReadyCdm
from pyplayready.system.pssh import PSSH as PR_PSSH
from pywidevine.cdm import Cdm as WidevineCdm
from pywidevine.pssh import PSSH as WV_PSSH
from requests import Session
from unshackle.core import binaries
from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
from unshackle.core.downloaders import requests as requests_downloader
from unshackle.core.drm import DRM_T, ClearKey, PlayReady, Widevine
from unshackle.core.events import events
from unshackle.core.tracks import Audio, Subtitle, Tracks, Video
from unshackle.core.utilities import get_extension, is_close_match, try_ensure_utf8
class HLS:
def __init__(self, manifest: M3U8, session: Optional[Union[Session, httpx.Client]] = None):
if not manifest:
raise ValueError("HLS manifest must be provided.")
if not isinstance(manifest, M3U8):
raise TypeError(f"Expected manifest to be a {M3U8}, not {manifest!r}")
if not manifest.is_variant:
raise ValueError("Expected the M3U(8) manifest to be a Variant Playlist.")
self.manifest = manifest
self.session = session or Session()
@classmethod
def from_url(cls, url: str, session: Optional[Union[Session, httpx.Client]] = None, **args: Any) -> HLS:
    """
    Request a Variant Playlist from a URL and build an instance from it.

    Parameters:
        url: URL of the M3U(8) document.
        session: requests Session or httpx Client used for the request. A new
            requests Session is created when none is given.
        **args: Extra keyword arguments forwarded to `session.get()`.

    Returns:
        An instance holding the parsed playlist and the session. The playlist
        is parsed relative to the final URL of the response.

    Raises:
        requests.URLRequired: If url is empty.
        TypeError: If url is not a string, session is of an unsupported type,
            or the session returned an unsupported response type.
        requests.ConnectionError: If the response has an error status.
    """
    if not url:
        raise requests.URLRequired("HLS manifest URL must be provided.")
    if not isinstance(url, str):
        raise TypeError(f"Expected url to be a {str}, not {url!r}")
    if not session:
        session = Session()
    elif not isinstance(session, (Session, httpx.Client)):
        raise TypeError(f"Expected session to be a {Session} or {httpx.Client}, not {session!r}")
    res = session.get(url, **args)
    # Handle both requests and httpx response objects
    if isinstance(res, requests.Response):
        failed = not res.ok
    elif isinstance(res, httpx.Response):
        failed = res.status_code >= 400
    else:
        raise TypeError(f"Expected response to be a requests.Response or httpx.Response, not {type(res)}")
    if failed:
        raise requests.ConnectionError("Failed to request the M3U(8) document.", response=res)
    # relative URIs in the playlist must resolve against the URL that actually served
    # it, which differs from the requested one after a redirect; httpx exposes a URL
    # object rather than a string, hence the str()
    final_url = str(res.url) if res.url else url
    master = m3u8.loads(res.text, uri=final_url)
    return cls(master, session)
@classmethod
def from_text(cls, text: str, url: str) -> HLS:
if not text:
raise ValueError("HLS manifest Text must be provided.")
if not isinstance(text, str):
raise TypeError(f"Expected text to be a {str}, not {text!r}")
if not url:
raise requests.URLRequired("HLS manifest URL must be provided for relative path computations.")
if not isinstance(url, str):
raise TypeError(f"Expected url to be a {str}, not {url!r}")
master = m3u8.loads(text, uri=url)
return cls(master)
def to_tracks(self, language: Union[str, Language]) -> Tracks:
"""
Convert a Variant Playlist M3U(8) document to Video, Audio and Subtitle Track objects.
Parameters:
language: Language you expect the Primary Track to be in. It is assigned to every
variant stream track and used as the fallback for media without a valid language.
Returns:
A Tracks collection with one Track per variant stream and one per media entry
that has a URI.
Raises:
ValueError: If no valid language could be derived for a media entry.
All Track objects' URL will be to another M3U(8) document. However, these documents
will be Invariant Playlists and contain the list of segments URIs among other metadata.
"""
# session-level keys come from the manifest's session keys when it has any,
# otherwise from HLS.parse_session_data_keys()
session_keys = list(self.manifest.session_keys or [])
if not session_keys:
session_keys = HLS.parse_session_data_keys(self.manifest, self.session)
session_drm = HLS.get_all_drm(session_keys)
# audio codec of each audio group, taken from the variant streams that reference the group
audio_codecs_by_group_id: dict[str, Audio.Codec] = {}
tracks = Tracks()
for playlist in self.manifest.playlists:
audio_group = playlist.stream_info.audio
if audio_group:
audio_codec = Audio.Codec.from_codecs(playlist.stream_info.codecs)
audio_codecs_by_group_id[audio_group] = audio_codec
# a variant whose codecs cannot be parsed as a video codec is treated as Audio;
# a variant with no codecs at all is treated as Video
try:
# TODO: Any better way to figure out the primary track type?
if playlist.stream_info.codecs:
Video.Codec.from_codecs(playlist.stream_info.codecs)
except ValueError:
primary_track_type = Audio
else:
primary_track_type = Video
tracks.add(
primary_track_type(
id_=hex(crc32(str(playlist).encode()))[2:],
url=urljoin(playlist.base_uri, playlist.uri),
codec=(
primary_track_type.Codec.from_codecs(playlist.stream_info.codecs)
if playlist.stream_info.codecs
else None
),
language=language, # HLS manifests do not seem to have language info
is_original_lang=True, # TODO: All we can do is assume Yes
bitrate=playlist.stream_info.average_bandwidth or playlist.stream_info.bandwidth,
descriptor=Video.Descriptor.HLS,
drm=session_drm,
data={"hls": {"playlist": playlist}},
# video track args
**(
dict(
# any codec with one of these prefixes marks the stream as Video.Range.DV,
# otherwise the range is derived from the stream's video range tag
range_=Video.Range.DV
if any(
codec.split(".")[0] in ("dva1", "dvav", "dvhe", "dvh1")
for codec in (playlist.stream_info.codecs or "").lower().split(",")
)
else Video.Range.from_m3u_range_tag(playlist.stream_info.video_range),
width=playlist.stream_info.resolution[0] if playlist.stream_info.resolution else None,
height=playlist.stream_info.resolution[1] if playlist.stream_info.resolution else None,
fps=playlist.stream_info.frame_rate,
)
if primary_track_type is Video
else {}
),
)
)
for media in self.manifest.media:
# media without a URI has no playlist of its own to point a track at
if not media.uri:
continue
joc = 0
if media.type == "AUDIO":
track_type = Audio
codec = audio_codecs_by_group_id.get(media.group_id)
# channels of the form "<number>/JOC": the number becomes `joc` and the
# channels value on the media object is overwritten with "5.1"
if media.channels and media.channels.endswith("/JOC"):
joc = int(media.channels.split("/JOC")[0])
media.channels = "5.1"
else:
# NOTE(review): every media type other than "AUDIO" ends up as a Subtitle here -
# confirm no other media types need handling
track_type = Subtitle
codec = Subtitle.Codec.WebVTT # assuming WebVTT, codec info isn't shown
# first valid, non-"und" tag out of the media's own language and the fallback
track_lang = next(
(
Language.get(option)
for x in (media.language, language)
for option in [(str(x) or "").strip()]
if tag_is_valid(option) and not option.startswith("und")
),
None,
)
if not track_lang:
msg = "Language information could not be derived for a media."
if language is None:
msg += " No fallback language was provided when calling HLS.to_tracks()."
elif not tag_is_valid((str(language) or "").strip()) or str(language).startswith("und"):
msg += f" The fallback language provided is also invalid: {language}"
raise ValueError(msg)
tracks.add(
track_type(
id_=hex(crc32(str(media).encode()))[2:],
url=urljoin(media.base_uri, media.uri),
codec=codec,
language=track_lang, # HLS media may not have language info, fallback if needed
is_original_lang=bool(language and is_close_match(track_lang, [language])),
descriptor=Audio.Descriptor.HLS,
# session DRM is only attached to audio media, not to subtitles
drm=session_drm if media.type == "AUDIO" else None,
data={"hls": {"media": media}},
# audio track args
**(
dict(
bitrate=0, # TODO: M3U doesn't seem to state bitrate?
channels=media.channels,
joc=joc,
descriptive="public.accessibility.describes-video" in (media.characteristics or ""),
)
if track_type is Audio
else dict(
forced=media.forced == "YES",
sdh="public.accessibility.describes-music-and-sound" in (media.characteristics or ""),
)
if track_type is Subtitle
else {}
),
)
)
return tracks
@staticmethod
def download_track(
    track: AnyTrack,
    save_path: Path,
    save_dir: Path,
    progress: partial,
    session: Optional[Union[Session, httpx.Client]] = None,
    proxy: Optional[str] = None,
    max_workers: Optional[int] = None,
    license_widevine: Optional[Callable] = None,
    *,
    cdm: Optional[object] = None,
) -> None:
    """
    Download every segment of a track's HLS media playlist and merge them.

    The playlist at `track.url` is requested and parsed, its segments are
    downloaded into `save_dir / "segments"`, decrypted when DRM applies,
    merged per discontinuity into `save_dir`, and finally merged into
    `save_path`. Progress is reported through `progress` and the
    SEGMENT_DOWNLOADED, TRACK_DECRYPTED and TRACK_DOWNLOADED events.

    Parameters:
        track: The track to download; `track.url` must be a media playlist.
        save_path: Final output file for the merged track.
        save_dir: Working directory; it is removed once the merge is done.
        progress: Callable used to report progress updates.
        session: requests Session or httpx Client used for HTTP requests.
            A new requests Session is created when omitted.
        proxy: Optional proxy URI applied to the session and the downloader.
        max_workers: Passed through to the downloader.
        license_widevine: Callable used to license Widevine/PlayReady DRM.
        cdm: Passed to `track.get_drm_for_cdm()` to choose the DRM system.

    Raises TypeError if the session or a response is of an unexpected type.
    Exits the process if the playlist request fails or has no segments.
    """
    if not session:
        session = Session()
    elif not isinstance(session, (Session, httpx.Client)):
        raise TypeError(f"Expected session to be a {Session} or {httpx.Client}, not {session!r}")

    if proxy:
        # Handle proxies differently based on session type
        if isinstance(session, Session):
            session.proxies.update({"all": proxy})
        elif isinstance(session, httpx.Client):
            session.proxies = {"http://": proxy, "https://": proxy}

    log = logging.getLogger("HLS")

    # Get the playlist text and handle both session types
    response = session.get(track.url)
    if isinstance(response, requests.Response):
        if not response.ok:
            log.error(f"Failed to request the invariant M3U8 playlist: {response.status_code}")
            sys.exit(1)
        playlist_text = response.text
    elif isinstance(response, httpx.Response):
        if response.status_code >= 400:
            log.error(f"Failed to request the invariant M3U8 playlist: {response.status_code}")
            sys.exit(1)
        playlist_text = response.text
    else:
        raise TypeError(f"Expected response to be a requests.Response or httpx.Response, not {type(response)}")

    master = m3u8.loads(playlist_text, uri=track.url)

    if not master.segments:
        log.error("Track's HLS playlist has no segments, expecting an invariant M3U8 playlist.")
        sys.exit(1)

    if track.drm:
        session_drm = track.get_drm_for_cdm(cdm)
        if isinstance(session_drm, (Widevine, PlayReady)):
            # license and grab content keys
            try:
                if not license_widevine:
                    raise ValueError("license_widevine func must be supplied to use DRM")
                progress(downloaded="LICENSING")
                license_widevine(session_drm)
                progress(downloaded="[yellow]LICENSED")
            except Exception:  # noqa
                DOWNLOAD_CANCELLED.set()  # skip pending track downloads
                progress(downloaded="[red]FAILED")
                raise
    else:
        session_drm = None

    if DOWNLOAD_LICENCE_ONLY.is_set():
        progress(downloaded="[yellow]SKIPPED")
        return

    unwanted_segments = [
        segment for segment in master.segments if callable(track.OnSegmentFilter) and track.OnSegmentFilter(segment)
    ]

    total_segments = len(master.segments) - len(unwanted_segments)
    progress(total=total_segments)

    downloader = track.downloader
    if downloader.__name__ == "aria2c" and any(x.byterange for x in master.segments if x not in unwanted_segments):
        downloader = requests_downloader
        log.warning("Falling back to the requests downloader as aria2(c) doesn't support the Range header")

    urls: list[dict[str, Any]] = []
    segment_durations: list[int] = []

    range_offset = 0
    for segment in master.segments:
        if segment in unwanted_segments:
            continue

        segment_durations.append(int(segment.duration))

        if segment.byterange:
            byte_range = HLS.calculate_byte_range(segment.byterange, range_offset)
            # An EXT-X-BYTERANGE without an explicit offset continues at the byte
            # right after the previous sub-range, so remember end + 1 as an int.
            range_offset = int(byte_range.split("-")[1]) + 1
        else:
            byte_range = None

        urls.append(
            {
                "url": urljoin(segment.base_uri, segment.uri),
                "headers": {"Range": f"bytes={byte_range}"} if byte_range else {},
            }
        )

    track.data["hls"]["segment_durations"] = segment_durations

    segment_save_dir = save_dir / "segments"

    skip_merge = False
    downloader_args = dict(
        urls=urls,
        output_dir=segment_save_dir,
        filename="{i:0%d}{ext}" % len(str(len(urls))),
        headers=session.headers,
        cookies=session.cookies,
        proxy=proxy,
        max_workers=max_workers,
    )

    if downloader.__name__ == "n_m3u8dl_re":
        # this downloader is handed the track and content keys and writes directly
        # into save_dir, so the per-segment merge stage below is skipped
        skip_merge = True
        downloader_args.update(
            {
                "output_dir": save_dir,
                "filename": track.id,
                "track": track,
                "content_keys": session_drm.content_keys if session_drm else None,
            }
        )

    for status_update in downloader(**downloader_args):
        file_downloaded = status_update.get("file_downloaded")
        if file_downloaded:
            events.emit(events.Types.SEGMENT_DOWNLOADED, track=track, segment=file_downloaded)
        else:
            downloaded = status_update.get("downloaded")
            if downloaded and downloaded.endswith("/s"):
                status_update["downloaded"] = f"HLS {downloaded}"
            progress(**status_update)

    # see https://github.com/devine-dl/devine/issues/71
    for control_file in segment_save_dir.glob("*.aria2__temp"):
        control_file.unlink()

    if not skip_merge:
        progress(total=total_segments, completed=0, downloaded="Merging")

        name_len = len(str(total_segments))
        discon_i = 0
        range_offset = 0
        map_data: Optional[tuple[m3u8.model.InitializationSection, bytes]] = None
        if session_drm:
            encryption_data: Optional[tuple[Optional[m3u8.Key], DRM_T]] = (None, session_drm)
        else:
            encryption_data: Optional[tuple[Optional[m3u8.Key], DRM_T]] = None

        # `i` counts only wanted segments (it matches the downloaded file names),
        # `real_i` is the position within the full playlist.
        i = -1
        for real_i, segment in enumerate(master.segments):
            if segment not in unwanted_segments:
                i += 1

            is_last_segment = (real_i + 1) == len(master.segments)

            def merge(to: Path, via: list[Path], delete: bool = False, include_map_data: bool = False):
                """
                Merge all files to a given path, optionally including map data.

                Parameters:
                    to: The output file with all merged data.
                    via: List of files to merge, in sequence.
                    delete: Delete the file once it's been merged.
                    include_map_data: Whether to include the init map data.
                """
                with open(to, "wb") as x:
                    if include_map_data and map_data and map_data[1]:
                        x.write(map_data[1])
                    for file in via:
                        x.write(file.read_bytes())
                        x.flush()
                        if delete:
                            file.unlink()

            def decrypt(include_this_segment: bool) -> Path:
                """
                Decrypt all segments that use the currently set DRM.

                All segments that will be decrypted with this DRM will be merged together
                in sequence, prefixed with the init data (if any), and then deleted. Once
                merged they will be decrypted. The merged and decrypted file names state
                the range of segments that were used.

                Parameters:
                    include_this_segment: Whether to include the current segment in the
                        list of segments to merge and decrypt. This should be False if
                        decrypting on EXT-X-KEY changes, or True when decrypting on the
                        last segment.

                Returns the decrypted path.
                """
                drm = encryption_data[1]
                first_segment_i = next(
                    int(file.stem) for file in sorted(segment_save_dir.iterdir()) if file.stem.isdigit()
                )
                last_segment_i = max(0, i - int(not include_this_segment))
                range_len = (last_segment_i - first_segment_i) + 1

                segment_range = f"{str(first_segment_i).zfill(name_len)}-{str(last_segment_i).zfill(name_len)}"
                merged_path = (
                    segment_save_dir / f"{segment_range}{get_extension(master.segments[last_segment_i].uri)}"
                )
                decrypted_path = segment_save_dir / f"{merged_path.stem}_decrypted{merged_path.suffix}"

                files = [
                    file
                    for file in sorted(segment_save_dir.iterdir())
                    if file.stem.isdigit() and first_segment_i <= int(file.stem) <= last_segment_i
                ]
                if not files:
                    raise ValueError(f"None of the segment files for {segment_range} exist...")
                elif len(files) != range_len:
                    raise ValueError(f"Missing {range_len - len(files)} segment files for {segment_range}...")

                if isinstance(drm, Widevine):
                    # with widevine we can merge all segments and decrypt once
                    merge(to=merged_path, via=files, delete=True, include_map_data=True)
                    drm.decrypt(merged_path)
                    merged_path.rename(decrypted_path)
                else:
                    # with other drm we must decrypt separately and then merge them
                    # for aes this is because each segment likely has 16-byte padding
                    for file in files:
                        drm.decrypt(file)
                    merge(to=merged_path, via=files, delete=True, include_map_data=True)

                events.emit(events.Types.TRACK_DECRYPTED, track=track, drm=drm, segment=decrypted_path)

                return decrypted_path

            def merge_discontinuity(include_this_segment: bool, include_map_data: bool = True):
                """
                Merge all segments of the discontinuity.

                All segment files for this discontinuity must already be downloaded and
                already decrypted (if it needs to be decrypted).

                Parameters:
                    include_this_segment: Whether to include the current segment in the
                        list of segments to merge and decrypt. This should be False if
                        decrypting on EXT-X-KEY changes, or True when decrypting on the
                        last segment.
                    include_map_data: Whether to prepend the init map data before the
                        segment files when merging.
                """
                last_segment_i = max(0, i - int(not include_this_segment))

                files = [
                    file
                    for file in sorted(segment_save_dir.iterdir())
                    if int(file.stem.replace("_decrypted", "").split("-")[-1]) <= last_segment_i
                ]
                if files:
                    to_dir = segment_save_dir.parent
                    to_path = to_dir / f"{str(discon_i).zfill(name_len)}{files[-1].suffix}"
                    merge(to=to_path, via=files, delete=True, include_map_data=include_map_data)

            if segment not in unwanted_segments:
                if isinstance(track, Subtitle):
                    segment_file_ext = get_extension(segment.uri)
                    segment_file_path = segment_save_dir / f"{str(i).zfill(name_len)}{segment_file_ext}"
                    segment_data = try_ensure_utf8(segment_file_path.read_bytes())
                    if track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML):
                        # swap the textual direction-mark entities for the real characters
                        segment_data = (
                            segment_data.decode("utf8")
                            .replace("&lrm;", html.unescape("&lrm;"))
                            .replace("&rlm;", html.unescape("&rlm;"))
                            .encode("utf8")
                        )
                    segment_file_path.write_bytes(segment_data)

                if segment.discontinuity and i != 0:
                    if encryption_data:
                        decrypt(include_this_segment=False)
                    merge_discontinuity(
                        include_this_segment=False, include_map_data=not encryption_data or not encryption_data[1]
                    )

                    discon_i += 1
                    range_offset = 0  # TODO: Should this be reset or not?
                    map_data = None
                    if encryption_data:
                        encryption_data = (encryption_data[0], encryption_data[1])

                if segment.init_section and (not map_data or segment.init_section != map_data[0]):
                    if segment.init_section.byterange:
                        init_byte_range = HLS.calculate_byte_range(segment.init_section.byterange, range_offset)
                        # keep the offset numeric and pointing just past this range
                        range_offset = int(init_byte_range.split("-")[1]) + 1
                        init_range_header = {"Range": f"bytes={init_byte_range}"}
                    else:
                        init_range_header = {}

                    # Handle both session types for init section request
                    res = session.get(
                        url=urljoin(segment.init_section.base_uri, segment.init_section.uri),
                        headers=init_range_header,
                    )

                    # Check response based on session type
                    if isinstance(res, requests.Response):
                        res.raise_for_status()
                        init_content = res.content
                    elif isinstance(res, httpx.Response):
                        if res.status_code >= 400:
                            raise requests.HTTPError(f"HTTP Error: {res.status_code}", response=res)
                        init_content = res.content
                    else:
                        raise TypeError(
                            f"Expected response to be requests.Response or httpx.Response, not {type(res)}"
                        )

                    map_data = (segment.init_section, init_content)

            segment_keys = getattr(segment, "keys", None)
            if segment_keys:
                key = HLS.get_supported_key(segment_keys)
                if encryption_data and encryption_data[0] != key and i != 0 and segment not in unwanted_segments:
                    decrypt(include_this_segment=False)

                if key is None:
                    encryption_data = None
                elif not encryption_data or encryption_data[0] != key:
                    drm = HLS.get_drm(key, session)
                    if isinstance(drm, (Widevine, PlayReady)):
                        try:
                            # same guard as the session DRM path above, so a missing
                            # callable fails with a clear message
                            if not license_widevine:
                                raise ValueError("license_widevine func must be supplied to use DRM")
                            if map_data:
                                track_kid = track.get_key_id(map_data[1])
                            else:
                                track_kid = None
                            progress(downloaded="LICENSING")
                            license_widevine(drm, track_kid=track_kid)
                            progress(downloaded="[yellow]LICENSED")
                        except Exception:  # noqa
                            DOWNLOAD_CANCELLED.set()  # skip pending track downloads
                            progress(downloaded="[red]FAILED")
                            raise
                    encryption_data = (key, drm)

            if DOWNLOAD_LICENCE_ONLY.is_set():
                continue

            if is_last_segment:
                # required as it won't end with EXT-X-DISCONTINUITY nor a new key
                if encryption_data:
                    decrypt(include_this_segment=True)
                merge_discontinuity(
                    include_this_segment=True, include_map_data=not encryption_data or not encryption_data[1]
                )

            progress(advance=1)

    if DOWNLOAD_LICENCE_ONLY.is_set():
        return

    if segment_save_dir.exists():
        segment_save_dir.rmdir()

    # finally merge all the discontinuity save files together to the final path
    segments_to_merge = [x for x in sorted(save_dir.iterdir()) if x.is_file()]
    if len(segments_to_merge) == 1:
        shutil.move(segments_to_merge[0], save_path)
    else:
        progress(downloaded="Merging")
        if isinstance(track, (Video, Audio)):
            HLS.merge_segments(segments=segments_to_merge, save_path=save_path)
        else:
            with open(save_path, "wb") as f:
                for discontinuity_file in segments_to_merge:
                    discontinuity_data = discontinuity_file.read_bytes()
                    f.write(discontinuity_data)
                    f.flush()
                    discontinuity_file.unlink()

    save_dir.rmdir()

    progress(downloaded="Downloaded")

    track.path = save_path
    events.emit(events.Types.TRACK_DOWNLOADED, track=track)
@staticmethod
def merge_segments(segments: list[Path], save_path: Path) -> int:
    """
    Concatenate segment files into one file using FFmpeg's concat demuxer.

    A temporary list file naming every segment is written next to the first
    segment and passed to FFmpeg, which stream-copies all of them into
    `save_path`. The segment files are deleted after a successful merge.

    Parameters:
        segments: Segment files to concatenate, in order. Must not be empty.
        save_path: Output file for the merged data.

    Returns the file size of the merged file.

    Raises EnvironmentError if the FFmpeg executable was not found, and
    subprocess.CalledProcessError if FFmpeg exits with a non-zero status.
    """
    if not binaries.FFMPEG:
        raise EnvironmentError("FFmpeg executable was not found but is required to merge HLS segments.")

    demuxer_file = segments[0].parent / "ffmpeg_concat_demuxer.txt"
    # Inside a single-quoted value a literal quote must be written as '\'' or
    # the list file is unparsable for paths that contain an apostrophe.
    demuxer_lines = ["file '{}'".format(str(segment).replace("'", "'\\''")) for segment in segments]
    demuxer_file.write_text("\n".join(demuxer_lines), encoding="utf8")

    try:
        subprocess.check_call(
            [
                binaries.FFMPEG,
                "-hide_banner",
                "-loglevel",
                "panic",
                "-f",
                "concat",
                "-safe",
                "0",
                "-i",
                demuxer_file,
                "-map",
                "0",
                "-c",
                "copy",
                save_path,
            ]
        )
    finally:
        # never leave the list file behind; a stray file in the working directory
        # would be mistaken for a segment by later directory scans
        demuxer_file.unlink()

    for segment in segments:
        segment.unlink()

    return save_path.stat().st_size
@staticmethod
def parse_session_data_keys(
    manifest: M3U8, session: Optional[Union[Session, httpx.Client]] = None
) -> list[m3u8.model.Key]:
    """
    Parse `com.apple.hls.keys` session data and return Key objects.

    The session data value is read inline, or requested from its URI when no
    inline value exists. It is treated as JSON, optionally base64 encoded,
    holding a list of objects with `method`, `uri`, `keyformat` and
    `keyformatversion(s)` entries. Entries that are not decodable are skipped.

    Parameters:
        manifest: Parsed playlist whose `session_data` is inspected.
        session: Optional session used to request session data URIs. A new
            requests Session is created if one is needed and none was given.

    Returns only keys using a method or key format recognised by this class.
    """
    keys: list[m3u8.model.Key] = []

    for data in getattr(manifest, "session_data", []) or []:
        if getattr(data, "data_id", None) != "com.apple.hls.keys":
            continue

        value = getattr(data, "value", None)
        if not value and data.uri:
            if not session:
                session = Session()
            res = session.get(urljoin(manifest.base_uri or "", data.uri))
            value = res.text

        if not value:
            continue

        # the value may be base64 wrapped JSON or plain JSON, try both
        try:
            decoded = base64.b64decode(value).decode()
        except (ValueError, TypeError):
            decoded = value

        try:
            items = json.loads(decoded)
        except (ValueError, TypeError):
            continue

        for item in items if isinstance(items, list) else []:
            if not isinstance(item, dict):
                continue

            versions = item.get("keyformatversion") or item.get("keyformatversions") or []
            if isinstance(versions, (list, tuple)):
                keyformatversions = ",".join(str(version) for version in versions)
            else:
                # a scalar must be kept whole, joining a str would split it per character
                keyformatversions = str(versions)

            key = m3u8.model.Key(
                method=item.get("method"),
                base_uri=manifest.base_uri or "",
                uri=item.get("uri"),
                keyformat=item.get("keyformat"),
                keyformatversions=keyformatversions,
            )

            # same key system checks as HLS.get_supported_key()
            keyformat = (key.keyformat or "").lower()
            if (
                key.method in {"AES-128", "ISO-23001-7"}
                or keyformat == WidevineCdm.urn
                or keyformat == PlayReadyCdm
                or "com.microsoft.playready" in keyformat
            ):
                keys.append(key)

    return keys
@staticmethod
def get_supported_key(keys: list[Union[m3u8.model.SessionKey, m3u8.model.Key]]) -> Optional[m3u8.Key]:
"""
Get a support Key System from a list of Key systems.
Note that the key systems are chosen in an opinionated order.
Returns None if one of the key systems is method=NONE, which means all segments
from hence forth should be treated as plain text until another key system is
encountered, unless it's also method=NONE.
Raises NotImplementedError if none of the key systems are supported.
"""
if any(key.method == "NONE" for key in keys):
return None
unsupported_systems = []
for key in keys:
if not key:
continue
# TODO: Add a way to specify which supported key system to use
# TODO: Add support for 'SAMPLE-AES', 'AES-CTR', 'AES-CBC', 'ClearKey'
elif key.method == "AES-128":
return key
elif key.method == "ISO-23001-7":
return key
elif key.keyformat and key.keyformat.lower() == WidevineCdm.urn:
return key
elif key.keyformat and (
key.keyformat.lower() == PlayReadyCdm or "com.microsoft.playready" in key.keyformat.lower()
):
return key
else:
unsupported_systems.append(key.method + (f" ({key.keyformat})" if key.keyformat else ""))
else:
raise NotImplementedError(f"None of the key systems are supported: {', '.join(unsupported_systems)}")
@staticmethod
def get_drm(
    key: Union[m3u8.model.SessionKey, m3u8.model.Key], session: Optional[Union[Session, httpx.Client]] = None
) -> DRM_T:
    """
    Build the DRM object described by a single HLS EXT-X-KEY entry.

    Parameters:
        key: m3u8 key system (EXT-X-KEY) object.
        session: Optional session handed to the AES-128 (ClearKey) loader.
            Useful to set headers, proxies, cookies, and so forth. A new
            requests Session is created when omitted.

    Returns a ClearKey, Widevine or PlayReady object depending on the key's
    method and key format.

    Raises TypeError if the session is of an unexpected type, and
    NotImplementedError if the key system is not supported.
    """
    if session is not None and not isinstance(session, (Session, httpx.Client)):
        raise TypeError(f"Expected session to be a {Session} or {httpx.Client}, not {type(session)}")
    if not session:
        session = Session()

    # TODO: Add support for 'SAMPLE-AES', 'AES-CTR', 'AES-CBC', 'ClearKey'
    if key.method == "AES-128":
        return ClearKey.from_m3u_key(key, session)

    if key.method == "ISO-23001-7":
        # the last comma separated part of the URI is used as the key ID
        key_id = key.uri.split(",")[-1]
        return Widevine(pssh=WV_PSSH.new(key_ids=[key_id], system_id=WV_PSSH.SystemId.Widevine))

    key_format = key.keyformat.lower() if key.keyformat else None

    if key_format and key_format == WidevineCdm.urn:
        # the last comma separated part of the URI is used as the PSSH data
        pssh_data = key.uri.split(",")[-1]
        return Widevine(
            pssh=WV_PSSH(pssh_data),
            **key._extra_params,  # noqa
        )

    if key_format and (key_format == PlayReadyCdm or "com.microsoft.playready" in key_format):
        return PlayReady(
            pssh=PR_PSSH(key.uri.split(",")[-1]),
            pssh_b64=key.uri.split(",")[-1],
        )

    raise NotImplementedError(f"The key system is not supported: {key}")
@staticmethod
def get_all_drm(
keys: list[Union[m3u8.model.SessionKey, m3u8.model.Key]], proxy: Optional[str] = None
) -> list[DRM_T]:
"""
Convert HLS EXT-X-KEY data to initialized DRM objects.
Parameters:
keys: m3u8 key system (EXT-X-KEY) objects.
proxy: Optional proxy string used for requesting AES-128 URIs.
Raises a NotImplementedError if none of the key systems are supported.
"""
unsupported_keys: list[m3u8.Key] = []
drm_objects: list[DRM_T] = []
if any(key.method == "NONE" for key in keys):
return []
for key in keys:
try:
drm = HLS.get_drm(key, proxy)
drm_objects.append(drm)
except NotImplementedError:
unsupported_keys.append(key)
if not drm_objects and unsupported_keys:
logging.debug(
"Ignoring unsupported key systems: %s",
", ".join([str(k.keyformat or k.method) for k in unsupported_keys]),
)
return []
return drm_objects
@staticmethod
def calculate_byte_range(m3u_range: str, fallback_offset: int = 0) -> str:
"""
Convert a HLS EXT-X-BYTERANGE value to a more traditional range value.
E.g., '1433@0' -> '0-1432', '357392@1433' -> '1433-358824'.
"""
parts = [int(x) for x in m3u_range.split("@")]
if len(parts) != 2:
parts.append(fallback_offset)
length, offset = parts
return f"{offset}-{offset + length - 1}"
__all__ = ("HLS",)

View File

@@ -0,0 +1,335 @@
from __future__ import annotations
import base64
import hashlib
import html
import shutil
import urllib.parse
from functools import partial
from pathlib import Path
from typing import Any, Callable, Optional, Union
import requests
from langcodes import Language, tag_is_valid
from lxml.etree import Element
from pyplayready.system.pssh import PSSH as PR_PSSH
from pywidevine.pssh import PSSH
from requests import Session
from unshackle.core.constants import DOWNLOAD_CANCELLED, DOWNLOAD_LICENCE_ONLY, AnyTrack
from unshackle.core.drm import DRM_T, PlayReady, Widevine
from unshackle.core.events import events
from unshackle.core.tracks import Audio, Subtitle, Track, Tracks, Video
from unshackle.core.utilities import try_ensure_utf8
from unshackle.core.utils.xml import load_xml
class ISM:
    """Parse a Smooth Streaming (`SmoothStreamingMedia`) manifest into tracks and download them."""

    def __init__(self, manifest: Element, url: str) -> None:
        """
        Parameters:
            manifest: Root element of the manifest document.
            url: URL of the manifest, used to resolve relative fragment URLs.

        Raises TypeError if the root element is not `SmoothStreamingMedia`,
        and requests.URLRequired if no URL was provided.
        """
        if manifest.tag != "SmoothStreamingMedia":
            raise TypeError(f"Expected 'SmoothStreamingMedia' document, got '{manifest.tag}'")
        if not url:
            raise requests.URLRequired("ISM manifest URL must be provided for relative paths")

        self.manifest = manifest
        self.url = url

    @classmethod
    def from_url(cls, url: str, session: Optional[Session] = None, **kwargs: Any) -> "ISM":
        """
        Request the manifest from a URL and parse it.

        Parameters:
            url: URL of the manifest.
            session: Optional session to request with, a new one is used if omitted.
            kwargs: Extra arguments passed to `session.get()`.
        """
        if not url:
            raise requests.URLRequired("ISM manifest URL must be provided")
        if not session:
            session = Session()

        res = session.get(url, **kwargs)
        if res.url != url:
            # keep the URL reported by the response as the base for relative paths
            url = res.url
        res.raise_for_status()

        return cls(load_xml(res.content), url)

    @classmethod
    def from_text(cls, text: str, url: str) -> "ISM":
        """
        Parse a manifest from its text.

        Parameters:
            text: The manifest document.
            url: URL of the manifest, used to resolve relative fragment URLs.
        """
        if not text:
            raise ValueError("ISM manifest text must be provided")
        if not url:
            raise requests.URLRequired("ISM manifest URL must be provided for relative paths")

        return cls(load_xml(text), url)

    @staticmethod
    def _get_drm(headers: list[Element]) -> list[DRM_T]:
        """
        Convert protection header elements to DRM objects.

        Headers are matched on their SystemID attribute. Headers without data,
        with an unrecognised SystemID, or with data that fails to parse are skipped.
        """
        drm: list[DRM_T] = []

        for header in headers:
            system_id = (header.get("SystemID") or header.get("SystemId") or "").lower()
            data = "".join(header.itertext()).strip()
            if not data:
                continue

            if system_id == "edef8ba9-79d6-4ace-a3c8-27dcd51d21ed":
                try:
                    pssh = PSSH(base64.b64decode(data))
                except Exception:
                    continue
                kid = next(iter(pssh.key_ids), None)
                drm.append(Widevine(pssh=pssh, kid=kid))
            elif system_id == "9a04f079-9840-4286-ab92-e65be0885f95":
                try:
                    pr_pssh = PR_PSSH(data)
                except Exception:
                    continue
                drm.append(PlayReady(pssh=pr_pssh, pssh_b64=data))

        return drm

    @staticmethod
    def _fragment_url(base_url: str, stream_index: Element, quality_level: Element, start_time: Any) -> str:
        """Fill the stream's Url template for one fragment and resolve it against the base URL."""
        return urllib.parse.urljoin(
            base_url,
            stream_index.get("Url").format_map(
                {
                    "bitrate": quality_level.get("Bitrate"),
                    "start time": str(start_time),
                }
            ),
        )

    def to_tracks(self, language: Optional[Union[str, Language]] = None) -> Tracks:
        """
        Convert the manifest to Video, Audio and Subtitle tracks.

        One track is created for each QualityLevel of each StreamIndex. Streams
        of type "video" and "audio" become Video and Audio tracks, every other
        type becomes a Subtitle track. All tracks share the DRM systems found
        in the manifest's ProtectionHeader elements.

        Parameters:
            language: Language to use for tracks whose stream states no valid
                language. It is also compared to each track's own language to
                flag the track as being in the original language.

        Raises ValueError if a StreamIndex has no Type.
        """
        tracks = Tracks()
        base_url = self.url
        duration = int(self.manifest.get("Duration") or 0)
        drm = self._get_drm(self.manifest.xpath(".//ProtectionHeader"))

        for stream_index in self.manifest.findall("StreamIndex"):
            content_type = stream_index.get("Type")
            if not content_type:
                raise ValueError("No content type value could be found")

            for ql in stream_index.findall("QualityLevel"):
                codec = ql.get("FourCC")
                if codec == "TTML":
                    codec = "STPP"

                track_lang = None
                lang = (stream_index.get("Language") or "").strip()
                if lang and tag_is_valid(lang) and not lang.startswith("und"):
                    track_lang = Language.get(lang)

                track_urls: list[str] = []
                fragment_time = 0
                fragments = stream_index.findall("c")

                # Some manifests omit the first fragment in the <c> list but
                # still expect a request for start time 0 which contains the
                # initialization segment. If the first declared fragment is not
                # at time 0, prepend the missing initialization URL.
                if fragments:
                    first_time = int(fragments[0].get("t") or 0)
                    if first_time != 0:
                        track_urls.append(self._fragment_url(base_url, stream_index, ql, "0"))

                for idx, frag in enumerate(fragments):
                    fragment_time = int(frag.get("t", fragment_time))
                    repeat = int(frag.get("r", 1))
                    duration_frag = int(frag.get("d") or 0)
                    if not duration_frag:
                        # no explicit duration: derive it from the next fragment's start,
                        # or from the total duration if there is no usable next start
                        try:
                            next_time = int(fragments[idx + 1].get("t"))
                        except (IndexError, AttributeError, TypeError):
                            next_time = duration
                        # integer division keeps start times whole numbers, a float
                        # would be written into the URL as e.g. "1234.0"
                        duration_frag = (next_time - fragment_time) // repeat

                    for _ in range(repeat):
                        track_urls.append(self._fragment_url(base_url, stream_index, ql, fragment_time))
                        fragment_time += duration_frag

                track_id = hashlib.md5(
                    f"{codec}-{track_lang}-{ql.get('Bitrate') or 0}-{ql.get('Index') or 0}".encode()
                ).hexdigest()

                data = {
                    "ism": {
                        "manifest": self.manifest,
                        "stream_index": stream_index,
                        "quality_level": ql,
                        "segments": track_urls,
                    }
                }

                if content_type == "video":
                    try:
                        vcodec = Video.Codec.from_mime(codec) if codec else None
                    except ValueError:
                        vcodec = None
                    tracks.add(
                        Video(
                            id_=track_id,
                            url=self.url,
                            codec=vcodec,
                            language=track_lang or language,
                            is_original_lang=bool(language and track_lang and str(track_lang) == str(language)),
                            bitrate=ql.get("Bitrate"),
                            width=int(ql.get("MaxWidth") or 0) or int(stream_index.get("MaxWidth") or 0),
                            height=int(ql.get("MaxHeight") or 0) or int(stream_index.get("MaxHeight") or 0),
                            descriptor=Video.Descriptor.ISM,
                            drm=drm,
                            data=data,
                        )
                    )
                elif content_type == "audio":
                    try:
                        acodec = Audio.Codec.from_mime(codec) if codec else None
                    except ValueError:
                        acodec = None
                    tracks.add(
                        Audio(
                            id_=track_id,
                            url=self.url,
                            codec=acodec,
                            language=track_lang or language,
                            is_original_lang=bool(language and track_lang and str(track_lang) == str(language)),
                            bitrate=ql.get("Bitrate"),
                            channels=ql.get("Channels"),
                            descriptor=Track.Descriptor.ISM,
                            drm=drm,
                            data=data,
                        )
                    )
                else:
                    try:
                        scodec = Subtitle.Codec.from_mime(codec) if codec else None
                    except ValueError:
                        scodec = None
                    tracks.add(
                        Subtitle(
                            id_=track_id,
                            url=self.url,
                            codec=scodec,
                            language=track_lang or language,
                            is_original_lang=bool(language and track_lang and str(track_lang) == str(language)),
                            descriptor=Track.Descriptor.ISM,
                            drm=drm,
                            data=data,
                        )
                    )

        return tracks

    @staticmethod
    def download_track(
        track: AnyTrack,
        save_path: Path,
        save_dir: Path,
        progress: partial,
        session: Optional[Session] = None,
        proxy: Optional[str] = None,
        max_workers: Optional[int] = None,
        license_widevine: Optional[Callable] = None,
        *,
        cdm: Optional[object] = None,
    ) -> None:
        """
        Download all fragments of a track, merge them, and decrypt the result.

        The fragment URLs are read from `track.data["ism"]["segments"]` and
        downloaded into `save_dir`. They are then concatenated into `save_path`
        and, if the track has DRM, the merged file is decrypted in place.

        Parameters:
            track: The track to download.
            save_path: Final output file for the merged track.
            save_dir: Working directory; it is removed once the merge is done.
            progress: Callable used to report progress updates.
            session: Optional requests Session, a new one is used if omitted.
            proxy: Optional proxy URI applied to the session and the downloader.
            max_workers: Passed through to the downloader.
            license_widevine: Callable used to license Widevine/PlayReady DRM.
            cdm: Passed to `track.get_drm_for_cdm()` to choose the DRM system.

        Raises TypeError if the session is not a requests Session.
        """
        if not session:
            session = Session()
        elif not isinstance(session, Session):
            raise TypeError(f"Expected session to be a {Session}, not {session!r}")

        if proxy:
            session.proxies.update({"all": proxy})

        segments: list[str] = track.data["ism"]["segments"]

        session_drm = None
        if track.drm:
            # Mirror HLS.download_track: pick the DRM matching the provided CDM
            # (or the first available) and license it if supported.
            session_drm = track.get_drm_for_cdm(cdm)
            if isinstance(session_drm, (Widevine, PlayReady)):
                try:
                    if not license_widevine:
                        raise ValueError("license_widevine func must be supplied to use DRM")
                    progress(downloaded="LICENSING")
                    license_widevine(session_drm)
                    progress(downloaded="[yellow]LICENSED")
                except Exception:
                    DOWNLOAD_CANCELLED.set()  # skip pending track downloads
                    progress(downloaded="[red]FAILED")
                    raise

        if DOWNLOAD_LICENCE_ONLY.is_set():
            progress(downloaded="[yellow]SKIPPED")
            return

        progress(total=len(segments))

        downloader = track.downloader
        skip_merge = False
        downloader_args = dict(
            urls=[{"url": url} for url in segments],
            output_dir=save_dir,
            filename="{i:0%d}.mp4" % len(str(len(segments))),
            headers=session.headers,
            cookies=session.cookies,
            proxy=proxy,
            max_workers=max_workers,
        )

        if downloader.__name__ == "n_m3u8dl_re":
            # this downloader is handed the track and content keys, so the merge
            # and decrypt stages below are skipped for it
            skip_merge = True
            downloader_args.update(
                {
                    "filename": track.id,
                    "track": track,
                    "content_keys": session_drm.content_keys if session_drm else None,
                }
            )

        for status_update in downloader(**downloader_args):
            file_downloaded = status_update.get("file_downloaded")
            if file_downloaded:
                events.emit(events.Types.SEGMENT_DOWNLOADED, track=track, segment=file_downloaded)
            else:
                downloaded = status_update.get("downloaded")
                if downloaded and downloaded.endswith("/s"):
                    status_update["downloaded"] = f"ISM {downloaded}"
                progress(**status_update)

        for control_file in save_dir.glob("*.aria2__temp"):
            control_file.unlink()

        segments_to_merge = [x for x in sorted(save_dir.iterdir()) if x.is_file()]

        if skip_merge:
            shutil.move(segments_to_merge[0], save_path)
        else:
            with open(save_path, "wb") as f:
                for segment_file in segments_to_merge:
                    segment_data = segment_file.read_bytes()
                    if (
                        not session_drm
                        and isinstance(track, Subtitle)
                        and track.codec not in (Subtitle.Codec.fVTT, Subtitle.Codec.fTTML)
                    ):
                        segment_data = try_ensure_utf8(segment_data)
                        # swap the textual direction-mark entities for the real characters
                        segment_data = (
                            segment_data.decode("utf8")
                            .replace("&lrm;", html.unescape("&lrm;"))
                            .replace("&rlm;", html.unescape("&rlm;"))
                            .encode("utf8")
                        )
                    f.write(segment_data)
                    f.flush()
                    segment_file.unlink()
                    progress(advance=1)

        track.path = save_path
        events.emit(events.Types.TRACK_DOWNLOADED, track=track)

        if not skip_merge and session_drm:
            progress(downloaded="Decrypting", completed=0, total=100)
            session_drm.decrypt(save_path)
            track.drm = None
            events.emit(events.Types.TRACK_DECRYPTED, track=track, drm=session_drm, segment=None)
            progress(downloaded="Decrypting", advance=100)

        save_dir.rmdir()
        progress(downloaded="Downloaded")
__all__ = ("ISM",)

View File

@@ -0,0 +1,76 @@
"""Utility functions for parsing M3U8 playlists."""
from __future__ import annotations
from typing import Optional, Union
import httpx
import m3u8
from pyplayready.cdm import Cdm as PlayReadyCdm
from pyplayready.system.pssh import PSSH as PR_PSSH
from pywidevine.cdm import Cdm as WidevineCdm
from pywidevine.pssh import PSSH as WV_PSSH
from requests import Session
from unshackle.core.drm import PlayReady, Widevine
from unshackle.core.manifests.hls import HLS
from unshackle.core.tracks import Tracks
def parse(
    master: m3u8.M3U8,
    language: str,
    *,
    session: Optional[Union[Session, httpx.Client]] = None,
) -> Tracks:
    """
    Parse a variant playlist to ``Tracks`` with DRM information.

    When the tracks carry no Widevine and/or PlayReady DRM, the missing
    systems are looked up in two stages: first in the master playlist's
    session keys and ``com.apple.hls.keys`` session data, then in the keys
    of the first video track's own playlist. The first match found for a
    system is attached to every video and audio track.
    """
    tracks = HLS(master, session=session).to_tracks(language)

    def is_missing(drm_type) -> bool:
        """Whether not a single track carries a DRM object of the given type."""
        for t in tracks:
            for d in t.drm or []:
                if isinstance(d, drm_type):
                    return False
        return True

    def attach(drm_type, replacement) -> None:
        """Swap any DRM of the given type on all video and audio tracks for the replacement."""
        for t in tracks.videos + tracks.audio:
            kept = [d for d in (t.drm or []) if not isinstance(d, drm_type)]
            t.drm = kept + [replacement]

    need_wv = is_missing(Widevine)
    need_pr = is_missing(PlayReady)

    # stage 1: keys announced by the master playlist itself
    if (need_wv or need_pr) and tracks.videos:
        if not session:
            session = Session()

        session_keys = list(master.session_keys or [])
        session_keys.extend(HLS.parse_session_data_keys(master, session))

        for drm_obj in HLS.get_all_drm(session_keys):
            if need_wv and isinstance(drm_obj, Widevine):
                attach(Widevine, drm_obj)
                need_wv = False
            elif need_pr and isinstance(drm_obj, PlayReady):
                attach(PlayReady, drm_obj)
                need_pr = False

            if not (need_wv or need_pr):
                break

    # stage 2: keys of the first video track's playlist
    if (need_wv or need_pr) and tracks.videos:
        first_video = tracks.videos[0]
        playlist = m3u8.load(first_video.url)

        for key in playlist.keys or []:
            if not key or not key.keyformat:
                continue

            fmt = key.keyformat.lower()
            if need_wv and fmt == WidevineCdm.urn:
                pssh_b64 = key.uri.split(",")[-1]
                drm = Widevine(pssh=WV_PSSH(pssh_b64))
                attach(Widevine, drm)
                need_wv = False
            elif need_pr and (fmt == PlayReadyCdm or "com.microsoft.playready" in fmt):
                pssh_b64 = key.uri.split(",")[-1]
                drm = PlayReady(pssh=PR_PSSH(pssh_b64), pssh_b64=pssh_b64)
                attach(PlayReady, drm)
                need_pr = False

            if not (need_wv or need_pr):
                break

    return tracks
__all__ = ["parse"]