aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbashonly <88596187+bashonly@users.noreply.github.com>2025-02-18 20:23:42 -0600
committerGitHub <noreply@github.com>2025-02-19 02:23:42 +0000
commitbe69468752ff598cacee57bb80533deab2367a5d (patch)
tree337d2d5b996dd1d6ab898d7be22c3b7d0fdecb41
parent5271ef48c6f61c145e03e18e960995d2e651d205 (diff)
[fd/hls] Support `--write-pages` for m3u8 media playlists (#12333)
Authored by: bashonly
-rw-r--r--yt_dlp/downloader/hls.py11
-rw-r--r--yt_dlp/extractor/common.py28
-rw-r--r--yt_dlp/utils/_utils.py18
3 files changed, 35 insertions, 22 deletions
diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py
index 7a47f8f83..1f36a07f5 100644
--- a/yt_dlp/downloader/hls.py
+++ b/yt_dlp/downloader/hls.py
@@ -16,6 +16,7 @@ from ..utils import (
update_url_query,
urljoin,
)
+from ..utils._utils import _request_dump_filename
class HlsFD(FragmentFD):
@@ -80,7 +81,15 @@ class HlsFD(FragmentFD):
self.to_screen(f'[{self.FD_NAME}] Downloading m3u8 manifest')
urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
man_url = urlh.url
- s = urlh.read().decode('utf-8', 'ignore')
+ s_bytes = urlh.read()
+ if self.params.get('write_pages'):
+ dump_filename = _request_dump_filename(
+ man_url, info_dict['id'], None,
+ trim_length=self.params.get('trim_file_name'))
+ self.to_screen(f'[{self.FD_NAME}] Saving request to {dump_filename}')
+ with open(dump_filename, 'wb') as outf:
+ outf.write(s_bytes)
+ s = s_bytes.decode('utf-8', 'ignore')
can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None
if can_download:
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 3e7734ce1..8d199b353 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -2,7 +2,6 @@ import base64
import collections
import functools
import getpass
-import hashlib
import http.client
import http.cookiejar
import http.cookies
@@ -78,7 +77,6 @@ from ..utils import (
parse_iso8601,
parse_m3u8_attributes,
parse_resolution,
- sanitize_filename,
sanitize_url,
smuggle_url,
str_or_none,
@@ -100,6 +98,7 @@ from ..utils import (
xpath_text,
xpath_with_ns,
)
+from ..utils._utils import _request_dump_filename
class InfoExtractor:
@@ -1022,23 +1021,6 @@ class InfoExtractor:
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id, data=None):
- if data is not None:
- data = hashlib.md5(data).hexdigest()
- basen = join_nonempty(video_id, data, url, delim='_')
- trim_length = self.get_param('trim_file_name') or 240
- if len(basen) > trim_length:
- h = '___' + hashlib.md5(basen.encode()).hexdigest()
- basen = basen[:trim_length - len(h)] + h
- filename = sanitize_filename(f'{basen}.dump', restricted=True)
- # Working around MAX_PATH limitation on Windows (see
- # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
- if os.name == 'nt':
- absfilepath = os.path.abspath(filename)
- if len(absfilepath) > 259:
- filename = fR'\\?\{absfilepath}'
- return filename
-
def __decode_webpage(self, webpage_bytes, encoding, headers):
if not encoding:
encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
@@ -1067,7 +1049,9 @@ class InfoExtractor:
if self.get_param('write_pages'):
if isinstance(url_or_request, Request):
data = self._create_request(url_or_request, data).data
- filename = self._request_dump_filename(urlh.url, video_id, data)
+ filename = _request_dump_filename(
+ urlh.url, video_id, data,
+ trim_length=self.get_param('trim_file_name'))
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1128,7 +1112,9 @@ class InfoExtractor:
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
+ filename = _request_dump_filename(
+ url_or_request.url, video_id, url_or_request.data,
+ trim_length=self.get_param('trim_file_name'))
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index a71a381e5..c6a90a0dd 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -5631,6 +5631,24 @@ def filesize_from_tbr(tbr, duration):
return int(duration * tbr * (1000 / 8))
+def _request_dump_filename(url, video_id, data=None, trim_length=None):
+ if data is not None:
+ data = hashlib.md5(data).hexdigest()
+ basen = join_nonempty(video_id, data, url, delim='_')
+ trim_length = trim_length or 240
+ if len(basen) > trim_length:
+ h = '___' + hashlib.md5(basen.encode()).hexdigest()
+ basen = basen[:trim_length - len(h)] + h
+ filename = sanitize_filename(f'{basen}.dump', restricted=True)
+ # Working around MAX_PATH limitation on Windows (see
+ # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+ if os.name == 'nt':
+ absfilepath = os.path.abspath(filename)
+ if len(absfilepath) > 259:
+ filename = fR'\\?\{absfilepath}'
+ return filename
+
+
# XXX: Temporary
class _YDLLogger:
def __init__(self, ydl=None):