aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/bokecc.py2
-rw-r--r--youtube_dl/extractor/cloudy.py2
-rw-r--r--youtube_dl/extractor/common.py156
-rw-r--r--youtube_dl/extractor/extractors.py18
-rw-r--r--youtube_dl/extractor/itv.py17
-rw-r--r--youtube_dl/extractor/mixcloud.py30
-rw-r--r--youtube_dl/extractor/orf.py1103
-rw-r--r--youtube_dl/extractor/palcomp3.py9
-rw-r--r--youtube_dl/extractor/senateisvp.py2
-rw-r--r--youtube_dl/extractor/vidlii.py59
-rw-r--r--youtube_dl/extractor/yandexmusic.py23
-rw-r--r--youtube_dl/extractor/youtube.py803
12 files changed, 1464 insertions, 760 deletions
diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py
index 6017e8344..4b8bef391 100644
--- a/youtube_dl/extractor/bokecc.py
+++ b/youtube_dl/extractor/bokecc.py
@@ -32,7 +32,7 @@ class BokeCCBaseIE(InfoExtractor):
class BokeCCIE(BokeCCBaseIE):
- _IE_DESC = 'CC视频'
+ IE_DESC = 'CC视频'
_VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
_TESTS = [{
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index 85ca20ecc..d39a9a5c2 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -9,7 +9,7 @@ from ..utils import (
class CloudyIE(InfoExtractor):
- _IE_DESC = 'cloudy.ec'
+ IE_DESC = 'cloudy.ec'
_VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'https://www.cloudy.ec/v/af511e2527aac',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 7fae9e57b..a64fcfccc 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -422,6 +422,8 @@ class InfoExtractor(object):
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None
_WORKING = True
+ # supply this in public subclasses: used in supported sites list, etc
+ # IE_DESC = 'short description of IE'
def __init__(self, downloader=None):
"""Constructor. Receives an optional downloader."""
@@ -503,7 +505,7 @@ class InfoExtractor(object):
if not self._x_forwarded_for_ip:
# Geo bypass mechanism is explicitly disabled by user
- if not self._downloader.params.get('geo_bypass', True):
+ if not self.get_param('geo_bypass', True):
return
if not geo_bypass_context:
@@ -525,7 +527,7 @@ class InfoExtractor(object):
# Explicit IP block specified by user, use it right away
# regardless of whether extractor is geo bypassable or not
- ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+ ip_block = self.get_param('geo_bypass_ip_block', None)
# Otherwise use random IP block from geo bypass context but only
# if extractor is known as geo bypassable
@@ -536,8 +538,8 @@ class InfoExtractor(object):
if ip_block:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
- if self._downloader.params.get('verbose', False):
- self._downloader.to_screen(
+ if self.get_param('verbose', False):
+ self.to_screen(
'[debug] Using fake IP %s as X-Forwarded-For.'
% self._x_forwarded_for_ip)
return
@@ -546,7 +548,7 @@ class InfoExtractor(object):
# Explicit country code specified by user, use it right away
# regardless of whether extractor is geo bypassable or not
- country = self._downloader.params.get('geo_bypass_country', None)
+ country = self.get_param('geo_bypass_country', None)
# Otherwise use random country code from geo bypass context but
# only if extractor is known as geo bypassable
@@ -557,8 +559,8 @@ class InfoExtractor(object):
if country:
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
- if self._downloader.params.get('verbose', False):
- self._downloader.to_screen(
+ if self.get_param('verbose', False):
+ self.to_screen(
'[debug] Using fake IP %s (%s) as X-Forwarded-For.'
% (self._x_forwarded_for_ip, country.upper()))
@@ -584,9 +586,9 @@ class InfoExtractor(object):
raise ExtractorError('An extractor error has occurred.', cause=e)
def __maybe_fake_ip_and_retry(self, countries):
- if (not self._downloader.params.get('geo_bypass_country', None)
+ if (not self.get_param('geo_bypass_country', None)
and self._GEO_BYPASS
- and self._downloader.params.get('geo_bypass', True)
+ and self.get_param('geo_bypass', True)
and not self._x_forwarded_for_ip
and countries):
country_code = random.choice(countries)
@@ -696,7 +698,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
else:
- self._downloader.report_warning(errmsg)
+ self.report_warning(errmsg)
return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
@@ -768,11 +770,11 @@ class InfoExtractor(object):
webpage_bytes = prefix + webpage_bytes
if not encoding:
encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
- if self._downloader.params.get('dump_intermediate_pages', False):
+ if self.get_param('dump_intermediate_pages', False):
self.to_screen('Dumping request to ' + urlh.geturl())
dump = base64.b64encode(webpage_bytes).decode('ascii')
- self._downloader.to_screen(dump)
- if self._downloader.params.get('write_pages', False):
+ self.to_screen(dump)
+ if self.get_param('write_pages', False):
basen = '%s_%s' % (video_id, urlh.geturl())
if len(basen) > 240:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -974,19 +976,9 @@ class InfoExtractor(object):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen(self.__ie_msg(msg))
- def write_debug(self, msg, only_once=False, _cache=[]):
+ def write_debug(self, msg, only_once=False):
'''Log debug message or Print message to stderr'''
- if not self.get_param('verbose', False):
- return
- message = '[debug] ' + self.__ie_msg(msg)
- logger = self.get_param('logger')
- if logger:
- logger.debug(message)
- else:
- if only_once and hash(message) in _cache:
- return
- self._downloader.to_stderr(message)
- _cache.append(hash(message))
+ self._downloader.write_debug(self.__ie_msg(msg), only_once=only_once)
# name, default=None, *args, **kwargs
def get_param(self, name, *args, **kwargs):
@@ -1082,7 +1074,7 @@ class InfoExtractor(object):
if mobj:
break
- if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
+ if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
_name = '\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -1100,7 +1092,7 @@ class InfoExtractor(object):
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
+ self.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
def _search_json(self, start_pattern, string, name, video_id, **kwargs):
@@ -1169,10 +1161,10 @@ class InfoExtractor(object):
def _get_netrc_login_info(self, netrc_machine=None):
username = None
password = None
- netrc_machine = netrc_machine or self._NETRC_MACHINE
- if self._downloader.params.get('usenetrc', False):
+ if self.get_param('usenetrc', False):
try:
+ netrc_machine = netrc_machine or self._NETRC_MACHINE
info = netrc.netrc().authenticators(netrc_machine)
if info is not None:
username = info[0]
@@ -1180,8 +1172,8 @@ class InfoExtractor(object):
else:
raise netrc.NetrcParseError(
'No authenticators for %s' % netrc_machine)
- except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning(
+ except (AttributeError, IOError, netrc.NetrcParseError) as err:
+ self.report_warning(
'parsing .netrc: %s' % error_to_compat_str(err))
return username, password
@@ -1218,10 +1210,10 @@ class InfoExtractor(object):
"""
if self._downloader is None:
return None
- downloader_params = self._downloader.params
- if downloader_params.get('twofactor') is not None:
- return downloader_params['twofactor']
+ twofactor = self.get_param('twofactor')
+ if twofactor is not None:
+ return twofactor
return compat_getpass('Type %s and press [Return]: ' % note)
@@ -1356,7 +1348,7 @@ class InfoExtractor(object):
elif fatal:
raise RegexNotFoundError('Unable to extract JSON-LD')
else:
- self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+ self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
return {}
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
@@ -1490,14 +1482,18 @@ class InfoExtractor(object):
return dict((k, v) for k, v in info.items() if v is not None)
def _search_nextjs_data(self, webpage, video_id, **kw):
- nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
- kw.pop('transform_source', None)
- next_data = self._search_regex(
- r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
- webpage, 'next.js data', group='nd', **kw)
- if not next_data:
- return {}
- return self._parse_json(next_data, video_id, **nkw)
+ # ..., *, transform_source=None, fatal=True, default=NO_DEFAULT
+
+ # TODO: remove this backward compat
+ default = kw.get('default', NO_DEFAULT)
+ if default == '{}':
+ kw['default'] = {}
+ kw = compat_kwargs(kw)
+
+ return self._search_json(
+ r'''<script\s[^>]*?\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>''',
+ webpage, 'next.js data', video_id, end_pattern='</script>',
+ **kw)
def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
@@ -1583,7 +1579,7 @@ class InfoExtractor(object):
if f.get('vcodec') == 'none': # audio only
preference -= 50
- if self._downloader.params.get('prefer_free_formats'):
+ if self.get_param('prefer_free_formats'):
ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
else:
ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
@@ -1595,7 +1591,7 @@ class InfoExtractor(object):
else:
if f.get('acodec') == 'none': # video only
preference -= 40
- if self._downloader.params.get('prefer_free_formats'):
+ if self.get_param('prefer_free_formats'):
ORDER = ['flv', 'mp4', 'webm']
else:
ORDER = ['webm', 'flv', 'mp4']
@@ -1661,7 +1657,7 @@ class InfoExtractor(object):
""" Either "http:" or "https:", depending on the user's preferences """
return (
'http:'
- if self._downloader.params.get('prefer_insecure', False)
+ if self.get_param('prefer_insecure', False)
else 'https:')
def _proto_relative_url(self, url, scheme=None):
@@ -3029,7 +3025,6 @@ class InfoExtractor(object):
transform_source=transform_source, default=None)
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
-
# allow passing `transform_source` through to _find_jwplayer_data()
transform_source = kwargs.pop('transform_source', None)
kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {}
@@ -3167,7 +3162,7 @@ class InfoExtractor(object):
# See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
# of jwplayer.flash.swf
rtmp_url_parts = re.split(
- r'((?:mp4|mp3|flv):)', source_url, 1)
+ r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
if len(rtmp_url_parts) == 3:
rtmp_url, prefix, play_path = rtmp_url_parts
a_format.update({
@@ -3194,7 +3189,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
def _float(self, v, name, fatal=False, **kwargs):
@@ -3204,7 +3199,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
def _set_cookie(self, domain, name, value, expire_time=None, port=None,
@@ -3213,12 +3208,12 @@ class InfoExtractor(object):
0, name, value, port, port is not None, domain, True,
domain.startswith('.'), path, True, secure, expire_time,
discard, None, None, rest)
- self._downloader.cookiejar.set_cookie(cookie)
+ self.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
""" Return a compat_cookies_SimpleCookie with the cookies for the url """
req = sanitized_Request(url)
- self._downloader.cookiejar.add_cookie_header(req)
+ self.cookiejar.add_cookie_header(req)
return compat_cookies_SimpleCookie(req.get_header('Cookie'))
def _apply_first_set_cookie_header(self, url_handle, cookie):
@@ -3278,8 +3273,8 @@ class InfoExtractor(object):
return not any_restricted
def extract_subtitles(self, *args, **kwargs):
- if (self._downloader.params.get('writesubtitles', False)
- or self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writesubtitles', False)
+ or self.get_param('listsubtitles')):
return self._get_subtitles(*args, **kwargs)
return {}
@@ -3296,16 +3291,24 @@ class InfoExtractor(object):
return ret
@classmethod
- def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
- """ Merge two subtitle dictionaries, language by language. """
- ret = dict(subtitle_dict1)
- for lang in subtitle_dict2:
- ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
- return ret
+ def _merge_subtitles(cls, subtitle_dict1, *subtitle_dicts, **kwargs):
+ """ Merge subtitle dictionaries, language by language. """
+
+ # ..., * , target=None
+ target = kwargs.get('target')
+ if target is None:
+ target = dict(subtitle_dict1)
+ else:
+ subtitle_dicts = (subtitle_dict1,) + subtitle_dicts
+
+ for subtitle_dict in subtitle_dicts:
+ for lang in subtitle_dict:
+ target[lang] = cls._merge_subtitle_items(target.get(lang, []), subtitle_dict[lang])
+ return target
def extract_automatic_captions(self, *args, **kwargs):
- if (self._downloader.params.get('writeautomaticsub', False)
- or self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writeautomaticsub', False)
+ or self.get_param('listsubtitles')):
return self._get_automatic_captions(*args, **kwargs)
return {}
@@ -3313,9 +3316,9 @@ class InfoExtractor(object):
raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
- if (self._downloader.params.get('mark_watched', False)
+ if (self.get_param('mark_watched', False)
and (self._get_login_info()[0] is not None
- or self._downloader.params.get('cookiefile') is not None)):
+ or self.get_param('cookiefile') is not None)):
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):
@@ -3323,7 +3326,7 @@ class InfoExtractor(object):
def geo_verification_headers(self):
headers = {}
- geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+ geo_verification_proxy = self.get_param('geo_verification_proxy')
if geo_verification_proxy:
headers['Ytdl-request-proxy'] = geo_verification_proxy
return headers
@@ -3334,6 +3337,29 @@ class InfoExtractor(object):
def _generic_title(self, url):
return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
+ def _yes_playlist(self, playlist_id, video_id, *args, **kwargs):
+ # smuggled_data=None, *, playlist_label='playlist', video_label='video'
+ smuggled_data = args[0] if len(args) == 1 else kwargs.get('smuggled_data')
+ playlist_label = kwargs.get('playlist_label', 'playlist')
+ video_label = kwargs.get('video_label', 'video')
+
+ if not playlist_id or not video_id:
+ return not video_id
+
+ no_playlist = (smuggled_data or {}).get('force_noplaylist')
+ if no_playlist is not None:
+ return not no_playlist
+
+ video_id = '' if video_id is True else ' ' + video_id
+ noplaylist = self.get_param('noplaylist')
+ self.to_screen(
+ 'Downloading just the {0}{1} because of --no-playlist'.format(video_label, video_id)
+ if noplaylist else
+ 'Downloading {0}{1} - add --no-playlist to download just the {2}{3}'.format(
+ playlist_label, '' if playlist_id is True else ' ' + playlist_id,
+ video_label, video_id))
+ return not noplaylist
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 03d035a27..3da5f8020 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -898,21 +898,13 @@ from .ooyala import (
)
from .ora import OraTVIE
from .orf import (
- ORFTVthekIE,
- ORFFM4IE,
+ ORFONIE,
+ ORFONLiveIE,
ORFFM4StoryIE,
- ORFOE1IE,
- ORFOE3IE,
- ORFNOEIE,
- ORFWIEIE,
- ORFBGLIE,
- ORFOOEIE,
- ORFSTMIE,
- ORFKTNIE,
- ORFSBGIE,
- ORFTIRIE,
- ORFVBGIE,
ORFIPTVIE,
+ ORFPodcastIE,
+ ORFRadioIE,
+ ORFRadioCollectionIE,
)
from .outsidetv import OutsideTVIE
from .packtpub import (
diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py
index c64af3be6..2510ad887 100644
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -35,15 +35,6 @@ from ..utils import (
class ITVBaseIE(InfoExtractor):
- def _search_nextjs_data(self, webpage, video_id, **kw):
- transform_source = kw.pop('transform_source', None)
- fatal = kw.pop('fatal', True)
- return self._parse_json(
- self._search_regex(
- r'''<script\b[^>]+\bid=('|")__NEXT_DATA__\1[^>]*>(?P<js>[^<]+)</script>''',
- webpage, 'next.js data', group='js', fatal=fatal, **kw),
- video_id, transform_source=transform_source, fatal=fatal)
-
def __handle_request_webpage_error(self, err, video_id=None, errnote=None, fatal=True):
if errnote is False:
return False
@@ -109,7 +100,9 @@ class ITVBaseIE(InfoExtractor):
class ITVIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?:(?P<w>watch)|hub)/[^/]+/(?(w)[\w-]+/)(?P<id>\w+)'
- _IE_DESC = 'ITVX'
+ IE_DESC = 'ITVX'
+ _WORKING = False
+
_TESTS = [{
'note': 'Hub URLs redirect to ITVX',
'url': 'https://www.itv.com/hub/liar/2a4547a0012',
@@ -270,7 +263,7 @@ class ITVIE(ITVBaseIE):
'ext': determine_ext(href, 'vtt'),
})
- next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default='{}')
+ next_data = self._search_nextjs_data(webpage, video_id, fatal=False, default={})
video_data.update(traverse_obj(next_data, ('props', 'pageProps', ('title', 'episode')), expected_type=dict)[0] or {})
title = traverse_obj(video_data, 'headerTitle', 'episodeTitle')
info = self._og_extract(webpage, require_title=not title)
@@ -323,7 +316,7 @@ class ITVIE(ITVBaseIE):
class ITVBTCCIE(ITVBaseIE):
_VALID_URL = r'https?://(?:www\.)?itv\.com/(?!(?:watch|hub)/)(?:[^/]+/)+(?P<id>[^/?#&]+)'
- _IE_DESC = 'ITV articles: News, British Touring Car Championship'
+ IE_DESC = 'ITV articles: News, British Touring Car Championship'
_TESTS = [{
'note': 'British Touring Car Championship',
'url': 'https://www.itv.com/btcc/articles/btcc-2018-all-the-action-from-brands-hatch',
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 69319857d..2b5e2c15c 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import itertools
@@ -10,7 +11,7 @@ from ..compat import (
compat_ord,
compat_str,
compat_urllib_parse_unquote,
- compat_zip
+ compat_zip as zip,
)
from ..utils import (
int_or_none,
@@ -24,7 +25,7 @@ class MixcloudBaseIE(InfoExtractor):
def _call_api(self, object_type, object_fields, display_id, username, slug=None):
lookup_key = object_type + 'Lookup'
return self._download_json(
- 'https://www.mixcloud.com/graphql', display_id, query={
+ 'https://app.mixcloud.com/graphql', display_id, query={
'query': '''{
%s(lookup: {username: "%s"%s}) {
%s
@@ -44,7 +45,7 @@ class MixcloudIE(MixcloudBaseIE):
'ext': 'm4a',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
- 'uploader': 'Daniel Holbach',
+ 'uploader': 'dholbach', # was: 'Daniel Holbach',
'uploader_id': 'dholbach',
'thumbnail': r're:https?://.*\.jpg',
'view_count': int,
@@ -57,7 +58,7 @@ class MixcloudIE(MixcloudBaseIE):
'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
'ext': 'mp3',
'title': 'Caribou 7 inch Vinyl Mix & Chat',
- 'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
+ 'description': r're:Last week Dan Snaith aka Caribou swung by the Brownswood.{136}',
'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
'thumbnail': 're:https?://.*',
@@ -65,6 +66,23 @@ class MixcloudIE(MixcloudBaseIE):
'timestamp': 1422987057,
'upload_date': '20150203',
},
+ 'params': {
+ 'skip_download': '404 not found',
+ },
+ }, {
+ 'url': 'https://www.mixcloud.com/gillespeterson/carnival-m%C3%BAsica-popular-brasileira-mix/',
+ 'info_dict': {
+ 'id': 'gillespeterson_carnival-música-popular-brasileira-mix',
+ 'ext': 'm4a',
+ 'title': 'Carnival Música Popular Brasileira Mix',
+ 'description': r're:Gilles was recently in Brazil to play at Boiler Room.{208}',
+ 'timestamp': 1454347174,
+ 'upload_date': '20160201',
+ 'uploader': 'Gilles Peterson Worldwide',
+ 'uploader_id': 'gillespeterson',
+ 'thumbnail': 're:https?://.*',
+ 'view_count': int,
+ },
}, {
'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
'only_matching': True,
@@ -76,10 +94,10 @@ class MixcloudIE(MixcloudBaseIE):
"""Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
return ''.join([
compat_chr(compat_ord(ch) ^ compat_ord(k))
- for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
+ for ch, k in zip(ciphertext, itertools.cycle(key))])
def _real_extract(self, url):
- username, slug = re.match(self._VALID_URL, url).groups()
+ username, slug = self._match_valid_url(url).groups()
username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
track_id = '%s_%s' % (username, slug)
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 8d537d7ae..1ee78edbc 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -1,407 +1,394 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
+import functools
import re
from .common import InfoExtractor
-from ..compat import compat_str
+from .youtube import YoutubeIE
from ..utils import (
clean_html,
determine_ext,
+ ExtractorError,
float_or_none,
- HEADRequest,
int_or_none,
- orderedSet,
- remove_end,
- str_or_none,
+ merge_dicts,
+ mimetype2ext,
+ parse_age_limit,
+ parse_iso8601,
strip_jsonp,
- unescapeHTML,
+ txt_or_none,
unified_strdate,
+ update_url_query,
url_or_none,
)
-
-
-class ORFTVthekIE(InfoExtractor):
- IE_NAME = 'orf:tvthek'
- IE_DESC = 'ORF TVthek'
- _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
+from ..traversal import T, traverse_obj
+
+k_float_or_none = functools.partial(float_or_none, scale=1000)
+
+
+class ORFRadioBase(InfoExtractor):
+ STATION_INFO = {
+ 'fm4': ('fm4', 'fm4', 'orffm4'),
+ 'noe': ('noe', 'oe2n', 'orfnoe'),
+ 'wien': ('wie', 'oe2w', 'orfwie'),
+ 'burgenland': ('bgl', 'oe2b', 'orfbgl'),
+ 'ooe': ('ooe', 'oe2o', 'orfooe'),
+ 'steiermark': ('stm', 'oe2st', 'orfstm'),
+ 'kaernten': ('ktn', 'oe2k', 'orfktn'),
+ 'salzburg': ('sbg', 'oe2s', 'orfsbg'),
+ 'tirol': ('tir', 'oe2t', 'orftir'),
+ 'vorarlberg': ('vbg', 'oe2v', 'orfvbg'),
+ 'oe3': ('oe3', 'oe3', 'orfoe3'),
+ 'oe1': ('oe1', 'oe1', 'orfoe1'),
+ }
+ _ID_NAMES = ('id', 'guid', 'program')
+
+ @classmethod
+ def _get_item_id(cls, data):
+ return traverse_obj(data, *cls._ID_NAMES, expected_type=txt_or_none)
+
+ @classmethod
+ def _get_api_payload(cls, data, expected_id, in_payload=False):
+ if expected_id not in traverse_obj(data, ('payload',)[:1 if in_payload else 0] + (cls._ID_NAMES, T(txt_or_none))):
+ raise ExtractorError('Unexpected API data result', video_id=expected_id)
+ return data['payload']
+
+ @staticmethod
+ def _extract_podcast_upload(data):
+ return traverse_obj(data, {
+ 'url': ('enclosures', 0, 'url'),
+ 'ext': ('enclosures', 0, 'type', T(mimetype2ext)),
+ 'filesize': ('enclosures', 0, 'length', T(int_or_none)),
+ 'title': ('title', T(txt_or_none)),
+ 'description': ('description', T(clean_html)),
+ 'timestamp': (('published', 'postDate'), T(parse_iso8601)),
+ 'duration': ('duration', T(k_float_or_none)),
+ 'series': ('podcast', 'title'),
+ 'uploader': ((('podcast', 'author'), 'station'), T(txt_or_none)),
+ 'uploader_id': ('podcast', 'channel', T(txt_or_none)),
+ }, get_all=False)
+
+ @classmethod
+ def _entries(cls, data, station, item_type=None):
+ if item_type in ('upload', 'podcast-episode'):
+ yield merge_dicts({
+ 'id': cls._get_item_id(data),
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ }, cls._extract_podcast_upload(data), rev=True)
+ return
+
+ loop_station = cls.STATION_INFO[station][1]
+ for info in traverse_obj(data, ((('streams', Ellipsis), 'stream'), T(lambda v: v if v['loopStreamId'] else None))):
+ item_id = info['loopStreamId']
+ host = info.get('host') or 'loopstream01.apa.at'
+ yield merge_dicts({
+ 'id': item_id.replace('.mp3', ''),
+ 'ext': 'mp3',
+ 'url': update_url_query('https://{0}/'.format(host), {
+ 'channel': loop_station,
+ 'id': item_id,
+ }),
+ 'vcodec': 'none',
+ # '_old_archive_ids': [make_archive_id(old_ie, video_id)],
+ }, traverse_obj(data, {
+ 'title': ('title', T(txt_or_none)),
+ 'description': ('subtitle', T(clean_html)),
+ 'uploader': 'station',
+ 'series': ('programTitle', T(txt_or_none)),
+ }), traverse_obj(info, {
+ 'duration': (('duration',
+ (None, T(lambda x: x['end'] - x['start']))),
+ T(k_float_or_none), any),
+ 'timestamp': (('start', 'startISO'), T(parse_iso8601), any),
+ }))
+
+
+class ORFRadioIE(ORFRadioBase):
+ IE_NAME = 'orf:sound'
+ _STATION_RE = '|'.join(map(re.escape, ORFRadioBase.STATION_INFO.keys()))
+
+ _VALID_URL = (
+ r'https?://sound\.orf\.at/radio/(?P<station>{0})/sendung/(?P<id>\d+)(?:/(?P<show>\w+))?'.format(_STATION_RE),
+ r'https?://(?P<station>{0})\.orf\.at/player/(?P<date>\d{{8}})/(?P<id>\d+)'.format(_STATION_RE),
+ )
_TESTS = [{
- 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
+ 'url': 'https://sound.orf.at/radio/ooe/sendung/37802/guten-morgen-oberoesterreich-am-feiertag',
+ 'info_dict': {
+ 'id': '37802',
+ 'title': 'Guten Morgen Oberösterreich am Feiertag',
+ 'description': 'Oberösterreichs meistgehörte regionale Frühsendung.\nRegionale Nachrichten zu jeder halben Stunde.\nModeration: Wolfgang Lehner\nNachrichten: Stephan Schnabl',
+ },
'playlist': [{
- 'md5': '2942210346ed779588f428a92db88712',
+ 'md5': 'f9ff8517dd681b642a2c900e2c9e6085',
'info_dict': {
- 'id': '8896777',
- 'ext': 'mp4',
- 'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
- 'description': 'md5:c1272f0245537812d4e36419c207b67d',
- 'duration': 2668,
- 'upload_date': '20141208',
- },
+ 'id': '2024-05-30_0559_tl_66_7DaysThu1_443862',
+ 'ext': 'mp3',
+ 'title': 'Guten Morgen Oberösterreich am Feiertag',
+ 'description': 'Oberösterreichs meistgehörte regionale Frühsendung.\nRegionale Nachrichten zu jeder halben Stunde.\nModeration: Wolfgang Lehner\nNachrichten: Stephan Schnabl',
+ 'timestamp': 1717041587,
+ 'upload_date': '20240530',
+ 'uploader': 'ooe',
+ 'duration': 14413.0,
+ }
}],
- 'skip': 'Blocked outside of Austria / Germany',
+ 'skip': 'Shows from ORF Sound are only available for 30 days.'
}, {
- 'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
+ 'url': 'https://oe1.orf.at/player/20240531/758136',
+ 'md5': '2397717aaf3ae9c22a4f090ee3b8d374',
'info_dict': {
- 'id': '7982259',
- 'ext': 'mp4',
- 'title': 'Best of Ingrid Thurnher',
- 'upload_date': '20140527',
- 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
- },
- 'params': {
- 'skip_download': True, # rtsp downloads
+ 'id': '2024-05-31_1905_tl_51_7DaysFri35_2413387',
+ 'ext': 'mp3',
+ 'title': '"Who Cares?"',
+ 'description': 'Europas größte Netzkonferenz re:publica 2024',
+ 'timestamp': 1717175100,
+ 'upload_date': '20240531',
+ 'uploader': 'oe1',
+ 'duration': 1500,
},
- 'skip': 'Blocked outside of Austria / Germany',
- }, {
- 'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
- 'only_matching': True,
- }, {
- 'url': 'http://tvthek.orf.at/profile/Universum/35429',
- 'only_matching': True,
+ 'skip': 'Shows from ORF Sound are only available for 30 days.'
}]
def _real_extract(self, url):
- playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
+ m = self._match_valid_url(url)
+ station, show_id = m.group('station', 'id')
+ api_station, _, _ = self.STATION_INFO[station]
+ if 'date' in m.groupdict():
+ data = self._download_json(
+ 'https://audioapi.orf.at/{0}/json/4.0/broadcast/{1}/{2}?_o={3}.orf.at'.format(
+ api_station, show_id, m.group('date'), station), show_id)
+ show_id = data['id']
+ else:
+ data = self._download_json(
+ 'https://audioapi.orf.at/{0}/api/json/5.0/broadcast/{1}?_o=sound.orf.at'.format(
+ api_station, show_id), show_id)
- data_jsb = self._parse_json(
- self._search_regex(
- r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
- webpage, 'playlist', group='json'),
- playlist_id, transform_source=unescapeHTML)['playlist']['videos']
+ data = self._get_api_payload(data, show_id, in_payload=True)
- entries = []
- for sd in data_jsb:
- video_id, title = sd.get('id'), sd.get('title')
- if not video_id or not title:
- continue
- video_id = compat_str(video_id)
- formats = []
- for fd in sd['sources']:
- src = url_or_none(fd.get('src'))
- if not src:
- continue
- format_id_list = []
- for key in ('delivery', 'quality', 'quality_string'):
- value = fd.get(key)
- if value:
- format_id_list.append(value)
- format_id = '-'.join(format_id_list)
- ext = determine_ext(src)
- if ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
- if any('/geoprotection' in f['url'] for f in m3u8_formats):
- self.raise_geo_restricted()
- formats.extend(m3u8_formats)
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- src, video_id, f4m_id=format_id, fatal=False))
- elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src, video_id, mpd_id=format_id, fatal=False))
- else:
- formats.append({
- 'format_id': format_id,
- 'url': src,
- 'protocol': fd.get('protocol'),
- })
-
- # Check for geoblocking.
- # There is a property is_geoprotection, but that's always false
- geo_str = sd.get('geoprotection_string')
- if geo_str:
- try:
- http_url = next(
- f['url']
- for f in formats
- if re.match(r'^https?://.*\.mp4$', f['url']))
- except StopIteration:
- pass
- else:
- req = HEADRequest(http_url)
- self._request_webpage(
- req, video_id,
- note='Testing for geoblocking',
- errnote=((
- 'This video seems to be blocked outside of %s. '
- 'You may want to try the streaming-* formats.')
- % geo_str),
- fatal=False)
-
- self._check_formats(formats, video_id)
- self._sort_formats(formats)
+ # site sends ISO8601 GMT date-times with separate TZ offset, ignored
+ # TODO: should `..._date` be calculated relative to TZ?
- subtitles = {}
- for sub in sd.get('subtitles', []):
- sub_src = sub.get('src')
- if not sub_src:
- continue
- subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
- 'url': sub_src,
- })
-
- upload_date = unified_strdate(sd.get('created_date'))
+ return merge_dicts(
+ {'_type': 'multi_video'},
+ self.playlist_result(
+ self._entries(data, station), show_id,
+ txt_or_none(data.get('title')),
+ clean_html(data.get('subtitle'))))
- thumbnails = []
- preview = sd.get('preview_image_url')
- if preview:
- thumbnails.append({
- 'id': 'preview',
- 'url': preview,
- 'preference': 0,
- })
- image = sd.get('image_full_url')
- if not image and len(data_jsb) == 1:
- image = self._og_search_thumbnail(webpage)
- if image:
- thumbnails.append({
- 'id': 'full',
- 'url': image,
- 'preference': 1,
- })
- entries.append({
- '_type': 'video',
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'subtitles': subtitles,
- 'description': sd.get('description'),
- 'duration': int_or_none(sd.get('duration_in_seconds')),
- 'upload_date': upload_date,
- 'thumbnails': thumbnails,
- })
-
- return {
- '_type': 'playlist',
- 'entries': entries,
- 'id': playlist_id,
- }
-
-
-class ORFRadioIE(InfoExtractor):
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- show_date = mobj.group('date')
- show_id = mobj.group('show')
+class ORFRadioCollectionIE(ORFRadioBase):
+ IE_NAME = 'orf:collection'
+ _VALID_URL = r'https?://sound\.orf\.at/collection/(?P<coll_id>\d+)(?:/(?P<item_id>\d+))?'
- data = self._download_json(
- 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
- % (self._API_STATION, show_id, show_date), show_id)
-
- entries = []
- for info in data['streams']:
- loop_stream_id = str_or_none(info.get('loopStreamId'))
- if not loop_stream_id:
- continue
- title = str_or_none(data.get('title'))
- if not title:
- continue
- start = int_or_none(info.get('start'), scale=1000)
- end = int_or_none(info.get('end'), scale=1000)
- duration = end - start if end and start else None
- entries.append({
- 'id': loop_stream_id.replace('.mp3', ''),
- 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
- 'title': title,
- 'description': clean_html(data.get('subtitle')),
- 'duration': duration,
- 'timestamp': start,
+ _TESTS = [{
+ 'url': 'https://sound.orf.at/collection/4/61908/was-das-uberschreiten-des-15-limits-bedeutet',
+ 'info_dict': {
+ 'id': '2577582',
+ },
+ 'playlist': [{
+ 'md5': '5789cec7d75575ff58d19c0428c80eb3',
+ 'info_dict': {
+ 'id': '2024-06-06_1659_tl_54_7DaysThu6_153926',
'ext': 'mp3',
- 'series': data.get('programTitle'),
- })
-
- return {
- '_type': 'playlist',
- 'id': show_id,
- 'title': data.get('title'),
- 'description': clean_html(data.get('subtitle')),
- 'entries': entries,
- }
-
-
-class ORFFM4IE(ORFRadioIE):
- IE_NAME = 'orf:fm4'
- IE_DESC = 'radio FM4'
- _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)'
- _API_STATION = 'fm4'
- _LOOP_STATION = 'fm4'
-
- _TEST = {
- 'url': 'http://fm4.orf.at/player/20170107/4CC',
- 'md5': '2b0be47375432a7ef104453432a19212',
+ 'title': 'Klimakrise: Was das Überschreiten des 1,5°-Limits bedeutet',
+ 'timestamp': 1717686674,
+ 'upload_date': '20240606',
+ 'uploader': 'fm4',
+ },
+ }],
+ 'skip': 'Shows from ORF Sound are only available for 30 days.'
+ }, {
+ # persistent playlist (FM4 Highlights)
+ 'url': 'https://sound.orf.at/collection/4/',
'info_dict': {
- 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
- 'ext': 'mp3',
- 'title': 'Solid Steel Radioshow',
- 'description': 'Die Mixshow von Coldcut und Ninja Tune.',
- 'duration': 3599,
- 'timestamp': 1483819257,
- 'upload_date': '20170107',
+ 'id': '4',
},
- 'skip': 'Shows from ORF radios are only available for 7 days.',
- 'only_matching': True,
- }
-
-
-class ORFNOEIE(ORFRadioIE):
- IE_NAME = 'orf:noe'
- IE_DESC = 'Radio Niederösterreich'
- _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'noe'
- _LOOP_STATION = 'oe2n'
-
- _TEST = {
- 'url': 'https://noe.orf.at/player/20200423/NGM',
- 'only_matching': True,
- }
-
-
-class ORFWIEIE(ORFRadioIE):
- IE_NAME = 'orf:wien'
- IE_DESC = 'Radio Wien'
- _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'wie'
- _LOOP_STATION = 'oe2w'
-
- _TEST = {
- 'url': 'https://wien.orf.at/player/20200423/WGUM',
- 'only_matching': True,
- }
-
-
-class ORFBGLIE(ORFRadioIE):
- IE_NAME = 'orf:burgenland'
- IE_DESC = 'Radio Burgenland'
- _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'bgl'
- _LOOP_STATION = 'oe2b'
-
- _TEST = {
- 'url': 'https://burgenland.orf.at/player/20200423/BGM',
- 'only_matching': True,
- }
-
-
-class ORFOOEIE(ORFRadioIE):
- IE_NAME = 'orf:oberoesterreich'
- IE_DESC = 'Radio Oberösterreich'
- _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'ooe'
- _LOOP_STATION = 'oe2o'
+ 'playlist_mincount': 10,
+ 'playlist_maxcount': 13,
+ }]
- _TEST = {
- 'url': 'https://ooe.orf.at/player/20200423/OGMO',
- 'only_matching': True,
- }
+ def _real_extract(self, url):
+ coll_id, item_id = self._match_valid_url(url).group('coll_id', 'item_id')
+ data = self._download_json(
+ 'https://collector.orf.at/api/frontend/collections/{0}?_o=sound.orf.at'.format(
+ coll_id), coll_id)
+ data = self._get_api_payload(data, coll_id, in_payload=True)
+
+ def yield_items():
+ for item in traverse_obj(data, (
+ 'content', 'items', lambda _, v: any(k in v['target']['params'] for k in self._ID_NAMES))):
+ if item_id is None or item_id == txt_or_none(item.get('id')):
+ target = item['target']
+ typed_item_id = self._get_item_id(target['params'])
+ station = target['params'].get('station')
+ item_type = target.get('type')
+ if typed_item_id and (station or item_type):
+ yield station, typed_item_id, item_type
+ if item_id is not None:
+ break
+ else:
+ if item_id is not None:
+ raise ExtractorError('Item not found in collection',
+ video_id=coll_id, expected=True)
+
+ def item_playlist(station, typed_item_id, item_type):
+ if item_type == 'upload':
+ item_data = self._download_json('https://audioapi.orf.at/radiothek/api/2.0/upload/{0}?_o=sound.orf.at'.format(
+ typed_item_id), typed_item_id)
+ elif item_type == 'podcast-episode':
+ item_data = self._download_json('https://audioapi.orf.at/radiothek/api/2.0/episode/{0}?_o=sound.orf.at'.format(
+ typed_item_id), typed_item_id)
+ else:
+ api_station, _, _ = self.STATION_INFO[station]
+ item_data = self._download_json(
+ 'https://audioapi.orf.at/{0}/api/json/5.0/{1}/{2}?_o=sound.orf.at'.format(
+ api_station, item_type or 'broadcastitem', typed_item_id), typed_item_id)
+ item_data = self._get_api_payload(item_data, typed_item_id, in_payload=True)
-class ORFSTMIE(ORFRadioIE):
- IE_NAME = 'orf:steiermark'
- IE_DESC = 'Radio Steiermark'
- _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'stm'
- _LOOP_STATION = 'oe2st'
+ return merge_dicts(
+ {'_type': 'multi_video'},
+ self.playlist_result(
+ self._entries(item_data, station, item_type), typed_item_id,
+ txt_or_none(data.get('title')),
+ clean_html(data.get('subtitle'))))
- _TEST = {
- 'url': 'https://steiermark.orf.at/player/20200423/STGMS',
- 'only_matching': True,
- }
+ def yield_item_entries():
+ for station, typed_id, item_type in yield_items():
+ yield item_playlist(station, typed_id, item_type)
+ if item_id is not None:
+ # coll_id = '/'.join((coll_id, item_id))
+ return next(yield_item_entries())
-class ORFKTNIE(ORFRadioIE):
- IE_NAME = 'orf:kaernten'
- IE_DESC = 'Radio Kärnten'
- _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'ktn'
- _LOOP_STATION = 'oe2k'
+ return self.playlist_result(yield_item_entries(), coll_id, data.get('title'))
- _TEST = {
- 'url': 'https://kaernten.orf.at/player/20200423/KGUMO',
- 'only_matching': True,
- }
+class ORFPodcastIE(ORFRadioBase):
+ IE_NAME = 'orf:podcast'
+ _STATION_RE = '|'.join(map(re.escape, (x[0] for x in ORFRadioBase.STATION_INFO.values()))) + '|tv'
+ _VALID_URL = r'https?://sound\.orf\.at/podcast/(?P<station>{0})/(?P<show>[\w-]+)/(?P<id>[\w-]+)'.format(_STATION_RE)
+ _TESTS = [{
+ 'url': 'https://sound.orf.at/podcast/stm/der-kraeutertipp-von-christine-lackner/rotklee',
+ 'md5': '1f2bab2ba90c2ce0c2754196ea78b35f',
+ 'info_dict': {
+ 'id': 'der-kraeutertipp-von-christine-lackner/rotklee',
+ 'ext': 'mp3',
+ 'title': 'Rotklee',
+ 'description': 'In der Natur weit verbreitet - in der Medizin längst anerkennt: Rotklee. Dieser Podcast begleitet die Sendung "Radio Steiermark am Vormittag", Radio Steiermark, 28. Mai 2024.',
+ 'timestamp': 1716891761,
+ 'upload_date': '20240528',
+ 'uploader_id': 'stm_kraeutertipp',
+ 'uploader': 'ORF Radio Steiermark',
+ 'duration': 101,
+ 'series': 'Der Kräutertipp von Christine Lackner',
+ },
+ 'skip': 'ORF podcasts are only available for a limited time'
+ }]
-class ORFSBGIE(ORFRadioIE):
- IE_NAME = 'orf:salzburg'
- IE_DESC = 'Radio Salzburg'
- _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'sbg'
- _LOOP_STATION = 'oe2s'
+ _ID_NAMES = ('slug', 'guid')
- _TEST = {
- 'url': 'https://salzburg.orf.at/player/20200423/SGUM',
- 'only_matching': True,
- }
+ def _real_extract(self, url):
+ station, show, show_id = self._match_valid_url(url).group('station', 'show', 'id')
+ data = self._download_json(
+ 'https://audioapi.orf.at/radiothek/api/2.0/podcast/{0}/{1}/{2}'.format(
+ station, show, show_id), show_id)
+ data = self._get_api_payload(data, show_id, in_payload=True)
+ return merge_dicts({
+ 'id': '/'.join((show, show_id)),
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ }, self._extract_podcast_upload(data), rev=True)
-class ORFTIRIE(ORFRadioIE):
- IE_NAME = 'orf:tirol'
- IE_DESC = 'Radio Tirol'
- _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'tir'
- _LOOP_STATION = 'oe2t'
- _TEST = {
- 'url': 'https://tirol.orf.at/player/20200423/TGUMO',
- 'only_matching': True,
- }
+class ORFIPTVBase(InfoExtractor):
+ _TITLE_STRIP_RE = ''
+ def _extract_video(self, video_id, webpage, fatal=False):
-class ORFVBGIE(ORFRadioIE):
- IE_NAME = 'orf:vorarlberg'
- IE_DESC = 'Radio Vorarlberg'
- _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'vbg'
- _LOOP_STATION = 'oe2v'
+ data = self._download_json(
+ 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
+ video_id)[0]
- _TEST = {
- 'url': 'https://vorarlberg.orf.at/player/20200423/VGUM',
- 'only_matching': True,
- }
+ video = traverse_obj(data, (
+ 'sources', ('default', 'q8c'),
+ T(lambda x: x if x['loadBalancerUrl'] else None),
+ any))
+ load_balancer_url = video['loadBalancerUrl']
-class ORFOE3IE(ORFRadioIE):
- IE_NAME = 'orf:oe3'
- IE_DESC = 'Radio Österreich 3'
- _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'oe3'
- _LOOP_STATION = 'oe3'
+ try:
+ rendition = self._download_json(
+ load_balancer_url, video_id, transform_source=strip_jsonp)
+ except ExtractorError:
+ rendition = None
+
+ if not rendition:
+ rendition = {
+ 'redirect': {
+ 'smil': re.sub(
+ r'(/)jsonp(/.+\.)mp4$', r'\1dash\2smil/manifest.mpd',
+ load_balancer_url),
+ },
+ }
- _TEST = {
- 'url': 'https://oe3.orf.at/player/20200424/3WEK',
- 'only_matching': True,
- }
+ f = traverse_obj(video, {
+ 'abr': ('audioBitrate', T(int_or_none)),
+ 'vbr': ('bitrate', T(int_or_none)),
+ 'fps': ('videoFps', T(int_or_none)),
+ 'width': ('videoWidth', T(int_or_none)),
+ 'height': ('videoHeight', T(int_or_none)),
+ })
+ formats = []
+ for format_id, format_url in traverse_obj(rendition, (
+ 'redirect', T(dict.items), Ellipsis)):
+ if format_id == 'rtmp':
+ ff = f.copy()
+ ff.update({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ formats.append(ff)
+ elif determine_ext(format_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ format_url, video_id, f4m_id=format_id))
+ elif determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', m3u8_id=format_id,
+ entry_protocol='m3u8_native'))
+ elif determine_ext(format_url) == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, mpd_id=format_id))
-class ORFOE1IE(ORFRadioIE):
- IE_NAME = 'orf:oe1'
- IE_DESC = 'Radio Österreich 1'
- _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
- _API_STATION = 'oe1'
- _LOOP_STATION = 'oe1'
+ if formats or fatal:
+ self._sort_formats(formats)
+ else:
+ return
- _TEST = {
- 'url': 'http://oe1.orf.at/player/20170108/456544',
- 'md5': '34d8a6e67ea888293741c86a099b745b',
- 'info_dict': {
- 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
- 'ext': 'mp3',
- 'title': 'Morgenjournal',
- 'duration': 609,
- 'timestamp': 1483858796,
- 'upload_date': '20170108',
- },
- 'skip': 'Shows from ORF radios are only available for 7 days.'
- }
+ return merge_dicts({
+ 'id': video_id,
+ 'title': re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage)),
+ 'description': self._og_search_description(webpage),
+ 'upload_date': unified_strdate(self._html_search_meta(
+ 'dc.date', webpage, 'upload date', fatal=False)),
+ 'formats': formats,
+ }, traverse_obj(data, {
+ 'duration': ('duration', T(k_float_or_none)),
+ 'thumbnail': ('sources', 'default', 'preview', T(url_or_none)),
+ }), rev=True)
-class ORFIPTVIE(InfoExtractor):
+class ORFIPTVIE(ORFIPTVBase):
IE_NAME = 'orf:iptv'
IE_DESC = 'iptv.ORF.at'
+ _WORKING = False # URLs redirect to orf.at/
_VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
+ _TITLE_STRIP_RE = r'\s+-\s+iptv\.ORF\.at\S*$'
_TEST = {
'url': 'http://iptv.orf.at/stories/2275236/',
@@ -426,74 +413,32 @@ class ORFIPTVIE(InfoExtractor):
video_id = self._search_regex(
r'data-video(?:id)?="(\d+)"', webpage, 'video id')
- data = self._download_json(
- 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
- video_id)[0]
-
- duration = float_or_none(data['duration'], 1000)
+ return self._extract_video(video_id, webpage)
- video = data['sources']['default']
- load_balancer_url = video['loadBalancerUrl']
- abr = int_or_none(video.get('audioBitrate'))
- vbr = int_or_none(video.get('bitrate'))
- fps = int_or_none(video.get('videoFps'))
- width = int_or_none(video.get('videoWidth'))
- height = int_or_none(video.get('videoHeight'))
- thumbnail = video.get('preview')
-
- rendition = self._download_json(
- load_balancer_url, video_id, transform_source=strip_jsonp)
-
- f = {
- 'abr': abr,
- 'vbr': vbr,
- 'fps': fps,
- 'width': width,
- 'height': height,
- }
- formats = []
- for format_id, format_url in rendition['redirect'].items():
- if format_id == 'rtmp':
- ff = f.copy()
- ff.update({
- 'url': format_url,
- 'format_id': format_id,
- })
- formats.append(ff)
- elif determine_ext(format_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(
- format_url, video_id, f4m_id=format_id))
- elif determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', m3u8_id=format_id))
- else:
- continue
- self._sort_formats(formats)
-
- title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at')
- description = self._og_search_description(webpage)
- upload_date = unified_strdate(self._html_search_meta(
- 'dc.date', webpage, 'upload date'))
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'formats': formats,
- }
-
-
-class ORFFM4StoryIE(InfoExtractor):
+class ORFFM4StoryIE(ORFIPTVBase):
IE_NAME = 'orf:fm4:story'
IE_DESC = 'fm4.orf.at stories'
_VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)'
+ _TITLE_STRIP_RE = r'\s+-\s+fm4\.ORF\.at\s*$'
- _TEST = {
+ _TESTS = [{
+ 'url': 'https://fm4.orf.at/stories/3041554/',
+ 'add_ie': ['Youtube'],
+ 'info_dict': {
+ 'id': '3041554',
+ 'title': 'Is The EU Green Deal In Mortal Danger?',
+ },
+ 'playlist_count': 4,
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }, {
'url': 'http://fm4.orf.at/stories/2865738/',
+ 'info_dict': {
+ 'id': '2865738',
+ 'title': 'Manu Delago und Inner Tongue live',
+ },
'playlist': [{
'md5': 'e1c2c706c45c7b34cf478bbf409907ca',
'info_dict': {
@@ -510,83 +455,311 @@ class ORFFM4StoryIE(InfoExtractor):
'info_dict': {
'id': '547798',
'ext': 'flv',
- 'title': 'Manu Delago und Inner Tongue live (2)',
+ 'title': 'Manu Delago und Inner Tongue https://vod-ww.mdn.ors.at/cms-worldwide_episodes_nas/_definst_/nas/cms-worldwide_episodes/online/14228823_0005.smil/chunklist_b992000_vo.m3u8live (2)',
'duration': 1504.08,
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20170913',
'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
},
}],
- }
+ 'skip': 'Videos gone',
+ }]
def _real_extract(self, url):
story_id = self._match_id(url)
webpage = self._download_webpage(url, story_id)
entries = []
- all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage))
- for idx, video_id in enumerate(all_ids):
- data = self._download_json(
- 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
- video_id)[0]
+ seen_ids = set()
+ for idx, video_id in enumerate(re.findall(r'data-video(?:id)?="(\d+)"', webpage)):
+ if video_id in seen_ids:
+ continue
+ seen_ids.add(video_id)
+ entry = self._extract_video(video_id, webpage, fatal=False)
+ if not entry:
+ continue
+
+ if idx >= 1:
+ # Titles are duplicates, make them unique
+ entry['title'] = '%s (%d)' % (entry['title'], idx)
- duration = float_or_none(data['duration'], 1000)
+ entries.append(entry)
- video = data['sources']['q8c']
- load_balancer_url = video['loadBalancerUrl']
- abr = int_or_none(video.get('audioBitrate'))
- vbr = int_or_none(video.get('bitrate'))
- fps = int_or_none(video.get('videoFps'))
- width = int_or_none(video.get('videoWidth'))
- height = int_or_none(video.get('videoHeight'))
- thumbnail = video.get('preview')
+ seen_ids = set()
+ for yt_id in re.findall(
+ r'data-id\s*=\s*["\']([\w-]+)[^>]+\bclass\s*=\s*["\']youtube\b',
+ webpage):
+ if yt_id in seen_ids:
+ continue
+ seen_ids.add(yt_id)
+ if YoutubeIE.suitable(yt_id):
+ entries.append(self.url_result(yt_id, ie='Youtube', video_id=yt_id))
+
+ return self.playlist_result(
+ entries, story_id,
+ re.sub(self._TITLE_STRIP_RE, '', self._og_search_title(webpage, default='') or None))
+
+
+class ORFONBase(InfoExtractor):
+ _ENC_PFX = '3dSlfek03nsLKdj4Jsd'
+ _API_PATH = 'episode'
+
+ def _call_api(self, video_id, **kwargs):
+ encrypted_id = base64.b64encode('{0}{1}'.format(
+ self._ENC_PFX, video_id).encode('utf-8')).decode('ascii')
+ return self._download_json(
+ 'https://api-tvthek.orf.at/api/v4.3/public/{0}/encrypted/{1}'.format(
+ self._API_PATH, encrypted_id),
+ video_id, **kwargs)
+
+ @classmethod
+ def _parse_metadata(cls, api_json):
+ return traverse_obj(api_json, {
+ 'id': ('id', T(int), T(txt_or_none)),
+ 'age_limit': ('age_classification', T(parse_age_limit)),
+ 'duration': ((('exact_duration', T(k_float_or_none)),
+ ('duration_second', T(float_or_none))),),
+ 'title': (('title', 'headline'), T(txt_or_none)),
+ 'description': (('description', 'teaser_text'), T(txt_or_none)),
+ # 'media_type': ('video_type', T(txt_or_none)),
+ 'thumbnail': ('_embedded', 'image', 'public_urls', 'highlight_teaser', 'url', T(url_or_none)),
+ 'timestamp': (('date', 'episode_date'), T(parse_iso8601)),
+ 'release_timestamp': ('release_date', T(parse_iso8601)),
+ # 'modified_timestamp': ('updated_at', T(parse_iso8601)),
+ }, get_all=False)
+
+ def _extract_video(self, video_id, segment_id):
+ # Not a segmented episode: return single video
+ # Segmented episode without valid segment id: return entire playlist
+ # Segmented episode with valid segment id and yes-playlist: return entire playlist
+ # Segmented episode with valid segment id and no-playlist: return single video corresponding to segment id
+ # If a multi_video playlist would be returned, but an unsegmented source exists, that source is chosen instead.
+
+ api_json = self._call_api(video_id)
+
+ if traverse_obj(api_json, 'is_drm_protected'):
+ self.report_drm(video_id)
+
+ # updates formats, subtitles
+ def extract_sources(src_json, video_id):
+ for manifest_type in traverse_obj(src_json, ('sources', T(dict.keys), Ellipsis)):
+ for manifest_url in traverse_obj(src_json, ('sources', manifest_type, Ellipsis, 'src', T(url_or_none))):
+ if manifest_type == 'hls':
+ fmts, subs = self._extract_m3u8_formats(
+ manifest_url, video_id, fatal=False, m3u8_id='hls',
+ ext='mp4', entry_protocol='m3u8_native'), {}
+ for f in fmts:
+ if '_vo.' in f['url']:
+ f['acodec'] = 'none'
+ elif manifest_type == 'dash':
+ fmts, subs = self._extract_mpd_formats_and_subtitles(
+ manifest_url, video_id, fatal=False, mpd_id='dash')
+ else:
+ continue
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+
+ formats, subtitles = [], {}
+ if segment_id is None:
+ extract_sources(api_json, video_id)
+ if not formats:
+ segments = traverse_obj(api_json, (
+ '_embedded', 'segments', lambda _, v: v['id']))
+ if len(segments) > 1 and segment_id is not None:
+ if not self._yes_playlist(video_id, segment_id, playlist_label='collection', video_label='segment'):
+ segments = [next(s for s in segments if txt_or_none(s['id']) == segment_id)]
+
+ entries = []
+ for seg in segments:
+ formats, subtitles = [], {}
+ extract_sources(seg, segment_id)
+ self._sort_formats(formats)
+ entries.append(merge_dicts({
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }, self._parse_metadata(seg), rev=True))
+ result = merge_dicts(
+ {'_type': 'multi_video' if len(entries) > 1 else 'playlist'},
+ self._parse_metadata(api_json),
+ self.playlist_result(entries, video_id))
+ # not yet processed in core for playlist/multi
+ self._downloader._fill_common_fields(result)
+ return result
+ else:
+ self._sort_formats(formats)
- rendition = self._download_json(
- load_balancer_url, video_id, transform_source=strip_jsonp)
+ for sub_url in traverse_obj(api_json, (
+ '_embedded', 'subtitle',
+ ('xml_url', 'sami_url', 'stl_url', 'ttml_url', 'srt_url', 'vtt_url'),
+ T(url_or_none))):
+ self._merge_subtitles({'de': [{'url': sub_url}]}, target=subtitles)
- f = {
- 'abr': abr,
- 'vbr': vbr,
- 'fps': fps,
- 'width': width,
- 'height': height,
- }
+ return merge_dicts({
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ # '_old_archive_ids': [self._downloader._make_archive_id({'ie_key': 'ORFTVthek', 'id': video_id})],
+ }, self._parse_metadata(api_json), rev=True)
- formats = []
- for format_id, format_url in rendition['redirect'].items():
- if format_id == 'rtmp':
- ff = f.copy()
- ff.update({
- 'url': format_url,
- 'format_id': format_id,
- })
- formats.append(ff)
- elif determine_ext(format_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(
- format_url, video_id, f4m_id=format_id))
- elif determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', m3u8_id=format_id))
- else:
- continue
- self._sort_formats(formats)
+ def _real_extract(self, url):
+ video_id, segment_id = self._match_valid_url(url).group('id', 'segment')
+ webpage = self._download_webpage(url, video_id)
- title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at')
- if idx >= 1:
- # Titles are duplicates, make them unique
- title += ' (' + str(idx + 1) + ')'
- description = self._og_search_description(webpage)
- upload_date = unified_strdate(self._html_search_meta(
- 'dc.date', webpage, 'upload date'))
-
- entries.append({
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'formats': formats,
- })
-
- return self.playlist_result(entries)
+ # ORF doesn't like 410 or 404
+ if self._search_regex(r'<div\b[^>]*>\s*(Nicht mehr verfügbar)\s*</div>', webpage, 'Availability', default=False):
+ raise ExtractorError('Content is no longer available', expected=True, video_id=video_id)
+
+ return merge_dicts({
+ 'id': video_id,
+ 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
+ 'description': self._html_search_meta(
+ ['description', 'og:description', 'twitter:description'], webpage, default=None),
+ }, self._search_json_ld(webpage, video_id, default={}),
+ self._extract_video(video_id, segment_id),
+ rev=True)
+
+
+class ORFONIE(ORFONBase):
+ IE_NAME = 'orf:on'
+ _VALID_URL = r'https?://on\.orf\.at/video/(?P<id>\d+)(?:/(?P<segment>\d+))?'
+ _TESTS = [{
+ 'url': 'https://on.orf.at/video/14210000/school-of-champions-48',
+ 'info_dict': {
+ 'id': '14210000',
+ 'ext': 'mp4',
+ 'duration': 2651.08,
+ 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0167/98/thumb_16697671_segments_highlight_teaser.jpeg',
+ 'title': 'School of Champions (4/8)',
+ 'description': r're:(?s)Luca hat sein ganzes Leben in den Bergen Südtirols verbracht und ist bei seiner Mutter aufgewachsen, .{1029} Leo$',
+ # 'media_type': 'episode',
+ 'timestamp': 1706558922,
+ 'upload_date': '20240129',
+ 'release_timestamp': 1706472362,
+ 'release_date': '20240128',
+ # 'modified_timestamp': 1712756663,
+ # 'modified_date': '20240410',
+ # '_old_archive_ids': ['orftvthek 14210000'],
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ 'skip': 'Available until 2024-08-12',
+ }, {
+ 'url': 'https://on.orf.at/video/3220355',
+ 'md5': '925a93b2b9a37da5c9b979d7cf71aa2e',
+ 'info_dict': {
+ 'id': '3220355',
+ 'ext': 'mp4',
+ 'duration': 445.04,
+ 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0002/60/thumb_159573_segments_highlight_teaser.png',
+ 'title': '50 Jahre Burgenland: Der Festumzug',
+ 'description': r're:(?s)Aus allen Landesteilen zogen festlich geschmückte Wagen und Musikkapellen .{270} Jenakowitsch$',
+ # 'media_type': 'episode',
+ 'timestamp': 52916400,
+ 'upload_date': '19710905',
+ 'release_timestamp': 52916400,
+ 'release_date': '19710905',
+ # 'modified_timestamp': 1498536049,
+ # 'modified_date': '20170627',
+ # '_old_archive_ids': ['orftvthek 3220355'],
+ },
+ }, {
+ # Video with multiple segments selecting the second segment
+ 'url': 'https://on.orf.at/video/14226549/15639808/jugendbande-einbrueche-aus-langeweile',
+ 'md5': 'fc151bba8c05ea77ab5693617e4a33d3',
+ 'info_dict': {
+ 'id': '15639808',
+ 'ext': 'mp4',
+ 'duration': 97.707,
+ 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0175/43/thumb_17442704_segments_highlight_teaser.jpg',
+ 'title': 'Jugendbande: Einbrüche aus Langeweile',
+ 'description': r're:Jugendbande: Einbrüche aus Langeweile \| Neuer Kinder- und .{259} Wanda$',
+ # 'media_type': 'segment',
+ 'timestamp': 1715792400,
+ 'upload_date': '20240515',
+ # 'modified_timestamp': 1715794394,
+ # 'modified_date': '20240515',
+ # '_old_archive_ids': ['orftvthek 15639808'],
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'format': 'bestvideo',
+ },
+ 'skip': 'Available until 2024-06-14',
+ }, {
+ # Video with multiple segments and no combined version
+ 'url': 'https://on.orf.at/video/14227864/formel-1-grosser-preis-von-monaco-2024',
+ 'info_dict': {
+ '_type': 'multi_video',
+ 'id': '14227864',
+ 'duration': 18410.52,
+ 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0176/04/thumb_17503881_segments_highlight_teaser.jpg',
+ 'title': 'Formel 1: Großer Preis von Monaco 2024',
+ 'description': 'md5:aeeb010710ccf70ce28ccb4482243d4f',
+ # 'media_type': 'episode',
+ 'timestamp': 1716721200,
+ 'upload_date': '20240526',
+ 'release_timestamp': 1716721802,
+ 'release_date': '20240526',
+ # 'modified_timestamp': 1716884702,
+ # 'modified_date': '20240528',
+ },
+ 'playlist_count': 42,
+ 'skip': 'Gone: Nicht mehr verfügbar',
+ }, {
+ # Video with multiple segments, but with combined version
+ 'url': 'https://on.orf.at/video/14228172',
+ 'info_dict': {
+ 'id': '14228172',
+ 'ext': 'mp4',
+ 'duration': 3294.878,
+ 'thumbnail': 'https://api-tvthek.orf.at/assets/segments/0176/29/thumb_17528242_segments_highlight_teaser.jpg',
+ 'title': 'Willkommen Österreich mit Stermann & Grissemann',
+ 'description': r're:Zum Saisonfinale freuen sich die urlaubsreifen Gastgeber Stermann und .{1863} Geschichten\.$',
+ # 'media_type': 'episode',
+ 'timestamp': 1716926584,
+ 'upload_date': '20240528',
+ 'release_timestamp': 1716919202,
+ 'release_date': '20240528',
+ # 'modified_timestamp': 1716968045,
+ # 'modified_date': '20240529',
+ # '_old_archive_ids': ['orftvthek 14228172'],
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ 'skip': 'Gone: Nicht mehr verfügbar',
+ }]
+
+
+class ORFONLiveIE(ORFONBase):
+ _ENC_PFX = '8876324jshjd7293ktd'
+ _API_PATH = 'livestream'
+ _VALID_URL = r'https?://on\.orf\.at/livestream/(?P<id>\d+)(?:/(?P<segment>\d+))?'
+ _TESTS = [{
+ 'url': 'https://on.orf.at/livestream/14320204/pressekonferenz-neos-zu-aktuellen-entwicklungen',
+ 'info_dict': {
+ 'id': '14320204',
+ 'ext': 'mp4',
+ 'title': 'Pressekonferenz: Neos zu aktuellen Entwicklungen',
+ 'description': r're:(?s)Neos-Chefin Beate Meinl-Reisinger informi.{598}ng\."',
+ 'timestamp': 1716886335,
+ 'upload_date': '20240528',
+ # 'modified_timestamp': 1712756663,
+ # 'modified_date': '20240410',
+ # '_old_archive_ids': ['orftvthek 14210000'],
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }]
+
+ @classmethod
+ def _parse_metadata(cls, api_json):
+ return merge_dicts(
+ super(ORFONLiveIE, cls)._parse_metadata(api_json),
+ traverse_obj(api_json, {
+ 'timestamp': ('updated_at', T(parse_iso8601)),
+ 'release_timestamp': ('start', T(parse_iso8601)),
+ 'is_live': True,
+ }))
diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py
index fb29d83f9..60f7a4d48 100644
--- a/youtube_dl/extractor/palcomp3.py
+++ b/youtube_dl/extractor/palcomp3.py
@@ -8,7 +8,7 @@ from ..compat import compat_str
from ..utils import (
int_or_none,
str_or_none,
- try_get,
+ traverse_obj,
)
@@ -109,7 +109,7 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
}
name'''
- @ classmethod
+ @classmethod
def suitable(cls, url):
return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url)
@@ -118,7 +118,8 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE):
artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist']
def entries():
- for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []):
+ for music in traverse_obj(artist, (
+ 'musics', 'nodes', lambda _, m: m['musicID'])):
yield self._parse_music(music)
return self.playlist_result(
@@ -137,7 +138,7 @@ class PalcoMP3VideoIE(PalcoMP3BaseIE):
'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande',
'description': 'md5:7043342c09a224598e93546e98e49282',
'upload_date': '20161107',
- 'uploader_id': 'maiaramaraisaoficial',
+ 'uploader_id': '@maiaramaraisaoficial',
'uploader': 'Maiara e Maraisa',
}
}]
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index db5ef8b57..b8ac58713 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -47,7 +47,7 @@ class SenateISVPIE(InfoExtractor):
['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
['arch', '', 'http://ussenate-f.akamaihd.net/']
]
- _IE_NAME = 'senate.gov'
+ IE_NAME = 'senate.gov'
_VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
diff --git a/youtube_dl/extractor/vidlii.py b/youtube_dl/extractor/vidlii.py
index f4774256b..47f328e87 100644
--- a/youtube_dl/extractor/vidlii.py
+++ b/youtube_dl/extractor/vidlii.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+
from ..utils import (
float_or_none,
get_element_by_id,
@@ -11,6 +12,7 @@ from ..utils import (
strip_or_none,
unified_strdate,
urljoin,
+ str_to_int,
)
@@ -36,6 +38,26 @@ class VidLiiIE(InfoExtractor):
'tags': ['Vidlii', 'Jan', 'Videogames'],
}
}, {
+ # HD
+ 'url': 'https://www.vidlii.com/watch?v=2Ng8Abj2Fkl',
+ 'md5': '450e7da379c884788c3a4fa02a3ce1a4',
+ 'info_dict': {
+ 'id': '2Ng8Abj2Fkl',
+ 'ext': 'mp4',
+ 'title': 'test',
+ 'description': 'md5:cc55a86032a7b6b3cbfd0f6b155b52e9',
+ 'thumbnail': 'https://www.vidlii.com/usfi/thmp/2Ng8Abj2Fkl.jpg',
+ 'uploader': 'VidLii',
+ 'uploader_url': 'https://www.vidlii.com/user/VidLii',
+ 'upload_date': '20200927',
+ 'duration': 5,
+ 'view_count': int,
+ 'comment_count': int,
+ 'average_rating': float,
+ 'categories': ['Film & Animation'],
+ 'tags': list,
+ },
+ }, {
'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0',
'only_matching': True,
}]
@@ -46,11 +68,32 @@ class VidLiiIE(InfoExtractor):
webpage = self._download_webpage(
'https://www.vidlii.com/watch?v=%s' % video_id, video_id)
- video_url = self._search_regex(
- r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage,
- 'video url', group='url')
+ formats = []
+
+ def add_format(format_url, height=None):
+ height = int(self._search_regex(r'(\d+)\.mp4',
+ format_url, 'height', default=360))
+
+ formats.append({
+ 'url': format_url,
+ 'format_id': '%dp' % height if height else None,
+ 'height': height,
+ })
+
+ sources = re.findall(
+ r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1',
+ webpage)
+
+ formats = []
+ if len(sources) > 1:
+ add_format(sources[1][1])
+ self._check_formats(formats, video_id)
+ if len(sources) > 0:
+ add_format(sources[0][1])
+
+ self._sort_formats(formats)
- title = self._search_regex(
+ title = self._html_search_regex(
(r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage,
'title')
@@ -82,9 +125,9 @@ class VidLiiIE(InfoExtractor):
default=None) or self._search_regex(
r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
- view_count = int_or_none(self._search_regex(
- (r'<strong>(\d+)</strong> views',
- r'Views\s*:\s*<strong>(\d+)</strong>'),
+ view_count = str_to_int(self._html_search_regex(
+ (r'<strong>([\d,.]+)</strong> views',
+ r'Views\s*:\s*<strong>([\d,.]+)</strong>'),
webpage, 'view count', fatal=False))
comment_count = int_or_none(self._search_regex(
@@ -109,7 +152,7 @@ class VidLiiIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'title': title,
'description': description,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index 84969f8e1..8da5b430f 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -106,6 +106,25 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
}, {
'url': 'http://music.yandex.com/album/540508/track/4878838',
'only_matching': True,
+ }, {
+ 'url': 'https://music.yandex.ru/album/16302456/track/85430762',
+ 'md5': '11b8d50ab03b57738deeaadf661a0a48',
+ 'info_dict': {
+ 'id': '85430762',
+ 'ext': 'mp3',
+ 'abr': 128,
+ 'title': 'Haddadi Von Engst, Phonic Youth, Super Flu - Til The End (Super Flu Remix)',
+ 'filesize': int,
+ 'duration': 431.14,
+ 'track': 'Til The End (Super Flu Remix)',
+ 'album': 'Til The End',
+ 'album_artist': 'Haddadi Von Engst, Phonic Youth',
+ 'artist': 'Haddadi Von Engst, Phonic Youth, Super Flu',
+ 'release_year': 2021,
+ 'genre': 'house',
+ 'disc_number': 1,
+ 'track_number': 2,
+ }
}]
def _real_extract(self, url):
@@ -116,10 +135,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
'track', tld, url, track_id, 'Downloading track JSON',
{'track': '%s:%s' % (track_id, album_id)})['track']
track_title = track['title']
+ track_version = track.get('version')
+ if track_version:
+ track_title = '%s (%s)' % (track_title, track_version)
download_data = self._download_json(
'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id),
track_id, 'Downloading track location url JSON',
+ query={'hq': 1},
headers={'X-Retpath-Y': url})
fd_data = self._download_json(
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 90c16e172..b31798729 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -3,11 +3,14 @@
from __future__ import unicode_literals
import collections
+import hashlib
import itertools
import json
import os.path
import random
import re
+import string
+import time
import traceback
from .common import InfoExtractor, SearchInfoExtractor
@@ -24,11 +27,14 @@ from ..compat import (
)
from ..jsinterp import JSInterpreter
from ..utils import (
+ bug_reports_message,
clean_html,
dict_get,
error_to_compat_str,
ExtractorError,
+ filter_dict,
float_or_none,
+ get_first,
extract_attributes,
get_element_by_attribute,
int_or_none,
@@ -43,6 +49,7 @@ from ..utils import (
parse_duration,
parse_qs,
qualities,
+ remove_end,
remove_start,
smuggle_url,
str_or_none,
@@ -60,11 +67,13 @@ from ..utils import (
url_or_none,
urlencode_postdata,
urljoin,
+ variadic,
)
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
+
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
@@ -78,9 +87,66 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
+ _INNERTUBE_CLIENTS = {
+ 'ios': {
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'IOS',
+ 'clientVersion': '20.10.4',
+ 'deviceMake': 'Apple',
+ 'deviceModel': 'iPhone16,2',
+ 'userAgent': 'com.google.ios.youtube/20.10.4 (iPhone16,2; U; CPU iOS 18_3_2 like Mac OS X;)',
+ 'osName': 'iPhone',
+ 'osVersion': '18.3.2.22D82',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 5,
+ 'REQUIRE_JS_PLAYER': False,
+ 'REQUIRE_PO_TOKEN': True,
+ },
+ # mweb has 'ultralow' formats
+ # See: https://github.com/yt-dlp/yt-dlp/pull/557
+ 'mweb': {
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'MWEB',
+ 'clientVersion': '2.20250311.03.00',
+ # mweb previously did not require PO Token with this UA
+ 'userAgent': 'Mozilla/5.0 (iPad; CPU OS 16_7_10 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1,gzip(gfe)',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 2,
+ 'REQUIRE_PO_TOKEN': True,
+ 'SUPPORTS_COOKIES': True,
+ },
+ 'tv': {
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'TVHTML5',
+ 'clientVersion': '7.20250312.16.00',
+ 'userAgent': 'Mozilla/5.0 (ChromiumStylePlatform) Cobalt/Version',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 7,
+ 'SUPPORTS_COOKIES': True,
+ },
+ 'web': {
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'WEB',
+ 'clientVersion': '2.20250312.04.00',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 1,
+ 'REQUIRE_PO_TOKEN': True,
+ 'SUPPORTS_COOKIES': True,
+ },
+ }
+
def _login(self):
"""
Attempt to log in to YouTube.
+
True is returned if successful or skipped.
False is returned if login failed.
@@ -136,7 +202,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
[2, 1, None, 1,
'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
None, [], 4],
- 1, [None, None, []], None, None, None, True
+ 1, [None, None, []], None, None, None, True,
],
username,
]
@@ -158,7 +224,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
None, 1, None, [1, None, None, None, [password, None, True]],
[
None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
- 1, [None, None, []], None, None, None, True
+ 1, [None, None, []], None, None, None, True,
]]
challenge_results = req(
@@ -211,7 +277,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
user_hash, None, 2, None,
[
9, None, None, None, None, None, None, None,
- [None, tfa_code, True, 2]
+ [None, tfa_code, True, 2],
]]
tfa_results = req(
@@ -277,32 +343,57 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if not self._login():
return
- _DEFAULT_API_DATA = {
- 'context': {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': '2.20201021.03.00',
- }
- },
- }
+ _DEFAULT_API_DATA = {'context': _INNERTUBE_CLIENTS['web']['INNERTUBE_CONTEXT']}
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
- def _call_api(self, ep, query, video_id, fatal=True, headers=None):
+ _SAPISID = None
+
+ def _generate_sapisidhash_header(self, origin='https://www.youtube.com'):
+ time_now = round(time.time())
+ if self._SAPISID is None:
+ yt_cookies = self._get_cookies('https://www.youtube.com')
+ # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
+ # See: https://github.com/yt-dlp/yt-dlp/issues/393
+ sapisid_cookie = dict_get(
+ yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
+ if sapisid_cookie and sapisid_cookie.value:
+ self._SAPISID = sapisid_cookie.value
+ self.write_debug('Extracted SAPISID cookie')
+ # also set a SAPISID cookie when only __Secure-3PAPISID exists
+ if not yt_cookies.get('SAPISID'):
+ self.write_debug('Copying __Secure-3PAPISID cookie to SAPISID cookie')
+ self._set_cookie(
+ '.youtube.com', 'SAPISID', self._SAPISID, secure=True, expire_time=time_now + 3600)
+ else:
+ self._SAPISID = False
+ if not self._SAPISID:
+ return None
+ # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
+ sapisidhash = hashlib.sha1(
+ '{0} {1} {2}'.format(time_now, self._SAPISID, origin).encode('utf-8')).hexdigest()
+ return 'SAPISIDHASH {0}_{1}'.format(time_now, sapisidhash)
+
+ def _call_api(self, ep, query, video_id, fatal=True, headers=None,
+ note='Downloading API JSON'):
data = self._DEFAULT_API_DATA.copy()
data.update(query)
real_headers = {'content-type': 'application/json'}
if headers:
real_headers.update(headers)
+ # was: 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+ api_key = self.get_param('youtube_innertube_key')
return self._download_json(
'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
- note='Downloading API JSON', errnote='Unable to download API page',
+ note=note, errnote='Unable to download API page',
data=json.dumps(data).encode('utf8'), fatal=fatal,
- headers=real_headers,
- query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
+ headers=real_headers, query=filter_dict({
+ 'key': api_key,
+ 'prettyPrint': 'false',
+ }))
def _extract_yt_initial_data(self, video_id, webpage):
return self._parse_json(
@@ -311,6 +402,22 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
video_id)
+ def _extract_visitor_data(self, *args):
+ """
+ Extract visitorData from an API response or ytcfg
+
+ Appears to be used to track session state
+ """
+ visitor_data = self.get_param('youtube_visitor_data')
+ if visitor_data:
+ return visitor_data
+
+ return get_first(
+ args, (('VISITOR_DATA',
+ ('INNERTUBE_CONTEXT', 'client', 'visitorData'),
+ ('responseContext', 'visitorData')),
+ T(compat_str)))
+
def _extract_ytcfg(self, video_id, webpage):
return self._parse_json(
self._search_regex(
@@ -350,13 +457,33 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'uploader': uploader,
}
+ @staticmethod
+ def _extract_thumbnails(data, *path_list, **kw_final_key):
+ """
+ Extract thumbnails from thumbnails dict
+ @param path_list: path list to level that contains 'thumbnails' key
+ """
+ final_key = kw_final_key.get('final_key', 'thumbnails')
+
+ return traverse_obj(data, ((
+ tuple(variadic(path) + (final_key, Ellipsis)
+ for path in path_list or [()])), {
+ 'url': ('url', T(url_or_none),
+ # Sometimes youtube gives a wrong thumbnail URL. See:
+ # https://github.com/yt-dlp/yt-dlp/issues/233
+ # https://github.com/ytdl-org/youtube-dl/issues/28023
+ T(lambda u: update_url(u, query=None) if u and 'maxresdefault' in u else u)),
+ 'height': ('height', T(int_or_none)),
+ 'width': ('width', T(int_or_none)),
+ }, T(lambda t: t if t.get('url') else None)))
+
def _search_results(self, query, params):
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20201021.03.00',
- }
+ },
},
'query': query,
}
@@ -364,11 +491,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
data['params'] = params
for page_num in itertools.count(1):
search = self._download_json(
- 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'https://www.youtube.com/youtubei/v1/search',
video_id='query "%s"' % query,
note='Downloading page %s' % page_num,
errnote='Unable to download API page', fatal=False,
data=json.dumps(data).encode('utf8'),
+ query={
+ # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'prettyPrint': 'false',
+ },
headers={'content-type': 'application/json'})
if not search:
break
@@ -433,7 +564,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# (HTML, videodetails, metadata, renderers)
'name': ('content', 'author', (('ownerChannelName', None), 'title'), ['text']),
'url': ('href', 'ownerProfileUrl', 'vanityChannelUrl',
- ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl'])
+ ['navigationEndpoint', 'browseEndpoint', 'canonicalBaseUrl']),
}
if any((videodetails, metadata, renderers)):
result = (
@@ -559,9 +690,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'invidious': '|'.join(_INVIDIOUS_SITES),
}
_PLAYER_INFO_RE = (
- r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
- r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
- r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
+ r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/(?:tv-)?player',
+ r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias(?:_tce)?\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
+ r'\b(?P<id>vfl[a-zA-Z0-9_-]{6,})\b.*?\.js$',
)
_SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
@@ -642,7 +773,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
'uploader': '8KVIDEO',
- 'title': 'UHDTV TEST 8K VIDEO.mp4'
+ 'title': 'UHDTV TEST 8K VIDEO.mp4',
},
'params': {
'youtube_include_dash_manifest': True,
@@ -682,7 +813,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/@theamazingatheist',
'title': 'Burning Everyone\'s Koran',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
- }
+ },
},
# Age-gated videos
{
@@ -810,7 +941,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
'expected_warnings': [
'DASH manifest missing',
- ]
+ ],
},
# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
{
@@ -1454,6 +1585,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
}
+ _PLAYER_JS_VARIANT_MAP = (
+ ('main', 'player_ias.vflset/en_US/base.js'),
+ ('tce', 'player_ias_tce.vflset/en_US/base.js'),
+ ('tv', 'tv-player-ias.vflset/tv-player-ias.js'),
+ ('tv_es6', 'tv-player-es6.vflset/tv-player-es6.js'),
+ ('phone', 'player-plasma-ias-phone-en_US.vflset/base.js'),
+ ('tablet', 'player-plasma-ias-tablet-en_US.vflset/base.js'),
+ )
+
@classmethod
def suitable(cls, url):
if parse_qs(url).get('list', [None])[0]:
@@ -1493,46 +1633,97 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
""" Return a string representation of a signature """
return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
- @classmethod
- def _extract_player_info(cls, player_url):
- for player_re in cls._PLAYER_INFO_RE:
- id_m = re.search(player_re, player_url)
- if id_m:
- break
- else:
- raise ExtractorError('Cannot identify player %r' % player_url)
- return id_m.group('id')
+ def _extract_player_info(self, player_url):
+ try:
+ return self._search_regex(
+ self._PLAYER_INFO_RE, player_url, 'player info', group='id')
+ except ExtractorError as e:
+ raise ExtractorError(
+ 'Cannot identify player %r' % (player_url,), cause=e)
- def _load_player(self, video_id, player_url, fatal=True, player_id=None):
- if not player_id:
+ def _player_js_cache_key(self, player_url, extra_id=None, _cache={}):
+ if player_url not in _cache:
player_id = self._extract_player_info(player_url)
- if player_id not in self._code_cache:
+ player_path = remove_start(
+ compat_urllib_parse.urlparse(player_url).path,
+ '/s/player/{0}/'.format(player_id))
+ variant = next((k for k, v in self._PLAYER_JS_VARIANT_MAP
+ if v == player_path), None)
+ if not variant:
+ variant = next(
+ (k for k, v in self._PLAYER_JS_VARIANT_MAP
+ if re.match(re.escape(v).replace('en_US', r'\w+') + '$', player_path)),
+ None)
+ if not variant:
+ self.write_debug(
+ 'Unable to determine player JS variant\n'
+ ' player = {0}'.format(player_url), only_once=True)
+ variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js'))
+ _cache[player_url] = join_nonempty(player_id, variant)
+
+ if extra_id:
+ extra_id = '-'.join((_cache[player_url], extra_id))
+ assert os.path.basename(extra_id) == extra_id
+ return extra_id
+ return _cache[player_url]
+
+ def _load_player(self, video_id, player_url, fatal=True):
+ player_js_key = self._player_js_cache_key(player_url)
+ if player_js_key not in self._code_cache:
code = self._download_webpage(
player_url, video_id, fatal=fatal,
- note='Downloading player ' + player_id,
- errnote='Download of %s failed' % player_url)
+ note='Downloading player {0}'.format(player_js_key),
+ errnote='Download of {0} failed'.format(player_url))
if code:
- self._code_cache[player_id] = code
- return self._code_cache[player_id] if fatal else self._code_cache.get(player_id)
+ self._code_cache[player_js_key] = code
+ return self._code_cache.get(player_js_key)
+
+ def _load_player_data_from_cache(self, name, player_url, extra_id=None):
+ cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id))
+ data = self._player_cache.get(cache_id)
+ if data:
+ return data
+
+ data = self.cache.load(*cache_id, min_ver='2025.04.07')
+ if data:
+ self._player_cache[cache_id] = data
+ return data
+
+ def _store_player_data_to_cache(self, name, player_url, data, extra_id=None):
+ cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id))
+
+ if cache_id not in self._player_cache:
+ self.cache.store(cache_id[0], cache_id[1], data)
+ self._player_cache[cache_id] = data
+
+ def _remove_player_data_from_cache(self, name, player_url, extra_id=None):
+ cache_id = ('youtube-{0}'.format(name), self._player_js_cache_key(player_url, extra_id))
+
+ if cache_id in self._player_cache:
+ self.cache.clear(*cache_id)
+ self._player_cache.pop(cache_id, None)
def _extract_signature_function(self, video_id, player_url, example_sig):
- player_id = self._extract_player_info(player_url)
+ # player_id no longer needed here: cache keys are derived from player_url
# Read from filesystem cache
- func_id = 'js_{0}_{1}'.format(
- player_id, self._signature_cache_id(example_sig))
- assert os.path.basename(func_id) == func_id
-
- self.write_debug('Extracting signature function {0}'.format(func_id))
- cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None
+ extra_id = self._signature_cache_id(example_sig)
+ self.write_debug('Extracting signature function {0}-{1}'.format(player_url, extra_id))
+ cache_spec, code = self._load_player_data_from_cache(
+ 'sigfuncs', player_url, extra_id=extra_id), None
if not cache_spec:
- code = self._load_player(video_id, player_url, player_id)
- if code:
- res = self._parse_sig_js(code)
- test_string = ''.join(map(compat_chr, range(len(example_sig))))
- cache_spec = [ord(c) for c in res(test_string)]
- self.cache.store('youtube-sigfuncs', func_id, cache_spec)
+ code = self._load_player(video_id, player_url)
+ if code:
+ res = self._parse_sig_js(code)
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
+ cache_spec = [ord(c) for c in res(test_string)]
+ self._store_player_data_to_cache(
+ 'sigfuncs', player_url, cache_spec, extra_id=extra_id)
+ else:
+ self.report_warning(
+ 'Failed to compute signature function {0}-{1}'.format(
+ player_url, extra_id))
return lambda s: ''.join(s[i] for i in cache_spec)
@@ -1578,26 +1769,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
' return %s\n') % (signature_id_tuple, expr_code)
self.to_screen('Extracted signature function:\n' + code)
+ def _extract_sig_fn(self, jsi, funcname):
+ var_ay = self._search_regex(
+ r'''(?x)
+ (?:\*/|\{|\n|^)\s*(?:'[^']+'\s*;\s*)
+ (var\s*[\w$]+\s*=\s*(?:
+ ('|")(?:\\\2|(?!\2).)+\2\s*\.\s*split\(\s*('|")\W+\3\s*\)|
+ \[\s*(?:('|")(?:\\\4|(?!\4).)*\4\s*(?:(?=\])|,\s*))+\]
+ ))(?=\s*[,;])
+ ''', jsi.code, 'useful values', default='')
+
+ sig_fn = jsi.extract_function_code(funcname)
+
+ if var_ay:
+ sig_fn = (sig_fn[0], ';\n'.join((var_ay, sig_fn[1])))
+
+ return sig_fn
+
def _parse_sig_js(self, jscode):
+ # Examples where `sig` is funcname:
+ # sig=function(a){a=a.split(""); ... ;return a.join("")};
+ # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a};
+ # {var l=f,m=h.sp,n=sig(decodeURIComponent(h.s));l.set(m,encodeURIComponent(n))}
+ # sig=function(J){J=J.split(""); ... ;return J.join("")};
+ # ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J};
+ # {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))}
funcname = self._search_regex(
- (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
- r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?',
- r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+ (r'\b(?P<var>[\w$]+)&&\((?P=var)=(?P<sig>[\w$]{2,})\(decodeURIComponent\((?P=var)\)\)',
+ r'(?P<sig>[\w$]+)\s*=\s*function\(\s*(?P<arg>[\w$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)',
+ r'(?:\b|[^\w$])(?P<sig>[\w$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[\w$]{2}\.[\w$]{2}\(a,\d+\))?',
+ # Old patterns
+ r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[\w$]+)\(',
+ r'\b[\w]+\s*&&\s*[\w]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[\w$]+)\(',
+ r'\bm=(?P<sig>[\w$]{2,})\(decodeURIComponent\(h\.s\)\)',
# Obsolete patterns
- r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
- r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
+ r'("|\')signature\1\s*,\s*(?P<sig>[\w$]+)\(',
+ r'\.sig\|\|(?P<sig>[\w$]+)\(',
+ r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[\w$]+)\(',
+ r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[\w$]+)\(',
+ r'\bc\s*&&\s*[\w]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[\w$]+)\('),
jscode, 'Initial JS player signature function name', group='sig')
jsi = JSInterpreter(jscode)
- initial_function = jsi.extract_function(funcname)
- return lambda s: initial_function([s])
+
+ initial_function = self._extract_sig_fn(jsi, funcname)
+
+ func = jsi.extract_function_from_code(*initial_function)
+
+ return lambda s: func([s])
def _cached(self, func, *cache_id):
def inner(*args, **kwargs):
@@ -1636,7 +1855,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
try:
jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url)
except ExtractorError as e:
- raise ExtractorError('Unable to extract nsig jsi, player_id, func_codefunction code', cause=e)
+ raise ExtractorError('Unable to extract nsig function code', cause=e)
if self.get_param('youtube_print_sig_code'):
self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(
player_id, func_code[1]))
@@ -1647,7 +1866,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
except JSInterpreter.Exception as e:
self.report_warning(
'%s (%s %s)' % (
- 'Unable to decode n-parameter: download likely to be throttled',
+ 'Unable to decode n-parameter: expect download to be blocked or throttled',
error_to_compat_str(e),
traceback.format_exc()),
video_id=video_id)
@@ -1657,41 +1876,103 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return ret
def _extract_n_function_name(self, jscode):
+ func_name, idx = None, None
+
+ def generic_n_function_search(func_name=None):
+ return self._search_regex(
+ r'''(?xs)
+ (?:(?<=[^\w$])|^) # instead of \b, which ignores $
+ (?P<name>%s)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\)
+ \s*\{(?:(?!};).)+?(?:
+ ["']enhanced_except_ |
+ return\s*(?P<q>"|')[a-zA-Z\d-]+_w8_(?P=q)\s*\+\s*[\w$]+
+ )
+ ''' % (func_name or r'(?!\d)[a-zA-Z\d_$]+',), jscode,
+ 'Initial JS player n function name', group='name',
+ default=None if func_name else NO_DEFAULT)
+
+ # these special cases are redundant and probably obsolete (2025-04):
+ # they make the tests run ~10% faster without fallback warnings
+ r"""
func_name, idx = self._search_regex(
- r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?\([\w$]+\)',
- jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
+ # (y=NuD(),Mw(k),q=k.Z[y]||null)&&(q=narray[idx](q),k.set(y,q),k.V||NuD(''))}};
+ # (R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}};
+ # or: (b=String.fromCharCode(110),c=a.get(b))&&c=narray[idx](c)
+ # or: (b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c)
+ # or: (PL(a),b=a.j.n||null)&&(b=narray[idx](b)
+ # or: (b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("")
+ # old: (b=a.get("n"))&&(b=narray[idx](b)(?P<c>[a-z])\s*=\s*[a-z]\s*
+ # older: (b=a.get("n"))&&(b=nfunc(b)
+ r'''(?x)
+ # (expr, ...,
+ \((?:(?:\s*[\w$]+\s*=)?(?:[\w$"+\.\s(\[]+(?:[)\]]\s*)?),)*
+ # b=...
+ (?P<b>[\w$]+)\s*=\s*(?!(?P=b)[^\w$])[\w$]+\s*(?:(?:
+ \.\s*[\w$]+ |
+ \[\s*[\w$]+\s*\] |
+ \.\s*get\s*\(\s*[\w$"]+\s*\)
+ )\s*){,2}(?:\s*\|\|\s*null(?=\s*\)))?\s*
+ \)\s*&&\s*\( # ...)&&(
+ # b = nfunc, b = narray[idx]
+ (?P=b)\s*=\s*(?P<nfunc>[\w$]+)\s*
+ (?:\[\s*(?P<idx>[\w$]+)\s*\]\s*)?
+ # (...)
+ \(\s*[\w$]+\s*\)
+ ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'),
+ default=(None, None))
+ """
+
+ if not func_name:
+ # nfunc=function(x){...}|function nfunc(x); ...
+ # ... var y=[nfunc]|y[idx]=nfunc);
+ # obvious REs hang, so use a two-stage tactic
+ for m in re.finditer(r'''(?x)
+ [\n;]var\s(?:(?:(?!,).)+,|\s)*?(?!\d)[\w$]+(?:\[(?P<idx>\d+)\])?\s*=\s*
+ (?(idx)|\[\s*)(?P<nfunc>(?!\d)[\w$]+)(?(idx)|\s*\])
+ \s*?[;\n]
+ ''', jscode):
+ fn = self._search_regex(
+ r'[;,]\s*(function\s+)?({0})(?(1)|\s*=\s*function)\s*\((?!\d)[\w$]+\)\s*\{1}(?!\s*return\s)'.format(
+ re.escape(m.group('nfunc')), '{'),
+ jscode, 'Initial JS player n function name (2)', group=2, default=None)
+ if fn:
+ func_name = fn
+ idx = m.group('idx')
+ if generic_n_function_search(func_name):
+ # don't look any further
+ break
+
+ # thx bashonly: yt-dlp/yt-dlp/pull/10611
+ if not func_name:
+ self.report_warning('Falling back to generic n function search', only_once=True)
+ return generic_n_function_search()
+
if not idx:
return func_name
- return self._parse_json(self._search_regex(
- r'var {0}\s*=\s*(\[.+?\])\s*[,;]'.format(re.escape(func_name)), jscode,
- 'Initial JS player n function list ({0}.{1})'.format(func_name, idx)),
- func_name, transform_source=js_to_json)[int(idx)]
+ return self._search_json(
+ r'(?<![\w-])var\s(?:(?:(?!,).)+,|\s)*?{0}\s*='.format(re.escape(func_name)), jscode,
+ 'Initial JS player n function list ({0}.{1})'.format(func_name, idx),
+ func_name, contains_pattern=r'\[.+\]', end_pattern='[,;]',
+ transform_source=js_to_json)[int(idx)]
def _extract_n_function_code(self, video_id, player_url):
player_id = self._extract_player_info(player_url)
- func_code = self.cache.load('youtube-nsig', player_id)
+ func_code = self._load_player_data_from_cache('nsig', player_url)
jscode = func_code or self._load_player(video_id, player_url)
jsi = JSInterpreter(jscode)
if func_code:
return jsi, player_id, func_code
- func_name = self._extract_n_function_name(jscode)
+ return self._extract_n_function_code_jsi(video_id, jsi, player_id, player_url)
- # For redundancy
- func_code = self._search_regex(
- r'''(?xs)%s\s*=\s*function\s*\((?P<var>[\w$]+)\)\s*
- # NB: The end of the regex is intentionally kept strict
- {(?P<code>.+?}\s*return\ [\w$]+.join\(""\))};''' % func_name,
- jscode, 'nsig function', group=('var', 'code'), default=None)
- if func_code:
- func_code = ([func_code[0]], func_code[1])
- else:
- self.write_debug('Extracting nsig function with jsinterp')
- func_code = jsi.extract_function_code(func_name)
+ def _extract_n_function_code_jsi(self, video_id, jsi, player_id=None, player_url=None):
+ func_name = self._extract_n_function_name(jsi.code)
- self.cache.store('youtube-nsig', player_id, func_code)
+ func_code = self._extract_sig_fn(jsi, func_name)
+ if player_url:
+ self._store_player_data_to_cache('nsig', player_url, func_code)
return jsi, player_id, func_code
def _extract_n_function_from_code(self, jsi, func_code):
@@ -1699,13 +1980,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def extract_nsig(s):
try:
- ret = func([s])
+ ret = func([s], kwargs={'_ytdl_do_not_return': s})
except JSInterpreter.Exception:
raise
except Exception as e:
raise JSInterpreter.Exception(traceback.format_exc(), cause=e)
- if ret.startswith('enhanced_except_'):
+ if ret.startswith('enhanced_except_') or ret.endswith(s):
raise JSInterpreter.Exception('Signature function returned an exception')
return ret
@@ -1724,7 +2005,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
n_param = n_param[-1]
n_response = decrypt_nsig(n_param)(n_param, video_id, player_url)
if n_response is None:
- # give up if descrambling failed
+ # give up and forget cached data if descrambling failed
+ self._remove_player_data_from_cache('nsig', player_url)
break
fmt['url'] = update_url_query(fmt['url'], {'n': n_response})
@@ -1735,18 +2017,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
Required to tell API what sig/player version is in use.
"""
sts = traverse_obj(ytcfg, 'STS', expected_type=int)
- if not sts:
- # Attempt to extract from player
- if player_url is None:
- error_msg = 'Cannot extract signature timestamp without player_url.'
- if fatal:
- raise ExtractorError(error_msg)
- self.report_warning(error_msg)
- return
- code = self._load_player(video_id, player_url, fatal=fatal)
- sts = int_or_none(self._search_regex(
- r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '',
- 'JS player signature timestamp', group='sts', fatal=fatal))
+ if sts:
+ return sts
+
+ if not player_url:
+ error_msg = 'Cannot extract signature timestamp without player url'
+ if fatal:
+ raise ExtractorError(error_msg)
+ self.report_warning(error_msg)
+ return None
+
+ sts = self._load_player_data_from_cache('sts', player_url)
+ if sts:
+ return sts
+
+ # Attempt to extract from player
+ code = self._load_player(video_id, player_url, fatal=fatal)
+ sts = int_or_none(self._search_regex(
+ r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code or '',
+ 'JS player signature timestamp', group='sts', fatal=fatal))
+ if sts:
+ self._store_player_data_to_cache('sts', player_url, sts)
+
return sts
def _mark_watched(self, video_id, player_response):
@@ -1758,8 +2050,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# cpn generation algorithm is reverse engineered from base.js.
# In fact it works even with dummy cpn.
- CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
- cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))
+ CPN_ALPHABET = string.ascii_letters + string.digits + '-_'
+ cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16))
# more consistent results setting it to right before the end
qs = parse_qs(playback_url)
@@ -1819,8 +2111,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
if mobj is None:
raise ExtractorError('Invalid URL: %s' % url)
- video_id = mobj.group(2)
- return video_id
+ return mobj.group(2)
def _extract_chapters_from_json(self, data, video_id, duration):
chapters_list = try_get(
@@ -1881,9 +2172,89 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_response = self._extract_yt_initial_variable(
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
video_id, 'initial player response')
- if not player_response:
+ is_live = traverse_obj(player_response, ('videoDetails', 'isLive'))
+
+ if False and not player_response:
player_response = self._call_api(
'player', {'videoId': video_id}, video_id)
+ if True or not player_response:
+ origin = 'https://www.youtube.com'
+ pb_context = {'html5Preference': 'HTML5_PREF_WANTS'}
+
+ player_url = self._extract_player_url(webpage)
+ ytcfg = self._extract_ytcfg(video_id, webpage or '')
+ sts = self._extract_signature_timestamp(video_id, player_url, ytcfg)
+ if sts:
+ pb_context['signatureTimestamp'] = sts
+
+ client_names = traverse_obj(self._INNERTUBE_CLIENTS, (
+ T(dict.items), lambda _, k_v: not k_v[1].get('REQUIRE_PO_TOKEN'),
+ 0))[:1]
+ if 'web' not in client_names:
+ # webpage links won't download: ignore links and playability
+ player_response = filter_dict(
+ player_response or {},
+ lambda k, _: k not in ('streamingData', 'playabilityStatus'))
+
+ if is_live and 'ios' not in client_names:
+ client_names.append('ios')
+
+ headers = {
+ 'Sec-Fetch-Mode': 'navigate',
+ 'Origin': origin,
+ 'X-Goog-Visitor-Id': self._extract_visitor_data(ytcfg) or '',
+ }
+ auth = self._generate_sapisidhash_header(origin)
+ if auth is not None:
+ headers['Authorization'] = auth
+ headers['X-Origin'] = origin
+
+ for client in traverse_obj(self._INNERTUBE_CLIENTS, (client_names, T(dict))):
+
+ query = {
+ 'playbackContext': {
+ 'contentPlaybackContext': pb_context,
+ },
+ 'contentCheckOk': True,
+ 'racyCheckOk': True,
+ 'context': {
+ 'client': merge_dicts(
+ traverse_obj(client, ('INNERTUBE_CONTEXT', 'client')), {
+ 'hl': 'en',
+ 'timeZone': 'UTC',
+ 'utcOffsetMinutes': 0,
+ }),
+ },
+ 'videoId': video_id,
+ }
+
+ api_headers = merge_dicts(headers, traverse_obj(client, {
+ 'X-YouTube-Client-Name': 'INNERTUBE_CONTEXT_CLIENT_NAME',
+ 'X-YouTube-Client-Version': (
+ 'INNERTUBE_CONTEXT', 'client', 'clientVersion'),
+ 'User-Agent': (
+ 'INNERTUBE_CONTEXT', 'client', 'userAgent'),
+ }))
+
+ api_player_response = self._call_api(
+ 'player', query, video_id, fatal=False, headers=api_headers,
+ note=join_nonempty(
+ 'Downloading', traverse_obj(query, (
+ 'context', 'client', 'clientName')),
+ 'API JSON', delim=' '))
+
+ hls = traverse_obj(
+ (player_response, api_player_response),
+ (Ellipsis, 'streamingData', 'hlsManifestUrl', T(url_or_none)))
+ if len(hls) == 2 and not hls[0] and hls[1]:
+ player_response['streamingData']['hlsManifestUrl'] = hls[1]
+ else:
+ video_details = merge_dicts(*traverse_obj(
+ (player_response, api_player_response),
+ (Ellipsis, 'videoDetails', T(dict))))
+ player_response.update(filter_dict(
+ api_player_response or {}, cndn=lambda k, _: k != 'captions'))
+ player_response['videoDetails'] = video_details
def is_agegated(playability):
if not isinstance(playability, dict):
@@ -1932,7 +2303,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
headers = {
'X-YouTube-Client-Name': '85',
'X-YouTube-Client-Version': '2.0',
- 'Origin': 'https://www.youtube.com'
+ 'Origin': 'https://www.youtube.com',
}
video_info = self._call_api('player', query, video_id, fatal=False, headers=headers)
@@ -1961,8 +2332,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
search_meta = (
- lambda x: self._html_search_meta(x, webpage, default=None)) \
- if webpage else lambda x: None
+ (lambda x: self._html_search_meta(x, webpage, default=None))
+ if webpage else lambda _: None)
video_details = player_response.get('videoDetails') or {}
microformat = try_get(
@@ -2026,6 +2397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
itag_qualities = {}
q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
CHUNK_SIZE = 10 << 20
+ is_live = video_details.get('isLive')
streaming_data = player_response.get('streamingData') or {}
streaming_formats = streaming_data.get('formats') or []
@@ -2034,7 +2406,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def build_fragments(f):
return LazyList({
'url': update_url_query(f['url'], {
- 'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize']))
+ 'range': '{0}-{1}'.format(range_start, min(range_start + CHUNK_SIZE - 1, f['filesize'])),
})
} for range_start in range(0, f['filesize'], CHUNK_SIZE))
@@ -2133,7 +2505,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'protocol': 'http_dash_segments',
'fragments': build_fragments(dct),
} if dct['filesize'] else {
- 'downloader_options': {'http_chunk_size': CHUNK_SIZE} # No longer useful?
+ 'downloader_options': {'http_chunk_size': CHUNK_SIZE}, # No longer useful?
})
formats.append(dct)
@@ -2170,7 +2542,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
hls_manifest_url = streaming_data.get('hlsManifestUrl')
if hls_manifest_url:
for f in self._extract_m3u8_formats(
- hls_manifest_url, video_id, 'mp4', fatal=False):
+ hls_manifest_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', live=is_live, fatal=False):
if process_manifest_format(
f, 'hls', None, self._search_regex(
r'/itag/(\d+)', f['url'], 'itag', default=None)):
@@ -2190,12 +2563,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
formats.append(f)
playable_formats = [f for f in formats if not f.get('has_drm')]
- if formats and not playable_formats:
- # If there are no formats that definitely don't have DRM, all have DRM
- self.report_drm(video_id)
- formats[:] = playable_formats
-
- if not formats:
+ if formats:
+ if not playable_formats:
+ # If there are no formats that definitely don't have DRM, all have DRM
+ self.report_drm(video_id)
+ formats[:] = playable_formats
+ else:
if streaming_data.get('licenseInfos'):
raise ExtractorError(
'This video is DRM protected.', expected=True)
@@ -2276,8 +2649,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Strictly de-prioritize damaged formats
f['preference'] = -10
- is_live = video_details.get('isLive')
-
owner_profile_url = self._yt_urljoin(self._extract_author_var(
webpage, 'url', videodetails=video_details, metadata=microformat))
@@ -2311,9 +2682,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'is_live': is_live,
}
- pctr = try_get(
- player_response,
- lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
+ pctr = traverse_obj(
+ (player_response, api_player_response),
+ (Ellipsis, 'captions', 'playerCaptionsTracklistRenderer', T(dict)))
if pctr:
def process_language(container, base_url, lang_code, query):
lang_subs = []
@@ -2327,31 +2698,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
})
container[lang_code] = lang_subs
- subtitles = {}
- for caption_track in (pctr.get('captionTracks') or []):
- base_url = caption_track.get('baseUrl')
- if not base_url:
- continue
- if caption_track.get('kind') != 'asr':
- lang_code = caption_track.get('languageCode')
- if not lang_code:
+ def process_subtitles():
+ subtitles = {}
+ for caption_track in traverse_obj(pctr, (
+ Ellipsis, 'captionTracks', lambda _, v: (
+ v.get('baseUrl') and v.get('languageCode')))):
+ base_url = self._yt_urljoin(caption_track['baseUrl'])
+ if not base_url:
continue
- process_language(
- subtitles, base_url, lang_code, {})
- continue
- automatic_captions = {}
- for translation_language in (pctr.get('translationLanguages') or []):
- translation_language_code = translation_language.get('languageCode')
- if not translation_language_code:
+ lang_code = caption_track['languageCode']
+ if caption_track.get('kind') != 'asr':
+ process_language(
+ subtitles, base_url, lang_code, {})
continue
+ automatic_captions = {}
process_language(
- automatic_captions, base_url, translation_language_code,
- {'tlang': translation_language_code})
- info['automatic_captions'] = automatic_captions
- info['subtitles'] = subtitles
+ automatic_captions, base_url, lang_code, {})
+ for translation_language in traverse_obj(pctr, (
+ Ellipsis, 'translationLanguages', lambda _, v: v.get('languageCode'))):
+ translation_language_code = translation_language['languageCode']
+ process_language(
+ automatic_captions, base_url, translation_language_code,
+ {'tlang': translation_language_code})
+ info['automatic_captions'] = automatic_captions
+ info['subtitles'] = subtitles
+
+ process_subtitles()
parsed_url = compat_urllib_parse_urlparse(url)
- for component in [parsed_url.fragment, parsed_url.query]:
+ for component in (parsed_url.fragment, parsed_url.query):
query = compat_parse_qs(component)
for k, v in query.items():
for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
@@ -2581,7 +2956,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'title': 'Super Cooper Shorts - Shorts',
'uploader': 'Super Cooper Shorts',
'uploader_id': '@SuperCooperShorts',
- }
+ },
}, {
# Channel that does not have a Shorts tab. Test should just download videos on Home tab instead
'url': 'https://www.youtube.com/@emergencyawesome/shorts',
@@ -2635,7 +3010,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'description': 'md5:609399d937ea957b0f53cbffb747a14c',
'uploader': 'ThirstForScience',
'uploader_id': '@ThirstForScience',
- }
+ },
}, {
'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
'only_matching': True,
@@ -2934,7 +3309,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'uploader': '3Blue1Brown',
'uploader_id': '@3blue1brown',
'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
- }
+ },
}]
@classmethod
@@ -2959,8 +3334,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
expected_type=txt_or_none)
def _grid_entries(self, grid_renderer):
- for item in grid_renderer['items']:
- if not isinstance(item, dict):
+ for item in traverse_obj(grid_renderer, ('items', Ellipsis, T(dict))):
+ lockup_view_model = traverse_obj(item, ('lockupViewModel', T(dict)))
+ if lockup_view_model:
+ entry = self._extract_lockup_view_model(lockup_view_model)
+ if entry:
+ yield entry
continue
renderer = self._extract_grid_item_renderer(item)
if not isinstance(renderer, dict):
@@ -3044,6 +3423,39 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
continue
yield self._extract_video(renderer)
+ def _extract_lockup_view_model(self, view_model):
+ content_id = view_model.get('contentId')
+ if not content_id:
+ return
+ content_type = view_model.get('contentType')
+ if content_type not in ('LOCKUP_CONTENT_TYPE_PLAYLIST', 'LOCKUP_CONTENT_TYPE_PODCAST'):
+ self.report_warning(
+ 'Unsupported lockup view model content type "{0}"{1}'.format(content_type, bug_reports_message()), only_once=True)
+ return
+ return merge_dicts(self.url_result(
+ update_url_query('https://www.youtube.com/playlist', {'list': content_id}),
+ ie=YoutubeTabIE.ie_key(), video_id=content_id), {
+ 'title': traverse_obj(view_model, (
+ 'metadata', 'lockupMetadataViewModel', 'title', 'content', T(compat_str))),
+ 'thumbnails': self._extract_thumbnails(view_model, (
+ 'contentImage', 'collectionThumbnailViewModel', 'primaryThumbnail',
+ 'thumbnailViewModel', 'image'), final_key='sources'),
+ })
+
+ def _extract_shorts_lockup_view_model(self, view_model):
+ content_id = traverse_obj(view_model, (
+ 'onTap', 'innertubeCommand', 'reelWatchEndpoint', 'videoId',
+ T(lambda v: v if YoutubeIE.suitable(v) else None)))
+ if not content_id:
+ return
+ return merge_dicts(self.url_result(
+ content_id, ie=YoutubeIE.ie_key(), video_id=content_id), {
+ 'title': traverse_obj(view_model, (
+ 'overlayMetadata', 'primaryText', 'content', T(compat_str))),
+ 'thumbnails': self._extract_thumbnails(
+ view_model, 'thumbnail', final_key='sources'),
+ })
+
def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId')
if video_id:
@@ -3090,10 +3502,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
yield entry
def _rich_grid_entries(self, contents):
- for content in contents:
- content = traverse_obj(
- content, ('richItemRenderer', 'content'),
- expected_type=dict) or {}
+ for content in traverse_obj(
+ contents, (Ellipsis, 'richItemRenderer', 'content'),
+ expected_type=dict):
video_renderer = traverse_obj(
content, 'videoRenderer', 'reelItemRenderer',
expected_type=dict)
@@ -3101,6 +3512,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
entry = self._video_entry(video_renderer)
if entry:
yield entry
+ # shorts item
+ shorts_lockup_view_model = content.get('shortsLockupViewModel')
+ if shorts_lockup_view_model:
+ entry = self._extract_shorts_lockup_view_model(shorts_lockup_view_model)
+ if entry:
+ yield entry
# playlist
renderer = traverse_obj(
content, 'playlistRenderer', expected_type=dict) or {}
@@ -3139,23 +3556,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
next_continuation = cls._extract_next_continuation_data(renderer)
if next_continuation:
return next_continuation
- contents = []
- for key in ('contents', 'items'):
- contents.extend(try_get(renderer, lambda x: x[key], list) or [])
- for content in contents:
- if not isinstance(content, dict):
- continue
- continuation_ep = try_get(
- content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
- dict)
- if not continuation_ep:
- continue
- continuation = try_get(
- continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
+ for command in traverse_obj(renderer, (
+ ('contents', 'items', 'rows'), Ellipsis, 'continuationItemRenderer',
+ ('continuationEndpoint', ('button', 'buttonRenderer', 'command')),
+ (('commandExecutorCommand', 'commands', Ellipsis), None), T(dict))):
+ continuation = traverse_obj(command, ('continuationCommand', 'token', T(compat_str)))
if not continuation:
continue
- ctp = continuation_ep.get('clickTrackingParams')
- return YoutubeTabIE._build_continuation_query(continuation, ctp)
+ ctp = command.get('clickTrackingParams')
+ return cls._build_continuation_query(continuation, ctp)
def _entries(self, tab, item_id, webpage):
tab_content = try_get(tab, lambda x: x['content'], dict)
@@ -3204,6 +3613,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
entry = self._video_entry(renderer)
if entry:
yield entry
+ renderer = isr_content.get('richGridRenderer')
+ if renderer:
+ for from_ in self._rich_grid_entries(
+ traverse_obj(renderer, ('contents', Ellipsis, T(dict)))):
+ yield from_
+ continuation = self._extract_continuation(renderer)
+ continue
if not continuation:
continuation = self._extract_continuation(is_renderer)
@@ -3213,8 +3629,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
rich_grid_renderer = tab_content.get('richGridRenderer')
if not rich_grid_renderer:
return
- for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []):
- yield entry
+ for from_ in self._rich_grid_entries(
+ traverse_obj(rich_grid_renderer, ('contents', Ellipsis, T(dict)))):
+ yield from_
continuation = self._extract_continuation(rich_grid_renderer)
@@ -3232,7 +3649,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'client': {
'clientName': 'WEB',
'clientVersion': client_version,
- }
+ },
}
visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
@@ -3248,10 +3665,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if not continuation:
break
if visitor_data:
- headers['x-goog-visitor-id'] = visitor_data
+ headers['X-Goog-Visitor-Id'] = visitor_data
data['continuation'] = continuation['continuation']
data['clickTracking'] = {
- 'clickTrackingParams': continuation['itct']
+ 'clickTrackingParams': continuation['itct'],
}
count = 0
retries = 3
@@ -3260,8 +3677,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
response = self._download_json(
- 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'https://www.youtube.com/youtubei/v1/browse',
None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''),
+ query={
+ # 'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'prettyPrint': 'false',
+ },
headers=headers, data=json.dumps(data).encode('utf8'))
break
except ExtractorError as e:
@@ -3430,10 +3851,23 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
def _real_extract(self, url):
item_id = self._match_id(url)
url = update_url(url, netloc='www.youtube.com')
- # Handle both video/playlist URLs
qs = parse_qs(url)
- video_id = qs.get('v', [None])[0]
- playlist_id = qs.get('list', [None])[0]
+
+ def qs_get(key, default=None):
+ return qs.get(key, [default])[-1]
+
+ # Go around for /feeds/videos.xml?playlist_id={pl_id}
+ if item_id == 'feeds' and '/feeds/videos.xml?' in url:
+ playlist_id = qs_get('playlist_id')
+ if playlist_id:
+ return self.url_result(
+ update_url_query('https://www.youtube.com/playlist', {
+ 'list': playlist_id,
+ }), ie=self.ie_key(), video_id=playlist_id)
+
+ # Handle both video/playlist URLs
+ video_id = qs_get('v')
+ playlist_id = qs_get('list')
if video_id and playlist_id:
if self._downloader.params.get('noplaylist'):
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
@@ -3510,7 +3944,7 @@ class YoutubePlaylistIE(InfoExtractor):
'uploader': 'milan',
'uploader_id': '@milan5503',
'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
- }
+ },
}, {
'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
'playlist_mincount': 455,
@@ -3520,7 +3954,7 @@ class YoutubePlaylistIE(InfoExtractor):
'uploader': 'LBK',
'uploader_id': '@music_king',
'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA',
- }
+ },
}, {
'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True,
@@ -3631,7 +4065,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
'info_dict': {
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
- }
+ },
}]
def _get_n_results(self, query, n):
@@ -3651,7 +4085,7 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
'info_dict': {
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
- }
+ },
}]
@@ -3666,7 +4100,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
},
- 'params': {'playlistend': 5}
+ 'params': {'playlistend': 5},
}, {
'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
'only_matching': True,
@@ -3682,6 +4116,7 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
"""
Base class for feed extractors
+
Subclasses must define the _FEED_NAME property.
"""
_LOGIN_REQUIRED = True