about summary refs log tree commit diff
path: root/youtube_dl/extractor/youtube.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r-- youtube_dl/extractor/youtube.py | 232
1 files changed, 183 insertions, 49 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index d7eda7aa7..4aac2cc03 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -20,13 +20,13 @@ from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlparse,
- compat_urllib_request,
compat_urlparse,
compat_str,
)
from ..utils import (
clean_html,
encode_dict,
+ error_to_compat_str,
ExtractorError,
float_or_none,
get_element_by_attribute,
@@ -34,7 +34,9 @@ from ..utils import (
int_or_none,
orderedSet,
parse_duration,
+ remove_quotes,
remove_start,
+ sanitized_Request,
smuggle_url,
str_to_int,
unescapeHTML,
@@ -114,7 +116,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
- req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+ req = sanitized_Request(self._LOGIN_URL, login_data)
login_results = self._download_webpage(
req, None,
note='Logging in', errnote='unable to log in', fatal=False)
@@ -147,7 +149,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
- tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
+ tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
tfa_results = self._download_webpage(
tfa_req, None,
note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
@@ -178,15 +180,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return
-class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
- # Extract the video ids from the playlist pages
+class YoutubeEntryListBaseInfoExtractor(InfoExtractor):
+ # Extract entries from page with "Load more" button
def _entries(self, page, playlist_id):
more_widget_html = content_html = page
for page_num in itertools.count(1):
- for video_id, video_title in self.extract_videos_from_page(content_html):
- yield self.url_result(
- video_id, 'Youtube', video_id=video_id,
- video_title=video_title)
+ for entry in self._process_page(content_html):
+ yield entry
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
if not mobj:
@@ -203,6 +203,12 @@ class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
break
more_widget_html = more['load_more_widget_html']
+
+class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ def _process_page(self, content):
+ for video_id, video_title in self.extract_videos_from_page(content):
+ yield self.url_result(video_id, 'Youtube', video_id, video_title)
+
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
@@ -224,6 +230,19 @@ class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
return zip(ids_in_page, titles_in_page)
+class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ def _process_page(self, content):
+ for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content):
+ yield self.url_result(
+ 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+ title = self._og_search_title(webpage, fatal=False)
+ return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+
+
class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^
@@ -241,7 +260,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
- (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
+ (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
v=
)
))
@@ -329,6 +348,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
@@ -377,12 +397,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
+ 'alt_title': 'I Love It (feat. Charli XCX)',
'description': 'md5:782e8651347686cba06e58f71ab51773',
'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
+ 'creator': 'Icona Pop',
}
},
{
@@ -393,9 +415,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20130703',
'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
+ 'alt_title': 'Tunnel Vision',
'description': 'md5:64249768eec3bc4276236606ea996373',
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
+ 'creator': 'Justin Timberlake',
'age_limit': 18,
}
},
@@ -409,7 +433,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
'uploader': 'SET India',
- 'uploader_id': 'setindia'
+ 'uploader_id': 'setindia',
+ 'age_limit': 18,
}
},
{
@@ -473,10 +498,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'nfWlot6h_JM',
'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off',
+ 'alt_title': 'Shake It Off',
'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
+ 'creator': 'Taylor Swift',
},
'params': {
'youtube_include_dash_manifest': True,
@@ -532,9 +559,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20100430',
'uploader_id': 'deadmau5',
+ 'creator': 'deadmau5',
'description': 'md5:12c56784b8032162bb936a5f76d55360',
'uploader': 'deadmau5',
'title': 'Deadmau5 - Some Chords (HD)',
+ 'alt_title': 'Some Chords',
},
'expected_warnings': [
'DASH manifest missing',
@@ -546,7 +575,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'info_dict': {
'id': 'lqQg6PlCWgI',
'ext': 'mp4',
- 'upload_date': '20120724',
+ 'upload_date': '20150827',
'uploader_id': 'olympic',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympics',
@@ -674,6 +703,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
{
'url': 'http://vid.plus/FlRa-iH7PGw',
'only_matching': True,
+ },
+ {
+ # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
+ 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
+ 'info_dict': {
+ 'id': 'lsguqyKfVQg',
+ 'ext': 'mp4',
+ 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
+ 'alt_title': 'Dark Walk',
+ 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
+ 'upload_date': '20151119',
+ 'uploader_id': 'IronSoulElf',
+ 'uploader': 'IronSoulElf',
+ 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
+ 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
+ 'only_matching': True,
+ },
+ {
+ # Video with yt:stretch=17:0
+ 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
+ 'info_dict': {
+ 'id': 'Q39EVAstoRM',
+ 'ext': 'mp4',
+ 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
+ 'description': 'md5:ee18a25c350637c8faff806845bddee9',
+ 'upload_date': '20151107',
+ 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
+ 'uploader': 'CH GAMER DROID',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
+ 'only_matching': True,
}
]
@@ -703,7 +775,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -832,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
video_id, note=False)
except ExtractorError as err:
- self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
+ self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
return {}
sub_lang_list = {}
@@ -858,16 +930,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return {}
return sub_lang_list
+ def _get_ytplayer_config(self, video_id, webpage):
+ patterns = (
+ # User data may contain arbitrary character sequences that may affect
+ # JSON extraction with regex, e.g. when '};' is contained the second
+ # regex won't capture the whole JSON. Yet working around by trying more
+ # concrete regex first keeping in mind proper quoted string handling
+ # to be implemented in future that will replace this workaround (see
+ # https://github.com/rg3/youtube-dl/issues/7468,
+ # https://github.com/rg3/youtube-dl/pull/7599)
+ r';ytplayer\.config\s*=\s*({.+?});ytplayer',
+ r';ytplayer\.config\s*=\s*({.+?});',
+ )
+ config = self._search_regex(
+ patterns, webpage, 'ytplayer.config', default=None)
+ if config:
+ return self._parse_json(
+ uppercase_escape(config), video_id, fatal=False)
+
def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
self.to_screen('%s: Looking for automatic captions' % video_id)
- mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+ player_config = self._get_ytplayer_config(video_id, webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id
- if mobj is None:
+ if not player_config:
self._downloader.report_warning(err_msg)
return {}
- player_config = json.loads(mobj.group(1))
try:
args = player_config['args']
caption_url = args['ttsurl']
@@ -1074,10 +1163,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
age_gate = False
video_info = None
# Try looking directly into the video webpage
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if mobj:
- json_code = uppercase_escape(mobj.group(1))
- ytplayer_config = json.loads(json_code)
+ ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
+ if ytplayer_config:
args = ytplayer_config['args']
if args.get('url_encoded_fmt_stream_map'):
# Convert to the same format returned by compat_parse_qs
@@ -1107,6 +1194,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not video_info:
video_info = get_video_info
if 'token' in get_video_info:
+ # Different get_video_info requests may report different results, e.g.
+ # some may report video unavailability, but some may serve it without
+ # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
+ # the original webpage as well as el=info and el=embedded get_video_info
+ # requests report video unavailability due to geo restriction while
+ # el=detailpage succeeds and returns valid data). This is probably
+ # due to YouTube measures against IP ranges of hosting providers.
+ # Working around by preferring the first succeeded video_info containing
+ # the token if no such video_info yet was found.
+ if 'token' not in video_info:
+ video_info = get_video_info
break
if 'token' not in video_info:
if 'reason' in video_info:
@@ -1222,6 +1320,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
+ m_music = re.search(
+ r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
+ video_webpage)
+ if m_music:
+ video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
+ video_creator = clean_html(m_music.group('creator'))
+ else:
+ video_alt_title = video_creator = None
+
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
video_webpage, 'categories', default=None)
@@ -1332,7 +1439,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
@@ -1394,6 +1501,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
manifest_url = video_info['hlsvp'][0]
url_map = self._extract_from_m3u8(manifest_url, video_id)
formats = _map_to_format_list(url_map)
+ # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
+ for a_format in formats:
+ a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
else:
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
@@ -1431,10 +1541,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
video_webpage)
if stretched_m:
- ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
- for f in formats:
- if f.get('vcodec') != 'none':
- f['stretched_ratio'] = ratio
+ w = float(stretched_m.group('w'))
+ h = float(stretched_m.group('h'))
+ # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
+ # We will only process correct ratios.
+ if w > 0 and h > 0:
+ ratio = w / h
+ for f in formats:
+ if f.get('vcodec') != 'none':
+ f['stretched_ratio'] = ratio
self._sort_formats(formats)
@@ -1443,7 +1558,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': video_uploader,
'uploader_id': video_uploader_id,
'upload_date': upload_date,
+ 'creator': video_creator,
'title': video_title,
+ 'alt_title': video_alt_title,
'thumbnail': video_thumbnail,
'description': video_description,
'categories': video_categories,
@@ -1473,7 +1590,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract
youtube\.com/
(?:
(?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
- \? (?:.*?&)*? (?:p|a|list)=
+ \? (?:.*?[&;])*? (?:p|a|list)=
| p/
)
(
@@ -1604,7 +1721,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract
self.report_warning('Youtube gives an alert message: ' + match)
playlist_title = self._html_search_regex(
- r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
+ r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
page, 'title')
return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
@@ -1658,6 +1775,10 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
},
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url)
+
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -1731,6 +1852,36 @@ class YoutubeUserIE(YoutubeChannelIE):
return super(YoutubeUserIE, cls).suitable(url)
+class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
+ IE_DESC = 'YouTube.com user/channel playlists'
+ _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
+ IE_NAME = 'youtube:playlists'
+
+ _TESTS = [{
+ 'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': 'ThirstForScience',
+ 'title': 'Thirst for Science',
+ },
+ }, {
+ # with "Load more" button
+ 'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ 'playlist_mincount': 70,
+ 'info_dict': {
+ 'id': 'igorkle1',
+ 'title': 'Игорь Клейнер',
+ },
+ }, {
+ 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
+ 'playlist_mincount': 17,
+ 'info_dict': {
+ 'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
+ 'title': 'Chem Player',
+ },
+ }]
+
+
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
IE_DESC = 'YouTube.com searches'
# there doesn't appear to be a real limit, for example if you search for
@@ -1826,7 +1977,7 @@ class YoutubeSearchURLIE(InfoExtractor):
}
-class YoutubeShowIE(InfoExtractor):
+class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
IE_DESC = 'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
IE_NAME = 'youtube:show'
@@ -1840,26 +1991,9 @@ class YoutubeShowIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
- webpage = self._download_webpage(
- 'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage')
- # There's one playlist for each season of the show
- m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
- self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
- entries = [
- self.url_result(
- 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
- for season in m_seasons
- ]
- title = self._og_search_title(webpage, fatal=False)
-
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': title,
- 'entries': entries,
- }
+ playlist_id = self._match_id(url)
+ return super(YoutubeShowIE, self)._real_extract(
+ 'https://www.youtube.com/show/%s/playlists' % playlist_id)
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):