aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/youtube.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r--youtube_dl/extractor/youtube.py214
1 files changed, 137 insertions, 77 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 939f5e61f..9a08924ef 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -17,6 +17,8 @@ from ..compat import (
compat_chr,
compat_parse_qs,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
compat_urllib_request,
compat_urlparse,
compat_str,
@@ -29,9 +31,11 @@ from ..utils import (
get_element_by_id,
int_or_none,
orderedSet,
+ str_to_int,
unescapeHTML,
unified_strdate,
uppercase_escape,
+ ISO3166Utils,
)
@@ -234,6 +238,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'44': {'ext': 'webm', 'width': 854, 'height': 480},
'45': {'ext': 'webm', 'width': 1280, 'height': 720},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480},
# 3d videos
@@ -516,6 +522,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': 'requires avconv',
}
},
+ # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+ {
+ 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+ 'info_dict': {
+ 'id': 'FIl7x6_3R5Y',
+ 'ext': 'mp4',
+ 'title': 'md5:7b81415841e02ecd4313668cde88737a',
+ 'description': 'md5:116377fd2963b81ec4ce64b542173306',
+ 'upload_date': '20150625',
+ 'uploader_id': 'dorappi2000',
+ 'uploader': 'dorappi2000',
+ 'formats': 'mincount:33',
+ },
+ },
# DASH manifest with segment_list
{
'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
@@ -533,7 +553,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'youtube_include_dash_manifest': True,
'format': '135', # bestvideo
}
- }
+ },
]
def __init__(self, *args, **kwargs):
@@ -798,16 +818,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
def _parse_dash_manifest(
- self, video_id, dash_manifest_url, player_url, age_gate):
+ self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
def decrypt_sig(mobj):
s = mobj.group(1)
dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
return '/signature/%s' % dec_s
- dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+ dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml(
dash_manifest_url, video_id,
note='Downloading DASH manifest',
- errnote='Could not download DASH manifest')
+ errnote='Could not download DASH manifest',
+ fatal=fatal)
+
+ if dash_doc is False:
+ return []
formats = []
for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
@@ -847,6 +871,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
except StopIteration:
full_info = self._formats.get(format_id, {}).copy()
full_info.update(f)
+ codecs = r.attrib.get('codecs')
+ if codecs:
+ if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
+ full_info['vcodec'] = codecs
+ elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
+ full_info['acodec'] = codecs
formats.append(full_info)
else:
existing_format.update(f)
@@ -862,7 +892,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
- url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+ url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
video_id = self.extract_id(url)
# Get video webpage
@@ -876,6 +906,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
player_url = None
+ dash_mpds = []
+
+ def add_dash_mpd(video_info):
+ dash_mpd = video_info.get('dashmpd')
+ if dash_mpd and dash_mpd[0] not in dash_mpds:
+ dash_mpds.append(dash_mpd[0])
+
# Get video info
embed_webpage = None
if re.search(r'player-age-gate-content">', video_webpage) is not None:
@@ -896,24 +933,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(video_info)
else:
age_gate = False
- try:
- # Try looking directly into the video webpage
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if not mobj:
- raise ValueError('Could not find ytplayer.config') # caught below
+ video_info = None
+ # Try looking directly into the video webpage
+ mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+ if mobj:
json_code = uppercase_escape(mobj.group(1))
ytplayer_config = json.loads(json_code)
args = ytplayer_config['args']
- # Convert to the same format returned by compat_parse_qs
- video_info = dict((k, [v]) for k, v in args.items())
- if not args.get('url_encoded_fmt_stream_map'):
- raise ValueError('No stream_map present') # caught below
- except ValueError:
- # We fallback to the get_video_info pages (used by the embed page)
+ if args.get('url_encoded_fmt_stream_map'):
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ add_dash_mpd(video_info)
+ if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+ # We also try looking in get_video_info since it may contain different dashmpd
+ # URL that points to a DASH manifest with possibly different itag set (some itags
+ # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+ # manifest pointed by get_video_info's dashmpd).
+ # The general idea is to take a union of itags of both DASH manifests (for example
+ # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
self.report_video_info_webpage_download(video_id)
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = (
'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (proto, video_id, el_type))
@@ -921,11 +963,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_info_url,
video_id, note=False,
errnote='unable to download video info webpage')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
+ get_video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(get_video_info)
+ if not video_info:
+ video_info = get_video_info
+ if 'token' in get_video_info:
break
if 'token' not in video_info:
if 'reason' in video_info:
+ if 'The uploader has not made this video available in your country.' in video_info['reason']:
+ regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
+ if regions_allowed is not None:
+ raise ExtractorError('YouTube said: This video is available in %s only' % (
+ ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
+ expected=True)
raise ExtractorError(
'YouTube said: %s' % video_info['reason'][0],
expected=True, video_id=video_id)
@@ -949,7 +1000,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# uploader
if 'author' not in video_info:
raise ExtractorError('Unable to extract uploader name')
- video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+ video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
# uploader_id
video_uploader_id = None
@@ -976,18 +1027,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = None
else: # don't panic if we can't find it
- video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
+ video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# upload date
- upload_date = None
- mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
- if mobj is None:
- mobj = re.search(
- r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
- video_webpage)
- if mobj is not None:
- upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
- upload_date = unified_strdate(upload_date)
+ upload_date = self._html_search_meta(
+ 'datePublished', video_webpage, 'upload date', default=None)
+ if not upload_date:
+ upload_date = self._search_regex(
+ [r'(?s)id="eow-date.*?>(.*?)</span>',
+ r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+ video_webpage, 'upload date', default=None)
+ if upload_date:
+ upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
+ upload_date = unified_strdate(upload_date)
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
@@ -1021,12 +1073,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_description = ''
def _extract_count(count_name):
- count = self._search_regex(
- r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
- video_webpage, count_name, default=None)
- if count is not None:
- return int(count.replace(',', ''))
- return None
+ return str_to_int(self._search_regex(
+ r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+ % re.escape(count_name),
+ video_webpage, count_name, default=None))
+
like_count = _extract_count('like')
dislike_count = _extract_count('dislike')
@@ -1038,7 +1089,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning('unable to extract video duration')
video_duration = None
else:
- video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
+ video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
# annotations
video_annotations = None
@@ -1141,24 +1192,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
- dash_mpd = video_info.get('dashmpd')
- if dash_mpd:
- dash_manifest_url = dash_mpd[0]
+ dash_mpd_fatal = True
+ for dash_manifest_url in dash_mpds:
+ dash_formats = {}
try:
- dash_formats = self._parse_dash_manifest(
- video_id, dash_manifest_url, player_url, age_gate)
+ for df in self._parse_dash_manifest(
+ video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
+ # Do not overwrite DASH format found in some previous DASH manifest
+ if df['format_id'] not in dash_formats:
+ dash_formats[df['format_id']] = df
+ # Additional DASH manifests may end up in HTTP Error 403 therefore
+ # allow them to fail without bug report message if we already have
+ # some DASH manifest succeeded. This is temporary workaround to reduce
+ # burst of bug reports until we figure out the reason and whether it
+ # can be fixed at all.
+ dash_mpd_fatal = False
except (ExtractorError, KeyError) as e:
self.report_warning(
'Skipping DASH manifest: %r' % e, video_id)
- else:
+ if dash_formats:
# Remove the formats we found through non-DASH, they
# contain less info and it can be wrong, because we use
# fixed values (for example the resolution). See
# https://github.com/rg3/youtube-dl/issues/5774 for an
# example.
- dash_keys = set(df['format_id'] for df in dash_formats)
- formats = [f for f in formats if f['format_id'] not in dash_keys]
- formats.extend(dash_formats)
+ formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+ formats.extend(dash_formats.values())
# Check for malformed aspect ratio
stretched_m = re.search(
@@ -1315,7 +1374,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _extract_playlist(self, playlist_id):
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
- more_widget_html = content_html = page
for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
match = match.strip()
@@ -1335,36 +1393,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
self.report_warning('Youtube gives an alert message: ' + match)
# Extract the video ids from the playlist pages
- ids = []
-
- for page_num in itertools.count(1):
- matches = re.finditer(self._VIDEO_RE, content_html)
- # We remove the duplicates and the link with index 0
- # (it's not the first video of the playlist)
- new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
- ids.extend(new_ids)
-
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
+ def _entries():
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ matches = re.finditer(self._VIDEO_RE, content_html)
+ # We remove the duplicates and the link with index 0
+ # (it's not the first video of the playlist)
+ new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+ for vid_id in new_ids:
+ yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
+ more_widget_html = more['load_more_widget_html']
playlist_title = self._html_search_regex(
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
page, 'title')
- url_results = self._ids_to_results(ids)
- return self.playlist_result(url_results, playlist_id, playlist_title)
+ return self.playlist_result(_entries(), playlist_id, playlist_title)
def _real_extract(self, url):
# Extract playlist id
@@ -1431,10 +1489,12 @@ class YoutubeChannelIE(InfoExtractor):
channel_page = self._download_webpage(
url + '?view=57', channel_id,
'Downloading channel page', fatal=False)
- channel_playlist_id = self._search_regex(
- [r'<meta itemprop="channelId" content="([^"]+)">',
- r'data-channel-external-id="([^"]+)"'],
- channel_page, 'channel id', default=None)
+ channel_playlist_id = self._html_search_meta(
+ 'channelId', channel_page, 'channel id', default=None)
+ if not channel_playlist_id:
+ channel_playlist_id = self._search_regex(
+ r'data-channel-external-id="([^"]+)"',
+ channel_page, 'channel id', default=None)
if channel_playlist_id and channel_playlist_id.startswith('UC'):
playlist_id = 'UU' + channel_playlist_id[2:]
return self.url_result(
@@ -1528,7 +1588,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
for pagenum in itertools.count(1):
url_query = {
- 'search_query': query,
+ 'search_query': query.encode('utf-8'),
'page': pagenum,
'spf': 'navigate',
}
@@ -1576,7 +1636,7 @@ class YoutubeSearchURLIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+ query = compat_urllib_parse_unquote_plus(mobj.group('query'))
webpage = self._download_webpage(url, query)
result_code = self._search_regex(