aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/youtube.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r--youtube_dl/extractor/youtube.py317
1 files changed, 210 insertions, 107 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index cd35a1620..e28db2b5a 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1,7 +1,5 @@
# coding: utf-8
-import errno
-import io
import itertools
import json
import os.path
@@ -21,7 +19,6 @@ from ..utils import (
compat_str,
clean_html,
- get_cachedir,
get_element_by_id,
get_element_by_attribute,
ExtractorError,
@@ -30,13 +27,13 @@ from ..utils import (
unescapeHTML,
unified_strdate,
orderedSet,
- write_json_file,
uppercase_escape,
)
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
+ _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
_LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
_AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
@@ -50,12 +47,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
fatal=False))
def _login(self):
+ """
+ Attempt to log in to YouTube.
+ True is returned if successful or skipped.
+ False is returned if login failed.
+
+ If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
+ """
(username, password) = self._get_login_info()
# No authentication to be performed
if username is None:
if self._LOGIN_REQUIRED:
raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
- return False
+ return True
login_page = self._download_webpage(
self._LOGIN_URL, None,
@@ -73,6 +77,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
u'Email': username,
u'GALX': galx,
u'Passwd': password,
+
u'PersistentCookie': u'yes',
u'_utf8': u'霱',
u'bgresponse': u'js_disabled',
@@ -88,6 +93,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
u'uilel': u'3',
u'hl': u'en_US',
}
+
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
@@ -99,6 +105,68 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
note=u'Logging in', errnote=u'unable to log in', fatal=False)
if login_results is False:
return False
+
+ if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
+ raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
+
+ # Two-Factor
+ # TODO add SMS and phone call support - these require making a request and then prompting the user
+
+ if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
+ tfa_code = self._get_tfa_info()
+
+ if tfa_code is None:
+ self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
+ self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ return False
+
+ # Unlike the first login form, secTok and timeStmp are both required for the TFA form
+
+ # Both hidden fields are mandatory for the TFA form, so a failed match
+ # must abort the login here; falling through would call .group() on None
+ # and raise AttributeError instead of reporting a failed login.
+ match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
+ if match is None:
+     self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
+     return False
+ secTok = match.group(1)
+ match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
+ if match is None:
+     self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
+     return False
+ timeStmp = match.group(1)
+
+ tfa_form_strs = {
+ u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+ u'smsToken': u'',
+ u'smsUserPin': tfa_code,
+ u'smsVerifyPin': u'Verify',
+
+ u'PersistentCookie': u'yes',
+ u'checkConnection': u'',
+ u'checkedDomains': u'youtube',
+ u'pstMsg': u'1',
+ u'secTok': secTok,
+ u'timeStmp': timeStmp,
+ u'service': u'youtube',
+ u'hl': u'en_US',
+ }
+ tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
+ tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
+
+ tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
+ tfa_results = self._download_webpage(
+ tfa_req, None,
+ note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)
+
+ if tfa_results is False:
+ return False
+
+ if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
+ self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
+ return False
+ if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
+ self._downloader.report_warning(u'unable to log in - did the page structure change?')
+ return False
+ if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
+ self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
+ return False
+
if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
self._downloader.report_warning(u'unable to log in: bad username or password')
return False
@@ -131,7 +199,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_DESC = u'YouTube.com'
_VALID_URL = r"""(?x)^
(
- (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
+ (?:https?://|//) # http(s):// or protocol-independent URL
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
@@ -149,7 +217,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
)
))
|youtu\.be/ # just youtu.be/xxxx
- |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
+ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
@@ -225,7 +293,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
# Dash webm audio
- '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
+ '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
'172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
# RTMP (unnamed)
@@ -244,6 +312,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"upload_date": u"20121002",
u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
u"categories": [u'Science & Technology'],
+ 'like_count': int,
+ 'dislike_count': int,
}
},
{
@@ -344,7 +414,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self.to_screen(u'RTMP download detected')
- def _extract_signature_function(self, video_id, player_url, slen):
+ def _signature_cache_id(self, example_sig):
+ """
+ Return the identifier under which a signature's layout is cached.
+
+ example_sig is split on '.', the length of each part is converted with
+ compat_str, and the results are joined with '.' again (presumably
+ u'40.40' for a signature made of two 40-character parts).
+ """
+ return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
+
+ def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
player_url)
@@ -354,21 +428,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
player_id = id_m.group('id')
# Read from filesystem cache
- func_id = '%s_%s_%d' % (player_type, player_id, slen)
+ func_id = '%s_%s_%s' % (
+ player_type, player_id, self._signature_cache_id(example_sig))
assert os.path.basename(func_id) == func_id
- cache_dir = get_cachedir(self._downloader.params)
- cache_enabled = cache_dir is not None
- if cache_enabled:
- cache_fn = os.path.join(os.path.expanduser(cache_dir),
- u'youtube-sigfuncs',
- func_id + '.json')
- try:
- with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
- cache_spec = json.load(cachef)
- return lambda s: u''.join(s[i] for i in cache_spec)
- except IOError:
- pass # No cache available
+ cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
+ if cache_spec is not None:
+ return lambda s: u''.join(s[i] for i in cache_spec)
if player_type == 'js':
code = self._download_webpage(
@@ -386,25 +452,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
assert False, 'Invalid player type %r' % player_type
- if cache_enabled:
- try:
- test_string = u''.join(map(compat_chr, range(slen)))
- cache_res = res(test_string)
- cache_spec = [ord(c) for c in cache_res]
- try:
- os.makedirs(os.path.dirname(cache_fn))
- except OSError as ose:
- if ose.errno != errno.EEXIST:
- raise
- write_json_file(cache_spec, cache_fn)
- except Exception:
- tb = traceback.format_exc()
- self._downloader.report_warning(
- u'Writing cache to %r failed: %s' % (cache_fn, tb))
+ if cache_spec is None:
+ test_string = u''.join(map(compat_chr, range(len(example_sig))))
+ cache_res = res(test_string)
+ cache_spec = [ord(c) for c in cache_res]
+ self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
return res
- def _print_sig_code(self, func, slen):
+ def _print_sig_code(self, func, example_sig):
def gen_sig_code(idxs):
def _genslice(start, end, step):
starts = u'' if start == 0 else str(start)
@@ -433,11 +489,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
yield _genslice(start, i, step)
- test_string = u''.join(map(compat_chr, range(slen)))
+ test_string = u''.join(map(compat_chr, range(len(example_sig))))
cache_res = func(test_string)
cache_spec = [ord(c) for c in cache_res]
expr_code = u' + '.join(gen_sig_code(cache_spec))
- code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
+ signature_id_tuple = '(%s)' % (
+ ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
+ code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
+ u' return %s\n') % (signature_id_tuple, expr_code)
self.to_screen(u'Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
@@ -465,20 +524,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if player_url.startswith(u'//'):
player_url = u'https:' + player_url
try:
- player_id = (player_url, len(s))
+ player_id = (player_url, self._signature_cache_id(s))
if player_id not in self._player_cache:
func = self._extract_signature_function(
- video_id, player_url, len(s)
+ video_id, player_url, s
)
self._player_cache[player_id] = func
func = self._player_cache[player_id]
if self._downloader.params.get('youtube_print_sig_code'):
- self._print_sig_code(func, len(s))
+ self._print_sig_code(func, s)
return func(s)
except Exception as e:
tb = traceback.format_exc()
raise ExtractorError(
- u'Automatic signature extraction failed: ' + tb, cause=e)
+ u'Signature extraction failed: ' + tb, cause=e)
def _get_available_subtitles(self, video_id, webpage):
try:
@@ -493,6 +552,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
sub_lang_list = {}
for l in lang_list:
lang = l[1]
+ if lang in sub_lang_list:
+ continue
params = compat_urllib_parse.urlencode({
'lang': lang,
'v': video_id,
@@ -695,7 +756,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
- m_cat_container = get_element_by_id("eow-category", video_webpage)
+ m_cat_container = self._search_regex(
+ r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
+ video_webpage, 'categories', fatal=False)
if m_cat_container:
category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
@@ -724,15 +787,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
video_description = u''
- def _extract_count(klass):
+ def _extract_count(count_name):
count = self._search_regex(
- r'class="%s">([\d,]+)</span>' % re.escape(klass),
- video_webpage, klass, default=None)
+ r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
+ video_webpage, count_name, default=None)
if count is not None:
return int(count.replace(',', ''))
return None
- like_count = _extract_count(u'likes-count')
- dislike_count = _extract_count(u'dislikes-count')
+ like_count = _extract_count(u'like')
+ dislike_count = _extract_count(u'dislike')
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
@@ -806,51 +869,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url_map = {}
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
- if 'itag' in url_data and 'url' in url_data:
- url = url_data['url'][0]
- if 'sig' in url_data:
- url += '&signature=' + url_data['sig'][0]
- elif 's' in url_data:
- encrypted_sig = url_data['s'][0]
-
- if not age_gate:
- jsplayer_url_json = self._search_regex(
- r'"assets":.+?"js":\s*("[^"]+")',
- video_webpage, u'JS player URL')
- player_url = json.loads(jsplayer_url_json)
+ if 'itag' not in url_data or 'url' not in url_data:
+ continue
+ format_id = url_data['itag'][0]
+ url = url_data['url'][0]
+
+ if 'sig' in url_data:
+ url += '&signature=' + url_data['sig'][0]
+ elif 's' in url_data:
+ encrypted_sig = url_data['s'][0]
+
+ if not age_gate:
+ jsplayer_url_json = self._search_regex(
+ r'"assets":.+?"js":\s*("[^"]+")',
+ video_webpage, u'JS player URL')
+ player_url = json.loads(jsplayer_url_json)
+ if player_url is None:
+ player_url_json = self._search_regex(
+ r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
+ video_webpage, u'age gate player URL')
+ player_url = json.loads(player_url_json)
+
+ if self._downloader.params.get('verbose'):
if player_url is None:
- player_url_json = self._search_regex(
- r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
- video_webpage, u'age gate player URL')
- player_url = json.loads(player_url_json)
-
- if self._downloader.params.get('verbose'):
- if player_url is None:
- player_version = 'unknown'
- player_desc = 'unknown'
+ player_version = 'unknown'
+ player_desc = 'unknown'
+ else:
+ if player_url.endswith('swf'):
+ player_version = self._search_regex(
+ r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
+ u'flash player', fatal=False)
+ player_desc = 'flash player %s' % player_version
else:
- if player_url.endswith('swf'):
- player_version = self._search_regex(
- r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
- u'flash player', fatal=False)
- player_desc = 'flash player %s' % player_version
- else:
- player_version = self._search_regex(
- r'html5player-([^/]+?)(?:/html5player)?\.js',
- player_url,
- 'html5 player', fatal=False)
- player_desc = u'html5 player %s' % player_version
-
- parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
- self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
- (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
-
- signature = self._decrypt_signature(
- encrypted_sig, video_id, player_url, age_gate)
- url += '&signature=' + signature
- if 'ratebypass' not in url:
- url += '&ratebypass=yes'
- url_map[url_data['itag'][0]] = url
+ player_version = self._search_regex(
+ r'html5player-([^/]+?)(?:/html5player)?\.js',
+ player_url,
+ 'html5 player', fatal=False)
+ player_desc = u'html5 player %s' % player_version
+
+ parts_sizes = self._signature_cache_id(encrypted_sig)
+ self.to_screen(u'{%s} signature length %s, %s' %
+ (format_id, parts_sizes, player_desc))
+
+ signature = self._decrypt_signature(
+ encrypted_sig, video_id, player_url, age_gate)
+ url += '&signature=' + signature
+ if 'ratebypass' not in url:
+ url += '&ratebypass=yes'
+ url_map[format_id] = url
formats = _map_to_format_list(url_map)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
@@ -959,21 +1025,26 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
self._login()
def _ids_to_results(self, ids):
- return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
+ return [
+ self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
- webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
+ webpage = self._download_webpage(
+ url, playlist_id, u'Downloading Youtube mix')
search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
- title_span = (search_title('playlist-title') or
- search_title('title long-title') or search_title('title'))
+ title_span = (
+ search_title('playlist-title') or
+ search_title('title long-title') or
+ search_title('title'))
title = clean_html(title_span)
- video_re = r'''(?x)data-video-username=".*?".*?
- href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
- ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
+ ids = orderedSet(re.findall(
+ r'''(?xs)data-video-username=".*?".*?
+ href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
+ webpage))
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, title)
@@ -1066,6 +1137,7 @@ class YoutubeTopListIE(YoutubePlaylistIE):
msg = u'Downloading Youtube mix'
if i > 0:
msg += ', retry #%d' % i
+
webpage = self._download_webpage(url, title, msg)
ids = orderedSet(re.findall(video_re, webpage))
if ids:
@@ -1339,12 +1411,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
paging = mobj.group('paging')
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
-class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- _FEED_NAME = 'subscriptions'
- _PLAYLIST_TITLE = u'Youtube Subscriptions'
-
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
@@ -1377,6 +1443,43 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
return self.url_result(playlist_id, 'YoutubePlaylist')
+class YoutubeSubscriptionsIE(YoutubePlaylistIE):
+ """
+ Extractor for the subscriptions feed (feed/subscriptions URL or ":ytsubs" keyword).
+
+ The feed page is downloaded, video ids are collected from it, and further
+ pages are fetched for as long as a "load more" link is present.
+ """
+ IE_NAME = u'youtube:subscriptions'
+ IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+
+ def _real_extract(self, url):
+ """
+ Return a playlist dict with one entry per video id found in the feed.
+
+ url is not used: the feed is always fetched from the fixed
+ subscriptions address below.
+ """
+ title = u'Youtube Subscriptions'
+ page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
+
+ # The extraction process is the same as for playlists, but the regex
+ # for the video ids doesn't contain an index
+ ids = []
+ # On the first iteration the initial page is scanned both for video ids
+ # and for the "load more" link
+ more_widget_html = content_html = page
+
+ for page_num in itertools.count(1):
+ matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+ # orderedSet is applied per page only (presumably it drops duplicates
+ # while keeping order); ids repeated on a later page would still be
+ # appended again - TODO confirm whether that can happen
+ new_ids = orderedSet(matches)
+ ids.extend(new_ids)
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ # No "load more" link: this was the last page
+ if not mobj:
+ break
+
+ # The continuation is JSON carrying the next chunk of HTML and the
+ # widget that holds the following "load more" link
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), title,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
+
+ # Playlist result built by hand; the entries come from _ids_to_results
+ return {
+ '_type': 'playlist',
+ 'title': title,
+ 'entries': self._ids_to_results(ids),
+ }
+
+
class YoutubeTruncatedURLIE(InfoExtractor):
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list