aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- test/test_jsinterp.py 66
-rw-r--r-- test/test_youtube_lists.py 37
-rw-r--r-- test/test_youtube_signature.py 99
-rw-r--r-- youtube_dl/compat.py 26
-rw-r--r-- youtube_dl/extractor/youtube.py 165
-rw-r--r-- youtube_dl/jsinterp.py 510
6 files changed, 727 insertions, 176 deletions
diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py
index c24b8ca74..acdabffb1 100644
--- a/test/test_jsinterp.py
+++ b/test/test_jsinterp.py
@@ -112,6 +112,72 @@ class TestJSInterpreter(unittest.TestCase):
''')
self.assertEqual(jsi.call_function('z'), 5)
+ def test_for_loop(self):
+ # A three-clause for loop: the condition `i-10` stays truthy until i
+ # reaches 10, so the body is expected to run ten times.
+ # The same function written with `i++` as the increment clause:
+ # function x() { a=0; for (i=0; i-10; i++) {a++} a }
+ jsi = JSInterpreter('''
+ function x() { a=0; for (i=0; i-10; i = i + 1) {a++} a }
+ ''')
+ self.assertEqual(jsi.call_function('x'), 10)
+
+ def test_switch(self):
+ # Checks case matching, fall-through between cases, `break`, and the
+ # trailing `default` branch.
+ jsi = JSInterpreter('''
+ function x(f) { switch(f){
+ case 1:f+=1;
+ case 2:f+=2;
+ case 3:f+=3;break;
+ case 4:f+=4;
+ default:f=0;
+ } return f }
+ ''')
+ # f=1 falls through cases 1, 2 and 3: 1 + 1 + 2 + 3 == 7
+ self.assertEqual(jsi.call_function('x', 1), 7)
+ # f=3 enters at case 3 and stops at its break: 3 + 3 == 6
+ self.assertEqual(jsi.call_function('x', 3), 6)
+ # f=5 matches no case, so only the default branch runs
+ self.assertEqual(jsi.call_function('x', 5), 0)
+
+ def test_switch_default(self):
+ # Checks a `default` branch that is not the last clause: it must only be
+ # entered when no case matches, and it falls through into later cases.
+ jsi = JSInterpreter('''
+ function x(f) { switch(f){
+ case 2: f+=2;
+ default: f-=1;
+ case 5:
+ case 6: f+=6;
+ case 0: break;
+ case 1: f+=1;
+ } return f }
+ ''')
+ # f=1 matches the last case only: 1 + 1 == 2
+ self.assertEqual(jsi.call_function('x', 1), 2)
+ # f=5 falls through the empty case 5 into case 6, then breaks: 5 + 6 == 11
+ self.assertEqual(jsi.call_function('x', 5), 11)
+ # f=9 matches nothing: default (9 - 1), then case 6 (+ 6), then break == 14
+ self.assertEqual(jsi.call_function('x', 9), 14)
+
+ def test_try(self):
+ # A `return` inside the try block must be the function's result; the
+ # catch block's value (5) must not be returned.
+ jsi = JSInterpreter('''
+ function x() { try{return 10} catch(e){return 5} }
+ ''')
+ self.assertEqual(jsi.call_function('x'), 10)
+
+ def test_for_loop_continue(self):
+ # `continue` precedes `a++` in the loop body, so `a` is expected to stay
+ # at 0 while the loop still terminates.
+ jsi = JSInterpreter('''
+ function x() { a=0; for (i=0; i-10; i++) { continue; a++ } a }
+ ''')
+ self.assertEqual(jsi.call_function('x'), 0)
+
+ def test_for_loop_break(self):
+ # `break` precedes `a++` in the loop body, so the loop is expected to end
+ # with `a` still at 0.
+ jsi = JSInterpreter('''
+ function x() { a=0; for (i=0; i-10; i++) { break; a++ } a }
+ ''')
+ self.assertEqual(jsi.call_function('x'), 0)
+
+ def test_literal_list(self):
+ # Indexing a list literal directly: element 3 is itself a nested list
+ # literal and is expected back as a Python list.
+ jsi = JSInterpreter('''
+ function x() { [1, 2, "asdf", [5, 6, 7]][3] }
+ ''')
+ self.assertEqual(jsi.call_function('x'), [5, 6, 7])
+
+ def test_comma(self):
+ # Comma-separated expressions in one statement must all be evaluated,
+ # left to right: 5 - 1 + 3 == 7
+ jsi = JSInterpreter('''
+ function x() { a=5; a -= 1, a+=3; return a }
+ ''')
+ self.assertEqual(jsi.call_function('x'), 7)
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
index cf2fdf14f..07a6b6d06 100644
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
from __future__ import unicode_literals
# Allow direct execution
@@ -9,11 +10,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL
-
from youtube_dl.extractor import (
+ YoutubeIE,
YoutubePlaylistIE,
YoutubeTabIE,
- YoutubeIE,
)
@@ -25,38 +25,23 @@ class TestYoutubeLists(unittest.TestCase):
def test_youtube_playlist_noplaylist(self):
dl = FakeYDL()
dl.params['noplaylist'] = True
+ dl.params['format'] = 'best'
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
self.assertEqual(result['_type'], 'url')
+ result = dl.extract_info(result['url'], download=False, ie_key=result.get('ie_key'), process=False)
self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg')
- def test_youtube_course(self):
- dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- # TODO find a > 100 (paginating?) videos course
- result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
- entries = list(result['entries'])
- self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs')
- self.assertEqual(len(entries), 25)
- self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0')
-
def test_youtube_mix(self):
dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w')
- entries = result['entries']
+ dl.params['format'] = 'best'
+ ie = YoutubeTabIE(dl)
+ result = dl.extract_info('https://www.youtube.com/watch?v=uVJ0Il5WvbE&list=PLhQjrBD2T381k8ul4WQ8SQ165XqY149WW',
+ download=False, ie_key=ie.ie_key(), process=True)
+ entries = (result or {}).get('entries', [{'id': 'not_found', }])
self.assertTrue(len(entries) >= 50)
original_video = entries[0]
- self.assertEqual(original_video['id'], 'OQpdSVF_k_w')
-
- def test_youtube_toptracks(self):
- print('Skipping: The playlist page gives error 500')
- return
- dl = FakeYDL()
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
- entries = result['entries']
- self.assertEqual(len(entries), 100)
+ self.assertEqual(original_video['id'], 'uVJ0Il5WvbE')
def test_youtube_flat_playlist_extraction(self):
dl = FakeYDL()
@@ -67,7 +52,7 @@ class TestYoutubeLists(unittest.TestCase):
entries = list(result['entries'])
self.assertTrue(len(entries) == 1)
video = entries[0]
- self.assertEqual(video['_type'], 'url_transparent')
+ self.assertEqual(video['_type'], 'url')
self.assertEqual(video['ie_key'], 'Youtube')
self.assertEqual(video['id'], 'BaW_jenozKc')
self.assertEqual(video['url'], 'BaW_jenozKc')
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index 627d4cb92..fc5e9828e 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -14,9 +14,10 @@ import string
from test.helper import FakeYDL
from youtube_dl.extractor import YoutubeIE
+from youtube_dl.jsinterp import JSInterpreter
from youtube_dl.compat import compat_str, compat_urlretrieve
-_TESTS = [
+_SIG_TESTS = [
(
'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js',
86,
@@ -64,6 +65,33 @@ _TESTS = [
)
]
+# Test vectors for the "n" parameter function, unpacked by the generated test
+# as (url, sig_input, expected_sig): player JS URL, input string, and the
+# string the player's n function is expected to return for it.
+_NSIG_TESTS = [
+ (
+ 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js',
+ 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w',
+ ),
+ (
+ 'https://www.youtube.com/s/player/f8cb7a3b/player_ias.vflset/en_US/base.js',
+ 'oBo2h5euWy6osrUt', 'ivXHpm7qJjJN',
+ ),
+ (
+ 'https://www.youtube.com/s/player/2dfe380c/player_ias.vflset/en_US/base.js',
+ 'oBo2h5euWy6osrUt', '3DIBbn3qdQ',
+ ),
+ (
+ 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js',
+ 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q',
+ ),
+ (
+ 'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js',
+ 'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw',
+ ),
+ (
+ 'https://www.youtube.com/s/player/e06dea74/player_ias.vflset/en_US/base.js',
+ 'AiuodmaDDYw8d3y4bf', 'ankd8eza2T6Qmw',
+ ),
+]
+
class TestPlayerInfo(unittest.TestCase):
def test_youtube_extract_player_info(self):
@@ -90,40 +118,61 @@ class TestPlayerInfo(unittest.TestCase):
class TestSignature(unittest.TestCase):
def setUp(self):
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
- self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata')
+ self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata/sigs')
if not os.path.exists(self.TESTDATA_DIR):
os.mkdir(self.TESTDATA_DIR)
+    def tearDown(self):
+        # Best-effort removal of the player files cached in TESTDATA_DIR.
+        # os.listdir() yields bare file names, so each one has to be joined
+        # with the directory; passing the bare name to os.remove() would
+        # resolve it against the current working directory instead.
+        try:
+            for f in os.listdir(self.TESTDATA_DIR):
+                os.remove(os.path.join(self.TESTDATA_DIR, f))
+        except OSError:
+            # Cleanup failure must not fail the test itself
+            pass
+
+
+def t_factory(name, sig_func, url_pattern):
+ """Return a function that registers one player test on TestSignature.
+
+ name -- label used in the cached file name and the test method name
+ sig_func -- callable (jscode, sig_input) whose result is compared
+ with the expected value
+ url_pattern -- compiled regex with a named group 'id' that extracts the
+ player ID from the player URL
+
+ The returned make_tfunc(url, sig_input, expected_sig) asserts that the URL
+ matches url_pattern and attaches a method 'test_<name>_js_<id>' to
+ TestSignature.
+ """
+ def make_tfunc(url, sig_input, expected_sig):
+ m = url_pattern.match(url)
+ assert m, '%r should follow URL format' % url
+ test_id = m.group('id')
+
+ def test_func(self):
+ basename = 'player-{0}-{1}.js'.format(name, test_id)
+ fn = os.path.join(self.TESTDATA_DIR, basename)
+
+ # Download the player JS only if it is not already on disk
+ if not os.path.exists(fn):
+ compat_urlretrieve(url, fn)
+ with io.open(fn, encoding='utf-8') as testf:
+ jscode = testf.read()
+ self.assertEqual(sig_func(jscode, sig_input), expected_sig)
+
+ # str() keeps the method name a native string
+ test_func.__name__ = str('test_{0}_js_{1}'.format(name, test_id))
+ setattr(TestSignature, test_func.__name__, test_func)
+ return make_tfunc
-def make_tfunc(url, sig_input, expected_sig):
- m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url)
- assert m, '%r should follow URL format' % url
- test_id = m.group(1)
- def test_func(self):
- basename = 'player-%s.js' % test_id
- fn = os.path.join(self.TESTDATA_DIR, basename)
+def signature(jscode, sig_input):
+ """Apply the signature function parsed from jscode to sig_input.
+
+ An int sig_input stands for a source signature of that length, taken
+ from the start of string.printable; any other value is passed as is.
+ """
+ func = YoutubeIE(FakeYDL())._parse_sig_js(jscode)
+ src_sig = (
+ compat_str(string.printable[:sig_input])
+ if isinstance(sig_input, int) else sig_input)
+ return func(src_sig)
- if not os.path.exists(fn):
- compat_urlretrieve(url, fn)
- ydl = FakeYDL()
- ie = YoutubeIE(ydl)
- with io.open(fn, encoding='utf-8') as testf:
- jscode = testf.read()
- func = ie._parse_sig_js(jscode)
- src_sig = (
- compat_str(string.printable[:sig_input])
- if isinstance(sig_input, int) else sig_input)
- got_sig = func(src_sig)
- self.assertEqual(got_sig, expected_sig)
+def n_sig(jscode, sig_input):
+ """Find the "n" parameter function in jscode and call it on sig_input."""
+ funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode)
+ return JSInterpreter(jscode).call_function(funcname, sig_input)
- test_func.__name__ = str('test_signature_js_' + test_id)
- setattr(TestSignature, test_func.__name__, test_func)
+# Register one TestSignature method per entry of _SIG_TESTS; the player ID
+# is the last '-'-separated token before the file name/extension.
+make_sig_test = t_factory(
+ 'signature', signature, re.compile(r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$'))
+for test_spec in _SIG_TESTS:
+ make_sig_test(*test_spec)
-for test_spec in _TESTS:
- make_tfunc(*test_spec)
+# Register one TestSignature method per entry of _NSIG_TESTS; the player ID
+# is the path segment following '/player/'. The dot of the '.js' extension
+# is escaped so that it only matches a literal '.'.
+make_nsig_test = t_factory(
+    'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+\.js$'))
+for test_spec in _NSIG_TESTS:
+    make_nsig_test(*test_spec)
if __name__ == '__main__':
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 9e45c454b..2004a405a 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -21,6 +21,10 @@ import subprocess
import sys
import xml.etree.ElementTree
+# The container ABCs live in collections.abc where that module exists;
+# otherwise fall back to the collections module itself.
+try:
+ import collections.abc as compat_collections_abc
+except ImportError:
+ import collections as compat_collections_abc
try:
import urllib.request as compat_urllib_request
@@ -2962,6 +2966,25 @@ else:
compat_Struct = struct.Struct
+# compat_map/filter() returning an iterator, supposedly the
+# same versioning as for zip below.
+# Lookup order for each: future_builtins, then the itertools i-variant,
+# then the builtin itself when neither import is available.
+try:
+ from future_builtins import map as compat_map
+except ImportError:
+ try:
+ from itertools import imap as compat_map
+ except ImportError:
+ compat_map = map
+
+try:
+ from future_builtins import filter as compat_filter
+except ImportError:
+ try:
+ from itertools import ifilter as compat_filter
+ except ImportError:
+ compat_filter = filter
+
+
try:
from future_builtins import zip as compat_zip
except ImportError: # not 2.6+ or is 3.x
@@ -3006,6 +3029,7 @@ __all__ = [
'compat_b64decode',
'compat_basestring',
'compat_chr',
+ 'compat_collections_abc',
'compat_cookiejar',
'compat_cookiejar_Cookie',
'compat_cookies',
@@ -3015,6 +3039,7 @@ __all__ = [
'compat_etree_fromstring',
'compat_etree_register_namespace',
'compat_expanduser',
+ 'compat_filter',
'compat_get_terminal_size',
'compat_getenv',
'compat_getpass',
@@ -3026,6 +3051,7 @@ __all__ = [
'compat_integer_types',
'compat_itertools_count',
'compat_kwargs',
+ 'compat_map',
'compat_numeric_types',
'compat_ord',
'compat_os_name',
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 7cd651c67..3ab60960a 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -13,6 +13,7 @@ from .common import InfoExtractor, SearchInfoExtractor
from ..compat import (
compat_chr,
compat_HTTPError,
+ compat_map as map,
compat_parse_qs,
compat_str,
compat_urllib_parse_unquote_plus,
@@ -25,8 +26,10 @@ from ..utils import (
ExtractorError,
clean_html,
dict_get,
+ error_to_compat_str,
float_or_none,
int_or_none,
+ js_to_json,
mimetype2ext,
parse_codecs,
parse_duration,
@@ -469,6 +472,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
+ |shorts/
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
@@ -1171,6 +1175,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True,
},
},
+ {
+ # YT 'Shorts'
+ 'url': 'https://youtube.com/shorts/4L2J27mJ3Dc',
+ 'info_dict': {
+ 'id': '4L2J27mJ3Dc',
+ 'ext': 'mp4',
+ 'upload_date': '20211025',
+ 'uploader': 'Charlie Berens',
+ 'description': 'md5:976512b8a29269b93bbd8a61edc45a6d',
+ 'uploader_id': 'fivedlrmilkshake',
+ 'title': 'Midwest Squid Game #Shorts',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
_formats = {
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
@@ -1307,6 +1327,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError('Cannot identify player %r' % player_url)
return id_m.group('id')
+ def _get_player_code(self, video_id, player_url, player_id=None):
+ """Return the player JS downloaded from player_url.
+
+ The code is cached in self._code_cache under player_id, so each player
+ is downloaded at most once per cache. When player_id is not given it
+ is derived from player_url with _extract_player_info().
+ """
+ if not player_id:
+ player_id = self._extract_player_info(player_url)
+
+ if player_id not in self._code_cache:
+ self._code_cache[player_id] = self._download_webpage(
+ player_url, video_id,
+ note='Downloading player ' + player_id,
+ errnote='Download of %s failed' % player_url)
+ return self._code_cache[player_id]
+
def _extract_signature_function(self, video_id, player_url, example_sig):
player_id = self._extract_player_info(player_url)
@@ -1319,12 +1350,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
- if player_id not in self._code_cache:
- self._code_cache[player_id] = self._download_webpage(
- player_url, video_id,
- note='Downloading player ' + player_id,
- errnote='Download of %s failed' % player_url)
- code = self._code_cache[player_id]
+ code = self._get_player_code(video_id, player_url, player_id)
res = self._parse_sig_js(code)
test_string = ''.join(map(compat_chr, range(len(example_sig))))
@@ -1403,11 +1429,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if player_url is None:
raise ExtractorError('Cannot decrypt signature without player_url')
- if player_url.startswith('//'):
- player_url = 'https:' + player_url
- elif not re.match(r'https?://', player_url):
- player_url = compat_urlparse.urljoin(
- 'https://www.youtube.com', player_url)
try:
player_id = (player_url, self._signature_cache_id(s))
if player_id not in self._player_cache:
@@ -1424,6 +1445,100 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e)
+ def _extract_player_url(self, webpage):
+ """Return the absolute player JS URL found in webpage, or None.
+
+ webpage may be None or empty. The search is non-fatal (fatal=False),
+ and None is returned when no URL is found.
+ """
+ player_url = self._search_regex(
+ r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
+ webpage or '', 'player URL', fatal=False)
+ if not player_url:
+ return
+ # Protocol-relative URL: assume https
+ if player_url.startswith('//'):
+ player_url = 'https:' + player_url
+ # Anything else without a scheme is resolved against the YouTube origin
+ elif not re.match(r'https?://', player_url):
+ player_url = compat_urlparse.urljoin(
+ 'https://www.youtube.com', player_url)
+ return player_url
+
+ # from yt-dlp
+ # See also:
+ # 1. https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-894619419
+ # 2. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116
+ # 3. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377
+ def _extract_n_function_name(self, jscode):
+ """Return the name of the "n" parameter function used in jscode.
+
+ The name is taken from the call site matching
+ '.get("n"))&&(b=NAME(x)' or '.get("n"))&&(b=NAME[IDX](x)', where NAME
+ is exactly three characters. With an index, NAME is looked up as
+ 'var NAME = [...];' and element IDX of that list is returned.
+ """
+ target = r'(?P<nfunc>[a-zA-Z0-9$]{3})(?:\[(?P<idx>\d+)\])?'
+ nfunc_and_idx = self._search_regex(
+ r'\.get\("n"\)\)&&\(b=(%s)\([a-zA-Z0-9]\)' % (target, ),
+ jscode, 'Initial JS player n function name')
+ # Re-match the captured text to split it into name and optional index
+ nfunc, idx = re.match(target, nfunc_and_idx).group('nfunc', 'idx')
+ if not idx:
+ return nfunc
+ # Indexed form: parse the JS list literal as JSON and pick the entry
+ return self._parse_json(self._search_regex(
+ r'var %s\s*=\s*(\[.+?\]);' % (nfunc, ), jscode,
+ 'Initial JS player n function list ({nfunc}[{idx}])'.format(**locals())), nfunc, transform_source=js_to_json)[int(idx)]
+
+    def _extract_n_function(self, video_id, player_url):
+        """Return a callable that applies the player's "n" function to a string.
+
+        The function code is loaded from the 'youtube-nsig' cache section
+        under the player ID. When it is not cached, the player JS is fetched,
+        the function is located and extracted, and the result is stored in
+        the cache.
+        """
+        player_id = self._extract_player_info(player_url)
+        func_code = self._downloader.cache.load('youtube-nsig', player_id)
+
+        if func_code:
+            jsi = JSInterpreter(func_code)
+        else:
+            # player_id was already derived above, so it is reused here
+            jscode = self._get_player_code(video_id, player_url, player_id)
+            funcname = self._extract_n_function_name(jscode)
+            jsi = JSInterpreter(jscode)
+            func_code = jsi.extract_function_code(funcname)
+            self._downloader.cache.store('youtube-nsig', player_id, func_code)
+
+        if self._downloader.params.get('youtube_print_sig_code'):
+            self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format(player_id, func_code[1]))
+
+        # The function is rebuilt from func_code on every call and is invoked
+        # with a one-element argument list
+        return lambda s: jsi.extract_function_from_code(*func_code)([s])
+
+ def _n_descramble(self, n_param, player_url, video_id):
+ """Compute the response to YT's "n" parameter challenge
+
+ Args:
+ n_param -- challenge string that is the value of the
+ URL's "n" query parameter
+ player_url -- URL of YT player JS
+ video_id
+
+ Returns the response string. If anything fails, a warning is
+ reported and None is returned implicitly.
+ """
+
+ # Responses are cached per challenge value
+ sig_id = ('nsig_value', n_param)
+ if sig_id in self._player_cache:
+ return self._player_cache[sig_id]
+
+ try:
+ # The extracted function is cached per player URL
+ player_id = ('nsig', player_url)
+ if player_id not in self._player_cache:
+ self._player_cache[player_id] = self._extract_n_function(video_id, player_url)
+ func = self._player_cache[player_id]
+ self._player_cache[sig_id] = func(n_param)
+ if self._downloader.params.get('verbose', False):
+ self._downloader.to_screen('[debug] [%s] %s' % (self.IE_NAME, 'Decrypted nsig {0} => {1}'.format(n_param, self._player_cache[sig_id])))
+ return self._player_cache[sig_id]
+ except Exception as e:
+ # Deliberately broad: failure here only warns, it does not raise
+ self._downloader.report_warning(
+ '[%s] %s (%s %s)' % (
+ self.IE_NAME,
+ 'Unable to decode n-parameter: download likely to be throttled',
+ error_to_compat_str(e),
+ traceback.format_exc()))
+
+ def _unthrottle_format_urls(self, video_id, player_url, formats):
+ """Replace the "n" query parameter of each format URL, in place.
+
+ Formats whose URL has no "n" parameter are left untouched, as are
+ formats for which _n_descramble() returns a falsy value.
+ """
+ for fmt in formats:
+ parsed_fmt_url = compat_urlparse.urlparse(fmt['url'])
+ qs = compat_urlparse.parse_qs(parsed_fmt_url.query)
+ n_param = qs.get('n')
+ if not n_param:
+ continue
+ # parse_qs gives a list per key; the last occurrence is used
+ n_param = n_param[-1]
+ n_response = self._n_descramble(n_param, player_url, video_id)
+ if n_response:
+ qs['n'] = [n_response]
+ fmt['url'] = compat_urlparse.urlunparse(
+ parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
def _mark_watched(self, video_id, player_response):
playback_url = url_or_none(try_get(
player_response,
@@ -1685,11 +1800,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not (sc and fmt_url and encrypted_sig):
continue
if not player_url:
- if not webpage:
- continue
- player_url = self._search_regex(
- r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
- webpage, 'player URL', fatal=False)
+ player_url = self._extract_player_url(webpage)
if not player_url:
continue
signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
@@ -1835,6 +1946,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
is_live = video_details.get('isLive')
owner_profile_url = microformat.get('ownerProfileUrl')
+ if not player_url:
+ player_url = self._extract_player_url(webpage)
+ self._unthrottle_format_urls(video_id, player_url, formats)
+
info = {
'id': video_id,
'title': self._live_title(video_title) if is_live else video_title,
@@ -2377,6 +2492,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
}, {
'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
'only_matching': True,
+ }, {
+ 'note': 'Search tab',
+ 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra',
+ 'playlist_mincount': 40,
+ 'info_dict': {
+ 'id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'title': '3Blue1Brown - Search - linear algebra',
+ 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ 'uploader': '3Blue1Brown',
+ 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
+ }
}]
@classmethod
@@ -2774,8 +2900,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
@staticmethod
def _extract_selected_tab(tabs):
for tab in tabs:
- if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
- return tab['tabRenderer']
+ renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {}
+ if renderer.get('selected') is True:
+ return renderer
else:
raise ExtractorError('Unable to find selected tab')
@@ -2832,6 +2959,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
title = channel_title or item_id
if tab_title:
title += ' - %s' % tab_title
+ if selected_tab.get('expandedText'):
+ title += ' - %s' % selected_tab['expandedText']
description = renderer.get('description')
playlist_id = renderer.get('externalId')
else:
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index 7bda59610..8eaa911cd 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -8,6 +8,16 @@ from .utils import (
ExtractorError,
remove_quotes,
)
+from .compat import (
+ compat_collections_abc,
+ compat_str,
+)
+MutableMapping = compat_collections_abc.MutableMapping
+
+
+# Empty attribute holder: instances get attributes assigned ad hoc
+# (e.g. `nl.member = ...` in interpret_expression).
+# NOTE(review): presumably a stand-in for `nonlocal`, letting nested
+# functions rebind values of the enclosing scope - confirm.
+class Nonlocal:
+ pass
+
_OPERATORS = [
('|', operator.or_),
@@ -22,10 +32,56 @@ _OPERATORS = [
('*', operator.mul),
]
_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS]
-_ASSIGN_OPERATORS.append(('=', lambda cur, right: right))
+_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right)))
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
+# Maps each opening bracket to its closing bracket:
+# {'(': ')', '{': '}', '[': ']'}
+_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]')))
+
+
+# Raised when a `break` expression is interpreted; the `for` and `switch`
+# handlers catch it. Uncaught, it carries the message 'Invalid break'.
+class JS_Break(ExtractorError):
+ def __init__(self):
+ ExtractorError.__init__(self, 'Invalid break')
+
+
+# Raised when a `continue` expression is interpreted; the `for` handler
+# catches it. Uncaught, it carries the message 'Invalid continue'.
+class JS_Continue(ExtractorError):
+ def __init__(self):
+ ExtractorError.__init__(self, 'Invalid continue')
+
+
+class LocalNameSpace(MutableMapping):
+    """Mapping view over a stack of scopes, searched in order.
+
+    Lookup returns the value from the first scope that contains the key.
+    Assignment updates the first scope that already contains the key and
+    otherwise creates the key in the first scope of the stack.
+    Deleting is not supported.
+    """
+
+    def __init__(self, *stack):
+        self.stack = tuple(stack)
+
+    def __getitem__(self, key):
+        for scope in self.stack:
+            if key in scope:
+                return scope[key]
+        raise KeyError(key)
+
+    def __setitem__(self, key, value):
+        for scope in self.stack:
+            if key in scope:
+                scope[key] = value
+                break
+        else:
+            # Not defined in any scope yet: create it in the first one
+            self.stack[0][key] = value
+        return value
+
+    def __delitem__(self, key):
+        raise NotImplementedError('Deleting is not supported')
+
+    def __iter__(self):
+        for scope in self.stack:
+            for scope_item in iter(scope):
+                yield scope_item
+
+    def __len__(self):
+        # __iter__ is a generator, which has no len(); count what it yields.
+        # The mapping protocol calls __len__ without arguments.
+        return sum(1 for _ in self)
+
+    def __repr__(self):
+        return 'LocalNameSpace%s' % (self.stack, )
+
class JSInterpreter(object):
def __init__(self, code, objects=None):
@@ -34,11 +90,56 @@ class JSInterpreter(object):
self.code = code
self._functions = {}
self._objects = objects
+ self.__named_object_counter = 0
+
+ def _named_object(self, namespace, obj):
+ """Store obj in namespace under a newly generated name and return the name.
+
+ The name embeds a per-interpreter counter that is incremented on every
+ call, so successive calls on one interpreter yield distinct names.
+ """
+ self.__named_object_counter += 1
+ name = '__youtube_dl_jsinterp_obj%s' % (self.__named_object_counter, )
+ namespace[name] = obj
+ return name
+
+    @staticmethod
+    def _separate(expr, delim=',', max_split=None):
+        """Yield the pieces of expr split at top-level occurrences of delim.
+
+        A delimiter is ignored while any of (), {} or [] is unbalanced at
+        that position. Nothing is yielded for an empty expr.
+
+        expr -- string to split
+        delim -- delimiter string, may be longer than one character
+        max_split -- stop after this many splits and yield the remainder as
+        the last piece; None or 0 means no limit
+        """
+        if not expr:
+            return
+        # Nesting depth per closing bracket. Built with dict() rather than a
+        # dict comprehension, which is a syntax error on Python 2.6.
+        counters = dict((k, 0) for k in _MATCHING_PARENS.values())
+        # pos is the index into delim matched so far; delim_len is the index
+        # of its last character
+        start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
+        for idx, char in enumerate(expr):
+            if char in _MATCHING_PARENS:
+                counters[_MATCHING_PARENS[char]] += 1
+            elif char in counters:
+                counters[char] -= 1
+            if char != delim[pos] or any(counters.values()):
+                pos = 0
+                continue
+            elif pos != delim_len:
+                pos += 1
+                continue
+            # Full delimiter matched at top level, ending at idx
+            yield expr[start: idx - delim_len]
+            start, pos = idx + 1, 0
+            splits += 1
+            if max_split and splits >= max_split:
+                break
+        yield expr[start:]
+
+ @staticmethod
+ def _separate_at_paren(expr, delim):
+ """Split expr once at the first top-level delim.
+
+ Returns (inner, outer): inner is the text before delim with its first
+ character dropped, outer is the text after delim; both are stripped.
+ Callers pass an expr that starts with the opening bracket and the
+ matching closing bracket as delim.
+ Raises ExtractorError if delim does not occur at top level.
+ """
+ separated = list(JSInterpreter._separate(expr, delim, 1))
+ if len(separated) < 2:
+ raise ExtractorError('No terminating paren {0} in {1}'.format(delim, expr))
+ return separated[0][1:].strip(), separated[1].strip()
def interpret_statement(self, stmt, local_vars, allow_recursion=100):
if allow_recursion < 0:
raise ExtractorError('Recursion limit reached')
+ sub_statements = list(self._separate(stmt, ';'))
+ stmt = (sub_statements or ['']).pop()
+ for sub_stmt in sub_statements:
+ ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+
should_abort = False
stmt = stmt.lstrip()
stmt_m = re.match(r'var\s', stmt)
@@ -61,25 +162,124 @@ class JSInterpreter(object):
if expr == '': # Empty expression
return None
+ if expr.startswith('{'):
+ inner, outer = self._separate_at_paren(expr, '}')
+ inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1)
+ if not outer or should_abort:
+ return inner
+ else:
+ expr = json.dumps(inner) + outer
+
if expr.startswith('('):
- parens_count = 0
- for m in re.finditer(r'[()]', expr):
- if m.group(0) == '(':
- parens_count += 1
+ inner, outer = self._separate_at_paren(expr, ')')
+ inner = self.interpret_expression(inner, local_vars, allow_recursion)
+ if not outer:
+ return inner
+ else:
+ expr = json.dumps(inner) + outer
+
+ if expr.startswith('['):
+ inner, outer = self._separate_at_paren(expr, ']')
+ name = self._named_object(local_vars, [
+ self.interpret_expression(item, local_vars, allow_recursion)
+ for item in self._separate(inner)])
+ expr = name + outer
+
+ m = re.match(r'try\s*', expr)
+ if m:
+ if expr[m.end()] == '{':
+ try_expr, expr = self._separate_at_paren(expr[m.end():], '}')
+ else:
+ try_expr, expr = expr[m.end() - 1:], ''
+ ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ m = re.match(r'(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr)
+ md = m.groupdict() if m else {}
+ if md.get('catch'):
+ # We ignore the catch block
+ _, expr = self._separate_at_paren(expr, '}')
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ elif md.get('for'):
+ def raise_constructor_error(c):
+ raise ExtractorError(
+ 'Premature return in the initialization of a for loop in {0!r}'.format(c))
+
+ constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
+ if remaining.startswith('{'):
+ body, expr = self._separate_at_paren(remaining, '}')
+ else:
+ m = re.match(r'switch\s*\(', remaining) # FIXME
+ if m:
+ switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')')
+ body, expr = self._separate_at_paren(remaining, '}')
+ body = 'switch(%s){%s}' % (switch_val, body)
else:
- parens_count -= 1
- if parens_count == 0:
- sub_expr = expr[1:m.start()]
- sub_result = self.interpret_expression(
- sub_expr, local_vars, allow_recursion)
- remaining_expr = expr[m.end():].strip()
- if not remaining_expr:
- return sub_result
- else:
- expr = json.dumps(sub_result) + remaining_expr
+ body, expr = remaining, ''
+ start, cndn, increment = self._separate(constructor, ';')
+ if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]:
+ raise_constructor_error(constructor)
+ while True:
+ if not self.interpret_expression(cndn, local_vars, allow_recursion):
+ break
+ try:
+ ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+ except JS_Break:
+ break
+ except JS_Continue:
+ pass
+ if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]:
+ raise_constructor_error(constructor)
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ elif md.get('switch'):
+ switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
+ switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion)
+ body, expr = self._separate_at_paren(remaining, '}')
+ items = body.replace('default:', 'case default:').split('case ')[1:]
+ for default in (False, True):
+ matched = False
+ for item in items:
+ case, stmt = [i.strip() for i in self._separate(item, ':', 1)]
+ if default:
+ matched = matched or case == 'default'
+ elif not matched:
+ matched = (case != 'default'
+ and switch_val == self.interpret_expression(case, local_vars, allow_recursion))
+ if not matched:
+ continue
+ try:
+ ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1)
+ if should_abort:
+ return ret
+ except JS_Break:
break
- else:
- raise ExtractorError('Premature end of parens in %r' % expr)
+ if matched:
+ break
+ return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]
+
+ # Comma separated statements
+ sub_expressions = list(self._separate(expr))
+ expr = sub_expressions.pop().strip() if sub_expressions else ''
+ for sub_expr in sub_expressions:
+ self.interpret_expression(sub_expr, local_vars, allow_recursion)
+
+ for m in re.finditer(r'''(?x)
+ (?P<pre_sign>\+\+|--)(?P<var1>%(_NAME_RE)s)|
+ (?P<var2>%(_NAME_RE)s)(?P<post_sign>\+\+|--)''' % globals(), expr):
+ var = m.group('var1') or m.group('var2')
+ start, end = m.span()
+ sign = m.group('pre_sign') or m.group('post_sign')
+ ret = local_vars[var]
+ local_vars[var] += 1 if sign[0] == '+' else -1
+ if m.group('pre_sign'):
+ ret = local_vars[var]
+ expr = expr[:start] + json.dumps(ret) + expr[end:]
for op, opfunc in _ASSIGN_OPERATORS:
m = re.match(r'''(?x)
@@ -88,14 +288,13 @@ class JSInterpreter(object):
(?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
if not m:
continue
- right_val = self.interpret_expression(
- m.group('expr'), local_vars, allow_recursion - 1)
+ right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion)
if m.groupdict().get('index'):
lvar = local_vars[m.group('out')]
- idx = self.interpret_expression(
- m.group('index'), local_vars, allow_recursion)
- assert isinstance(idx, int)
+ idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
+ if not isinstance(idx, int):
+ raise ExtractorError('List indices must be integers: %s' % (idx, ))
cur = lvar[idx]
val = opfunc(cur, right_val)
lvar[idx] = val
@@ -109,8 +308,13 @@ class JSInterpreter(object):
if expr.isdigit():
return int(expr)
+ if expr == 'break':
+ raise JS_Break()
+ elif expr == 'continue':
+ raise JS_Continue()
+
var_m = re.match(
- r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE,
+ r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE,
expr)
if var_m:
return local_vars[var_m.group('name')]
@@ -124,91 +328,161 @@ class JSInterpreter(object):
r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
if m:
val = local_vars[m.group('in')]
- idx = self.interpret_expression(
- m.group('idx'), local_vars, allow_recursion - 1)
+ idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion)
return val[idx]
+ def raise_expr_error(where, op, exp):
+ raise ExtractorError('Premature {0} return of {1} in {2!r}'.format(where, op, exp))
+
+ for op, opfunc in _OPERATORS:
+ separated = list(self._separate(expr, op))
+ if len(separated) < 2:
+ continue
+ right_val = separated.pop()
+ left_val = op.join(separated)
+ left_val, should_abort = self.interpret_statement(
+ left_val, local_vars, allow_recursion - 1)
+ if should_abort:
+ raise_expr_error('left-side', op, expr)
+ right_val, should_abort = self.interpret_statement(
+ right_val, local_vars, allow_recursion - 1)
+ if should_abort:
+ raise_expr_error('right-side', op, expr)
+ return opfunc(left_val or 0, right_val)
+
m = re.match(
- r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
+ r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE,
expr)
if m:
variable = m.group('var')
- member = remove_quotes(m.group('member') or m.group('member2'))
- arg_str = m.group('args')
+ nl = Nonlocal()
- if variable in local_vars:
- obj = local_vars[variable]
- else:
- if variable not in self._objects:
- self._objects[variable] = self.extract_object(variable)
- obj = self._objects[variable]
-
- if arg_str is None:
- # Member access
- if member == 'length':
- return len(obj)
- return obj[member]
-
- assert expr.endswith(')')
- # Function call
- if arg_str == '':
- argvals = tuple()
+ nl.member = remove_quotes(m.group('member') or m.group('member2'))
+ arg_str = expr[m.end():]
+ if arg_str.startswith('('):
+ arg_str, remaining = self._separate_at_paren(arg_str, ')')
else:
- argvals = tuple([
+ arg_str, remaining = None, arg_str
+
+ def assertion(cndn, msg):
+ """ assert, but without risk of getting optimized out """
+ if not cndn:
+ raise ExtractorError('{0} {1}: {2}'.format(nl.member, msg, expr))
+
+ def eval_method():
+ # nonlocal member
+ member = nl.member
+ if variable == 'String':
+ obj = compat_str
+ elif variable in local_vars:
+ obj = local_vars[variable]
+ else:
+ if variable not in self._objects:
+ self._objects[variable] = self.extract_object(variable)
+ obj = self._objects[variable]
+
+ if arg_str is None:
+ # Member access
+ if member == 'length':
+ return len(obj)
+ return obj[member]
+
+ # Function call
+ argvals = [
self.interpret_expression(v, local_vars, allow_recursion)
- for v in arg_str.split(',')])
-
- if member == 'split':
- assert argvals == ('',)
- return list(obj)
- if member == 'join':
- assert len(argvals) == 1
- return argvals[0].join(obj)
- if member == 'reverse':
- assert len(argvals) == 0
- obj.reverse()
- return obj
- if member == 'slice':
- assert len(argvals) == 1
- return obj[argvals[0]:]
- if member == 'splice':
- assert isinstance(obj, list)
- index, howMany = argvals
- res = []
- for i in range(index, min(index + howMany, len(obj))):
- res.append(obj.pop(index))
- return res
-
- return obj[member](argvals)
-
- for op, opfunc in _OPERATORS:
- m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
- if not m:
- continue
- x, abort = self.interpret_statement(
- m.group('x'), local_vars, allow_recursion - 1)
- if abort:
- raise ExtractorError(
- 'Premature left-side return of %s in %r' % (op, expr))
- y, abort = self.interpret_statement(
- m.group('y'), local_vars, allow_recursion - 1)
- if abort:
- raise ExtractorError(
- 'Premature right-side return of %s in %r' % (op, expr))
- return opfunc(x, y)
+ for v in self._separate(arg_str)]
+
+ if obj == compat_str:
+ if member == 'fromCharCode':
+ assertion(argvals, 'takes one or more arguments')
+ return ''.join(map(chr, argvals))
+ raise ExtractorError('Unsupported string method %s' % (member, ))
+
+ if member == 'split':
+ assertion(argvals, 'takes one or more arguments')
+ assertion(argvals == [''], 'with arguments is not implemented')
+ return list(obj)
+ elif member == 'join':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(len(argvals) == 1, 'takes exactly one argument')
+ return argvals[0].join(obj)
+ elif member == 'reverse':
+ assertion(not argvals, 'does not take any arguments')
+ obj.reverse()
+ return obj
+ elif member == 'slice':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(len(argvals) == 1, 'takes exactly one argument')
+ return obj[argvals[0]:]
+ elif member == 'splice':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(argvals, 'takes one or more arguments')
+ index, howMany = map(int, (argvals + [len(obj)])[:2])
+ if index < 0:
+ index += len(obj)
+ add_items = argvals[2:]
+ res = []
+ for i in range(index, min(index + howMany, len(obj))):
+ res.append(obj.pop(index))
+ for i, item in enumerate(add_items):
+ obj.insert(index + i, item)
+ return res
+ elif member == 'unshift':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(argvals, 'takes one or more arguments')
+ for item in reversed(argvals):
+ obj.insert(0, item)
+ return obj
+ elif member == 'pop':
+ assertion(isinstance(obj, list), 'must be applied on a list')
+ assertion(not argvals, 'does not take any arguments')
+ if not obj:
+ return
+ return obj.pop()
+ elif member == 'push':
+ assertion(argvals, 'takes one or more arguments')
+ obj.extend(argvals)
+ return obj
+ elif member == 'forEach':
+ assertion(argvals, 'takes one or more arguments')
+ assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
+ f, this = (argvals + [''])[:2]
+ return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)]
+ elif member == 'indexOf':
+ assertion(argvals, 'takes one or more arguments')
+ assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
+ idx, start = (argvals + [0])[:2]
+ try:
+ return obj.index(idx, start)
+ except ValueError:
+ return -1
+
+ if isinstance(obj, list):
+ member = int(member)
+ nl.member = member
+ return obj[member](argvals)
+
+ if remaining:
+ return self.interpret_expression(
+ self._named_object(local_vars, eval_method()) + remaining,
+ local_vars, allow_recursion)
+ else:
+ return eval_method()
- m = re.match(
- r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
+ m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
if m:
fname = m.group('func')
argvals = tuple([
int(v) if v.isdigit() else local_vars[v]
- for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple()
- if fname not in self._functions:
+ for v in self._separate(m.group('args'))])
+ if fname in local_vars:
+ return local_vars[fname](argvals)
+ elif fname not in self._functions:
self._functions[fname] = self.extract_function(fname)
return self._functions[fname](argvals)
- raise ExtractorError('Unsupported JS expression %r' % expr)
+ if expr:
+ raise ExtractorError('Unsupported JS expression %r' % expr)
def extract_object(self, objname):
_FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
@@ -233,30 +507,52 @@ class JSInterpreter(object):
return obj
- def extract_function(self, funcname):
+ def extract_function_code(self, funcname):
+ """ @returns argnames, code """
func_m = re.search(
r'''(?x)
- (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
+ (?:function\s+%(f_n)s|[{;,]\s*%(f_n)s\s*=\s*function|var\s+%(f_n)s\s*=\s*function)\s*
\((?P<args>[^)]*)\)\s*
- \{(?P<code>[^}]+)\}''' % (
- re.escape(funcname), re.escape(funcname), re.escape(funcname)),
+ (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % {'f_n': re.escape(funcname), },
self.code)
+ code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match
if func_m is None:
raise ExtractorError('Could not find JS function %r' % funcname)
- argnames = func_m.group('args').split(',')
+ return func_m.group('args').split(','), code
- return self.build_function(argnames, func_m.group('code'))
+ def extract_function(self, funcname):
+ return self.extract_function_from_code(*self.extract_function_code(funcname))
+
+ def extract_function_from_code(self, argnames, code, *global_stack):
+ local_vars = {}
+ while True:
+ mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code)
+ if mobj is None:
+ break
+ start, body_start = mobj.span()
+ body, remaining = self._separate_at_paren(code[body_start - 1:], '}')
+ name = self._named_object(
+ local_vars,
+ self.extract_function_from_code(
+ [x.strip() for x in mobj.group('args').split(',')],
+ body, local_vars, *global_stack))
+ code = code[:start] + name + remaining
+ return self.build_function(argnames, code, local_vars, *global_stack)
def call_function(self, funcname, *args):
- f = self.extract_function(funcname)
- return f(args)
-
- def build_function(self, argnames, code):
- def resf(args):
- local_vars = dict(zip(argnames, args))
- for stmt in code.split(';'):
- res, abort = self.interpret_statement(stmt, local_vars)
- if abort:
+ return self.extract_function(funcname)(args)
+
+ def build_function(self, argnames, code, *global_stack):
+ global_stack = list(global_stack) or [{}]
+ local_vars = global_stack.pop(0)
+
+ def resf(args, **kwargs):
+ local_vars.update(dict(zip(argnames, args)))
+ local_vars.update(kwargs)
+ var_stack = LocalNameSpace(local_vars, *global_stack)
+ for stmt in self._separate(code.replace('\n', ''), ';'):
+ ret, should_abort = self.interpret_statement(stmt, var_stack)
+ if should_abort:
break
- return res
+ return ret
return resf