aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r-- youtube_dl/extractor/common.py 51
1 files changed, 48 insertions, 3 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index dbdf456f5..549781186 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import base64
import datetime
+import functools
import hashlib
import json
import netrc
@@ -23,6 +24,7 @@ from ..compat import (
compat_getpass,
compat_integer_types,
compat_http_client,
+ compat_map as map,
compat_os_name,
compat_str,
compat_urllib_error,
@@ -31,6 +33,7 @@ from ..compat import (
compat_urllib_request,
compat_urlparse,
compat_xml_parse_error,
+ compat_zip as zip,
)
from ..downloader.f4m import (
get_base_url,
@@ -70,6 +73,7 @@ from ..utils import (
str_or_none,
str_to_int,
strip_or_none,
+ traverse_obj,
try_get,
unescapeHTML,
unified_strdate,
@@ -1349,6 +1353,44 @@ class InfoExtractor(object):
break
return dict((k, v) for k, v in info.items() if v is not None)
+ def _search_nextjs_data(self, webpage, video_id, **kw):
+ """Find the `__NEXT_DATA__` script element in webpage and parse its content as JSON.
+
+ Keyword arguments: `transform_source` and `fatal` are forwarded to
+ `_parse_json`; every keyword except `transform_source` (so including
+ `fatal`) is forwarded to `_search_regex`.
+
+ Returns `{}` when the search yields a falsy value, otherwise whatever
+ `_parse_json` returns for the script content.
+ """
+ # Keep only the options that are passed on to the JSON parsing step
+ nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
+ # transform_source is not passed to _search_regex, so drop it from the search kwargs
+ kw.pop('transform_source', None)
+ # Match <script ... id="__NEXT_DATA__" ...>...</script> (either quote style)
+ # and capture the element text as group 'nd'
+ next_data = self._search_regex(
+ r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
+ webpage, 'next.js data', group='nd', **kw)
+ # Nothing usable found (e.g. non-fatal search, or a falsy default was supplied)
+ if not next_data:
+ return {}
+ return self._parse_json(next_data, video_id, **nkw)
+
+ def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
+ """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function
+
+ Optional arguments (see the emulated signature below):
+ context_name -- name of the JS context to look for (default '__NUXT__');
+ may be given as the first extra positional argument
+ or by keyword
+ fatal -- default True; when false, a failed search returns {}
+ traverse -- path applied to the parsed object with traverse_obj
+ (default ('data', 0))
+
+ Returns the traversed value, or {} if it is falsy or nothing was found.
+ """
+
+ # Emulated signature (presumably spelled out via *args/**kwargs because
+ # keyword-only parameter syntax is not usable in this file -- confirm):
+ # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)
+ context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
+ fatal = kwargs.get('fatal', True)
+ traverse = kwargs.get('traverse', ('data', 0))
+
+ # Escape the name so that it is matched literally inside the patterns below
+ re_ctx = re.escape(context_name)
+
+ # Matches `(function(<arg_keys>){ return {<js>} }(<arg_vals>)`, capturing the
+ # parameter list, the returned object literal and the call arguments
+ FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
+ r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')
+
+ # Two candidate patterns are built from the escaped name and FUNCTION_RE:
+ # a `<script>window.<name> = (function ...)(...));</script>` assignment, and
+ # a looser `<name>( ... (function ...)(...)` form.
+ # When not fatal, the default is a triple of None so that the unpacking still works
+ js, arg_keys, arg_vals = self._search_regex(
+ (p.format(re_ctx, FUNCTION_RE) for p in
+ (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
+ r'{0}\s*\([\s\S]*?{1}')),
+ webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
+ default=NO_DEFAULT if fatal else (None, None, None))
+ if js is None:
+ return {}
+
+ # Map each parameter name to the JSON-encoded value of the corresponding call
+ # argument: the argument list is wrapped in [] and parsed as one JSON array
+ # (after js_to_json); `or ()` covers a falsy parse result.
+ # NOTE: this rebinds `args`; the varargs tuple is not used after this point.
+ # NOTE(review): names come from a plain split(','), so any whitespace around
+ # a parameter name stays part of the key -- confirm this is acceptable
+ args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+ '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ())))
+
+ # Parse the returned object literal, handing the name -> value mapping to
+ # js_to_json as `vars` (presumably so that parameter references are replaced
+ # by their values -- confirm against js_to_json)
+ ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
+ return traverse_obj(ret, traverse) or {}
+
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -2496,7 +2538,8 @@ class InfoExtractor(object):
return f
return {}
- def _media_formats(src, cur_media_type, type_info={}):
+ def _media_formats(src, cur_media_type, type_info=None):
+ type_info = type_info or {}
full_url = absolute_url(src)
ext = type_info.get('ext') or determine_ext(full_url)
if ext == 'm3u8':
@@ -2514,6 +2557,7 @@ class InfoExtractor(object):
formats = [{
'url': full_url,
'vcodec': 'none' if cur_media_type == 'audio' else None,
+ 'ext': ext,
}]
return is_plain_url, formats
@@ -2522,7 +2566,7 @@ class InfoExtractor(object):
# so we will include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
- _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
+ _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
media_tags = [(media_tag, media_tag_name, media_type, '')
for media_tag, media_tag_name, media_type
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
@@ -2540,7 +2584,8 @@ class InfoExtractor(object):
media_attributes = extract_attributes(media_tag)
src = strip_or_none(media_attributes.get('src'))
if src:
- _, formats = _media_formats(src, media_type)
+ f = parse_content_type(media_attributes.get('type'))
+ _, formats = _media_formats(src, media_type, f)
media_info['formats'].extend(formats)
media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content: