aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
authorSergey M․ <dstftw@gmail.com>2019-03-17 09:09:32 +0700
committerSergey M․ <dstftw@gmail.com>2019-03-17 09:09:32 +0700
commitd493f15c1158abd817e191ff830fd5481b1ed42d (patch)
tree59b5f0b02261849fd0add8c9d13d877ad0355cf7 /youtube_dl/extractor
parent0dc41787af0fb011f01d88c1f2ecb686a8416df5 (diff)
[extractor/common] Improve HTML5 entries extraction and add some realworld tests
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/common.py43
1 files changed, 36 insertions, 7 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index c291bc1df..0889288f0 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,6 +44,7 @@ from ..utils import (
compiled_regex_type,
determine_ext,
determine_protocol,
+ dict_get,
error_to_compat_str,
ExtractorError,
extract_attributes,
@@ -56,13 +57,16 @@ from ..utils import (
JSON_LD_RE,
mimetype2ext,
orderedSet,
+ parse_bitrate,
parse_codecs,
parse_duration,
parse_iso8601,
parse_m3u8_attributes,
+ parse_resolution,
RegexNotFoundError,
sanitized_Request,
sanitize_filename,
+ str_or_none,
unescapeHTML,
unified_strdate,
unified_timestamp,
@@ -2481,18 +2485,43 @@ class InfoExtractor(object):
media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content:
for source_tag in re.findall(r'<source[^>]+>', media_content):
- source_attributes = extract_attributes(source_tag)
- src = source_attributes.get('src')
+ s_attr = extract_attributes(source_tag)
+ # data-video-src and data-src are non standard but seen
+ # several times in the wild
+ src = dict_get(s_attr, ('src', 'data-video-src', 'data-src'))
if not src:
continue
- f = parse_content_type(source_attributes.get('type'))
+ f = parse_content_type(s_attr.get('type'))
is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
- # res attribute is not standard but seen several times
- # in the wild
+ # width, height, res, label and title attributes are
+ # all not standard but seen several times in the wild
+ labels = [
+ s_attr.get(lbl)
+ for lbl in ('label', 'title')
+ if str_or_none(s_attr.get(lbl))
+ ]
+ width = int_or_none(s_attr.get('width'))
+ height = (int_or_none(s_attr.get('height')) or
+ int_or_none(s_attr.get('res')))
+ if not width or not height:
+ for lbl in labels:
+ resolution = parse_resolution(lbl)
+ if not resolution:
+ continue
+ width = width or resolution.get('width')
+ height = height or resolution.get('height')
+ for lbl in labels:
+ tbr = parse_bitrate(lbl)
+ if tbr:
+ break
+ else:
+ tbr = None
f.update({
- 'height': int_or_none(source_attributes.get('res')),
- 'format_id': source_attributes.get('label'),
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'format_id': s_attr.get('label') or s_attr.get('title'),
})
f.update(formats[0])
media_info['formats'].append(f)