diff options
author | Yen Chi Hsuan <yan12125@gmail.com> | 2016-07-10 23:40:45 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-07-10 23:40:45 +0800 |
commit | 0b68de3cc1f99ce8c49a497245c02d4d03201aa8 (patch) | |
tree | 885385b3b4968715eb2b7d51b4f66f3a12da7f46 /youtube_dl | |
parent | 39e9d524e5fe289936160d4c599a77f10f6e9061 (diff) | |
parent | 59bbe4911acd4493bf407925bfdeb1ad03db6ef3 (diff) |
Merge pull request #8876 from remitamine/html5_media
[extractor/common] add helper method to extract html5 media entries
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/extractor/common.py | 58 | ||||
-rw-r--r-- | youtube_dl/utils.py | 36 |
2 files changed, 94 insertions, 0 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 816baa424..df546da27 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -55,6 +55,8 @@ from ..utils import ( update_Request, update_url_query, parse_m3u8_attributes, + extract_attributes, + parse_codecs, ) @@ -1635,6 +1637,62 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _parse_html5_media_entries(self, base_url, webpage): + def absolute_url(video_url): + return compat_urlparse.urljoin(base_url, video_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + entries = [] + for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = media_attributes.get('src') + if src: + media_info['formats'].append({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['thumbnail'] = media_attributes.get('poster') + if media_content: + for source_tag in re.findall(r'<source[^>]+>', media_content): + source_attributes = extract_attributes(source_tag) + src = source_attributes.get('src') + if not src: + continue + f = parse_content_type(source_attributes.get('type')) + f.update({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['formats'].append(f) + for track_tag in re.findall(r'<track[^>]+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind == 'subtitles': + src = track_attributes.get('src') + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + if media_info['formats']: + entries.append(media_info) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3498697b6..4c1d0d526 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2126,6 +2126,42 @@ def mimetype2ext(mt): }.get(res, res) +def parse_codecs(codecs_str): + # http://tools.ietf.org/html/rfc6381 + if not codecs_str: + return {} + splited_codecs = list(filter(None, map( + lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) + vcodec, acodec = None, None + for full_codec in splited_codecs: + codec = full_codec.split('.')[0] + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): + if not vcodec: + vcodec = full_codec + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): + if not acodec: + acodec = full_codec + else: + write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr) + if not vcodec and not acodec: + if len(splited_codecs) == 2: + return { + 'vcodec': vcodec, + 'acodec': acodec, + } + elif len(splited_codecs) == 1: + return { + 'vcodec': 'none', + 'acodec': vcodec, + } + else: + return { + 'vcodec': vcodec or 'none', + 'acodec': acodec or 'none', + } + return {} + + def urlhandle_detect_ext(url_handle): getheader = url_handle.headers.get |