aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Yen Chi Hsuan <yan12125@gmail.com> 2016-08-19 23:53:47 +0800
committer Yen Chi Hsuan <yan12125@gmail.com> 2016-08-19 23:53:47 +0800
commit 520251c093f5e0fe6af5e57203a0452aef0682ac (patch)
tree bfbb5c4e03bdcd23a0ebe60a7a06ef99b0324f11
parent 55af45fcab4295a92d56180cdbebe7b47e094bc3 (diff)
[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags
-rw-r--r-- ChangeLog 1
-rw-r--r-- youtube_dl/extractor/common.py 36
2 files changed, 26 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog
index 6281fe325..450351231 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,7 @@
version <unreleased>
Core
+* Support m3u8 manifests in HTML5 multimedia tags
* Fix js_to_json(): correct octal or hexadecimal number detection
Extractors
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 9427ff449..07d58afe7 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1695,7 +1695,7 @@ class InfoExtractor(object):
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
- def _parse_html5_media_entries(self, base_url, webpage):
+ def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None):
def absolute_url(video_url):
return compat_urlparse.urljoin(base_url, video_url)
@@ -1710,6 +1710,21 @@ class InfoExtractor(object):
return f
return {}
+ def _media_formats(src, cur_media_type):
+ full_url = absolute_url(src)
+ if determine_ext(full_url) == 'm3u8':
+ is_plain_url = False
+ formats = self._extract_m3u8_formats(
+ full_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id=m3u8_id)
+ else:
+ is_plain_url = True
+ formats = [{
+ 'url': full_url,
+ 'vcodec': 'none' if cur_media_type == 'audio' else None,
+ }]
+ return is_plain_url, formats
+
entries = []
for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
media_info = {
@@ -1719,10 +1734,8 @@ class InfoExtractor(object):
media_attributes = extract_attributes(media_tag)
src = media_attributes.get('src')
if src:
- media_info['formats'].append({
- 'url': absolute_url(src),
- 'vcodec': 'none' if media_type == 'audio' else None,
- })
+ _, formats = _media_formats(src, media_type)
+ media_info['formats'].extend(formats)
media_info['thumbnail'] = media_attributes.get('poster')
if media_content:
for source_tag in re.findall(r'<source[^>]+>', media_content):
@@ -1730,12 +1743,13 @@ class InfoExtractor(object):
src = source_attributes.get('src')
if not src:
continue
- f = parse_content_type(source_attributes.get('type'))
- f.update({
- 'url': absolute_url(src),
- 'vcodec': 'none' if media_type == 'audio' else None,
- })
- media_info['formats'].append(f)
+ is_plain_url, formats = _media_formats(src, media_type)
+ if is_plain_url:
+ f = parse_content_type(source_attributes.get('type'))
+ f.update(formats[0])
+ media_info['formats'].append(f)
+ else:
+ media_info['formats'].extend(formats)
for track_tag in re.findall(r'<track[^>]+>', media_content):
track_attributes = extract_attributes(track_tag)
kind = track_attributes.get('kind')