about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- test/test_utils.py 24
-rw-r--r-- youtube_dl/extractor/common.py 58
-rw-r--r-- youtube_dl/utils.py 36
3 files changed, 118 insertions, 0 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index afd273a65..2273b5a10 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -81,6 +81,7 @@ from youtube_dl.utils import (
cli_option,
cli_valueless_option,
cli_bool_option,
+ parse_codecs,
)
from youtube_dl.compat import (
compat_chr,
@@ -608,6 +609,29 @@ class TestUtil(unittest.TestCase):
limit_length('foo bar baz asd', 12).startswith('foo bar'))
self.assertTrue('...' in limit_length('foo bar baz asd', 12))
def test_parse_codecs(self):
    # Each pair is (codecs string, expected parse_codecs() result); the
    # pairs are checked in order and the first mismatch fails the test.
    expectations = (
        # Empty input yields no codec information at all.
        ('', {}),
        # Video and audio codec, separated by a comma and a space.
        ('avc1.77.30, mp4a.40.2', {
            'vcodec': 'avc1.77.30',
            'acodec': 'mp4a.40.2',
        }),
        # Audio codec only: the video codec is reported as 'none'.
        ('mp4a.40.2', {
            'vcodec': 'none',
            'acodec': 'mp4a.40.2',
        }),
        # Audio listed before video: order must not matter.
        ('mp4a.40.5,avc1.42001e', {
            'vcodec': 'avc1.42001e',
            'acodec': 'mp4a.40.5',
        }),
        # Video codec only: the audio codec is reported as 'none'.
        ('avc3.640028', {
            'vcodec': 'avc3.640028',
            'acodec': 'none',
        }),
        # Stray commas, blanks and an unrecognised codec are tolerated.
        (', h264,,newcodec,aac', {
            'vcodec': 'h264',
            'acodec': 'aac',
        }),
    )
    for codecs_str, expected in expectations:
        self.assertEqual(parse_codecs(codecs_str), expected)
+
def test_escape_rfc3986(self):
reserved = "!*'();:@&=+$,/?#[]"
unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~'
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 816baa424..df546da27 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -55,6 +55,8 @@ from ..utils import (
update_Request,
update_url_query,
parse_m3u8_attributes,
+ extract_attributes,
+ parse_codecs,
)
@@ -1635,6 +1637,62 @@ class InfoExtractor(object):
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
def _parse_html5_media_entries(self, base_url, webpage):
    """ Collect media entries from HTML5 <video> and <audio> elements

    base_url is the URL against which relative URLs found in the markup
    are resolved; webpage is the HTML text to scan.

    Returns a list with one dict per media element that produced at least
    one format.  Each dict has the keys 'formats' (list of format dicts),
    'subtitles' (dict mapping a language to a list of {'url': ...} dicts)
    and 'thumbnail' (absolute poster URL, or None when there is no poster).
    """
    def absolute_url(media_url):
        return compat_urlparse.urljoin(base_url, media_url)

    def parse_content_type(content_type):
        # Turn a <source type="mime/type; codecs=..."> value into a partial
        # format dict ('ext' plus whatever parse_codecs() reports).
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if not ctr:
            return {}
        # The codecs group is None when the parameter is absent;
        # parse_codecs() is handed that None as-is.
        f = parse_codecs(ctr.group('codecs'))
        f['ext'] = mimetype2ext(ctr.group('mimetype'))
        return f

    entries = []
    for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
        is_audio = media_type == 'audio'
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)

        # A src attribute directly on the media element is a format itself.
        src = media_attributes.get('src')
        if src:
            media_info['formats'].append({
                'url': absolute_url(src),
                'vcodec': 'none' if is_audio else None,
            })

        # Resolve the poster like every other URL so that a relative
        # poster path still yields a usable thumbnail.
        poster = media_attributes.get('poster')
        media_info['thumbnail'] = absolute_url(poster) if poster else None

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                src = source_attributes.get('src')
                if not src:
                    continue
                f = parse_content_type(source_attributes.get('type'))
                f['url'] = absolute_url(src)
                # An <audio> element never carries video.  For <video>,
                # keep the codec parsed from the type attribute instead of
                # overwriting it with None (None when nothing was parsed).
                f['vcodec'] = 'none' if is_audio else f.get('vcodec')
                media_info['formats'].append(f)

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Only tracks without a kind, or of kind "subtitles", are used.
                if kind and kind != 'subtitles':
                    continue
                src = track_attributes.get('src')
                if not src:
                    continue
                # NOTE(review): lang is None when the track has none of
                # srclang, lang or label, so None can become a subtitles key.
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                media_info['subtitles'].setdefault(lang, []).append({
                    'url': absolute_url(src),
                })

        # Media elements without any usable source are dropped.
        if media_info['formats']:
            entries.append(media_info)
    return entries
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 3498697b6..4c1d0d526 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2126,6 +2126,42 @@ def mimetype2ext(mt):
}.get(res, res)
def parse_codecs(codecs_str):
    """ Split a "codecs" parameter value into video and audio codecs

    codecs_str is a comma separated codec list as described in
    http://tools.ietf.org/html/rfc6381 (e.g. 'avc1.77.30, mp4a.40.2').
    Surrounding whitespace and empty items are ignored.

    Returns a dict:
      * {} for an empty/None input, or when nothing is recognised and the
        list does not consist of exactly two codecs;
      * {'vcodec': ..., 'acodec': ...} otherwise.  When at least one codec
        is recognised, the missing kind is reported as the string 'none'.
        When none is recognised but exactly two codecs are listed, they
        are assumed to be video and audio, in that order.

    A warning is written to stderr for every unrecognised codec.
    """
    if not codecs_str:
        return {}
    split_codecs = [item.strip() for item in codecs_str.split(',')]
    split_codecs = [item for item in split_codecs if item]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # Only the part before the first dot identifies the codec family;
        # the rest is profile/level information kept in the result as-is.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            # First video codec wins
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            # First audio codec wins
            if not acodec:
                acodec = full_codec
        else:
            # NOTE(review): the message has no trailing newline - confirm
            # whether write_string() appends one.
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Nothing was recognised: fall back to the parsed strings themselves
        # (vcodec and acodec are both None here, so returning them would
        # throw the information away).
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    # A single unrecognised codec cannot be classified as audio or video,
    # so make no claim at all instead of reporting 'vcodec': 'none'.
    return {}
+
+
def urlhandle_detect_ext(url_handle):
getheader = url_handle.headers.get