aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
authorTithen-Firion <Tithen-Firion@users.noreply.github.com>2017-05-04 11:00:06 +0200
committerGitHub <noreply@github.com>2017-05-04 11:00:06 +0200
commitc89267d31ad99eb5b1a87cd354de5280a2a087b1 (patch)
tree8bb3b01cd088d0646089344bddd3d4ff272c0065 /youtube_dl/extractor/common.py
parent7552f96352f35cd877e52fd0770b77ba1856fc62 (diff)
parent0c265486016b06342fb257966474ce591667aaff (diff)
downloadyoutube-dl-c89267d31ad99eb5b1a87cd354de5280a2a087b1.tar.xz
Merge branch 'master' into openload-phantomjs-method
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--youtube_dl/extractor/common.py213
1 files changed, 138 insertions, 75 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e54adc9f0..76b5378e9 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -245,6 +245,10 @@ class InfoExtractor(object):
specified in the URL.
end_time: Time in seconds where the reproduction should end, as
specified in the URL.
+ chapters: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the chapter in seconds
+ * "end_time" - The end time of the chapter in seconds
+ * "title" (optional, string)
The following fields should only be used when the video belongs to some logical
chapter or section:
@@ -976,6 +980,23 @@ class InfoExtractor(object):
return info
if isinstance(json_ld, dict):
json_ld = [json_ld]
+
+ def extract_video_object(e):
+ assert e['@type'] == 'VideoObject'
+ info.update({
+ 'url': e.get('contentUrl'),
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('uploadDate')),
+ 'filesize': float_or_none(e.get('contentSize')),
+ 'tbr': int_or_none(e.get('bitrate')),
+ 'width': int_or_none(e.get('width')),
+ 'height': int_or_none(e.get('height')),
+ 'view_count': int_or_none(e.get('interactionCount')),
+ })
+
for e in json_ld:
if e.get('@context') == 'http://schema.org':
item_type = e.get('@type')
@@ -1000,18 +1021,11 @@ class InfoExtractor(object):
'description': unescapeHTML(e.get('articleBody')),
})
elif item_type == 'VideoObject':
- info.update({
- 'url': e.get('contentUrl'),
- 'title': unescapeHTML(e.get('name')),
- 'description': unescapeHTML(e.get('description')),
- 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
- 'duration': parse_duration(e.get('duration')),
- 'timestamp': unified_timestamp(e.get('uploadDate')),
- 'filesize': float_or_none(e.get('contentSize')),
- 'tbr': int_or_none(e.get('bitrate')),
- 'width': int_or_none(e.get('width')),
- 'height': int_or_none(e.get('height')),
- })
+ extract_video_object(e)
+ elif item_type == 'WebPage':
+ video = e.get('video')
+ if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+ extract_video_object(video)
break
return dict((k, v) for k, v in info.items() if v is not None)
@@ -1303,40 +1317,50 @@ class InfoExtractor(object):
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False):
-
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)
+
if res is False:
return []
+
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
+ return self._parse_m3u8_formats(
+ m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+ preference=preference, m3u8_id=m3u8_id, live=live)
+
+ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
- formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+ formats = []
format_url = lambda u: (
u
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- # We should try extracting formats only from master playlists [1], i.e.
- # playlists that describe available qualities. On the other hand media
- # playlists [2] should be returned as is since they contain just the media
- # without qualities renditions.
+ # References:
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+ # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+ # We should try extracting formats only from master playlists [1, 4.3.4],
+ # i.e. playlists that describe available qualities. On the other hand
+ # media playlists [1, 4.3.3] should be returned as is since they contain
+ # just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
- # playlist based on particular tags availability. As of [1, 2] master
- # playlist tags MUST NOT appear in a media playist and vice versa.
- # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
- # and MUST NOT appear in master playlist thus we can clearly detect media
- # playlist with this criterion.
- # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
- # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+ # master playlist tags MUST NOT appear in a media playist and vice versa.
+ # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+ # media playlist and MUST NOT appear in master playlist thus we can
+ # clearly detect media playlist with this criterion.
+
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
@@ -1345,52 +1369,72 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}]
- audio_in_video_stream = {}
- last_info = {}
- last_media = {}
+
+ groups = {}
+ last_stream_inf = {}
+
+ def extract_media(x_media_line):
+ media = parse_m3u8_attributes(x_media_line)
+ # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+ media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+ if not (media_type and group_id and name):
+ return
+ groups.setdefault(group_id, []).append(media)
+ if media_type not in ('VIDEO', 'AUDIO'):
+ return
+ media_url = media.get('URI')
+ if media_url:
+ format_id = []
+ for v in (group_id, name):
+ if v:
+ format_id.append(v)
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': format_url(media_url),
+ 'manifest_url': m3u8_url,
+ 'language': media.get('LANGUAGE'),
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ if media_type == 'AUDIO':
+ f['vcodec'] = 'none'
+ formats.append(f)
+
+ def build_stream_name():
+ # Despite specification does not mention NAME attribute for
+ # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
+ # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+ # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+ stream_name = last_stream_inf.get('NAME')
+ if stream_name:
+ return stream_name
+ # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+ # from corresponding rendition group
+ stream_group_id = last_stream_inf.get('VIDEO')
+ if not stream_group_id:
+ return
+ stream_group = groups.get(stream_group_id)
+ if not stream_group:
+ return stream_group_id
+ rendition = stream_group[0]
+ return rendition.get('NAME') or stream_group_id
+
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
- last_info = parse_m3u8_attributes(line)
+ last_stream_inf = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
- media = parse_m3u8_attributes(line)
- media_type = media.get('TYPE')
- if media_type in ('VIDEO', 'AUDIO'):
- group_id = media.get('GROUP-ID')
- media_url = media.get('URI')
- if media_url:
- format_id = []
- for v in (group_id, media.get('NAME')):
- if v:
- format_id.append(v)
- f = {
- 'format_id': '-'.join(format_id),
- 'url': format_url(media_url),
- 'language': media.get('LANGUAGE'),
- 'ext': ext,
- 'protocol': entry_protocol,
- 'preference': preference,
- }
- if media_type == 'AUDIO':
- f['vcodec'] = 'none'
- if group_id and not audio_in_video_stream.get(group_id):
- audio_in_video_stream[group_id] = False
- formats.append(f)
- else:
- # When there is no URI in EXT-X-MEDIA let this tag's
- # data be used by regular URI lines below
- last_media = media
- if media_type == 'AUDIO' and group_id:
- audio_in_video_stream[group_id] = True
+ extract_media(line)
elif line.startswith('#') or not line.strip():
continue
else:
- tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+ tbr = float_or_none(
+ last_stream_inf.get('AVERAGE-BANDWIDTH') or
+ last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- # Despite specification does not mention NAME attribute for
- # EXT-X-STREAM-INF it still sometimes may be present
- stream_name = last_info.get('NAME') or last_media.get('NAME')
+ stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
@@ -1400,14 +1444,14 @@ class InfoExtractor(object):
f = {
'format_id': '-'.join(format_id),
'url': manifest_url,
- 'manifest_url': manifest_url,
+ 'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
- 'fps': float_or_none(last_info.get('FRAME-RATE')),
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
}
- resolution = last_info.get('RESOLUTION')
+ resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
@@ -1423,13 +1467,26 @@ class InfoExtractor(object):
'vbr': vbr,
'abr': abr,
})
- f.update(parse_codecs(last_info.get('CODECS')))
- if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
- # TODO: update acodec for audio only formats with the same GROUP-ID
- f['acodec'] = 'none'
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
+ f.update(codecs)
+ audio_group_id = last_stream_inf.get('AUDIO')
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+ # references a rendition group MUST have a CODECS attribute.
+ # However, this is not always respected, for example, [2]
+ # contains EXT-X-STREAM-INF tag which references AUDIO
+ # rendition group but does not have CODECS and despite
+ # referencing audio group an audio group, it represents
+ # a complete (with audio and video) format. So, for such cases
+ # we will ignore references to rendition groups and treat them
+ # as complete formats.
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
+ audio_group = groups.get(audio_group_id)
+ if audio_group and audio_group[0].get('URI'):
+ # TODO: update acodec for audio only formats with
+ # the same GROUP-ID
+ f['acodec'] = 'none'
formats.append(f)
- last_info = {}
- last_media = {}
+ last_stream_inf = {}
return formats
@staticmethod
@@ -1803,7 +1860,7 @@ class InfoExtractor(object):
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
- 'tbr': int_or_none(bandwidth, 1000),
+ 'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@@ -2182,7 +2239,7 @@ class InfoExtractor(object):
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
- r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+ r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
webpage)
if mobj:
try:
@@ -2258,11 +2315,17 @@ class InfoExtractor(object):
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ urls = []
formats = []
for source in jwplayer_sources_data:
- source_url = self._proto_relative_url(source['file'])
+ source_url = self._proto_relative_url(source.get('file'))
+ if not source_url:
+ continue
if base_url:
source_url = compat_urlparse.urljoin(base_url, source_url)
+ if source_url in urls:
+ continue
+ urls.append(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
if source_type == 'hls' or ext == 'm3u8':