aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--youtube_dl/extractor/common.py534
1 files changed, 447 insertions, 87 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 48742189a..eb9bfa3d1 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,30 +10,39 @@ import re
import socket
import sys
import time
-import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
- compat_HTTPError,
+ compat_cookies,
+ compat_getpass,
compat_http_client,
compat_urllib_error,
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
+ compat_etree_fromstring,
)
from ..utils import (
+ NO_DEFAULT,
age_restricted,
+ bug_reports_message,
clean_html,
compiled_regex_type,
+ determine_ext,
ExtractorError,
+ fix_xml_ampersands,
float_or_none,
- HEADRequest,
int_or_none,
RegexNotFoundError,
sanitize_filename,
+ sanitized_Request,
unescapeHTML,
+ unified_strdate,
+ url_basename,
+ xpath_text,
+ xpath_with_ns,
)
-_NO_DEFAULT = object()
class InfoExtractor(object):
@@ -47,7 +56,7 @@ class InfoExtractor(object):
information possibly downloading the video to the file system, among
other possible outcomes.
- The type field determines the the type of the result.
+ The type field determines the type of the result.
By far the most common value (and the default if _type is missing) is
"video", which indicates a single video.
@@ -63,7 +72,7 @@ class InfoExtractor(object):
Potential fields:
* url Mandatory. The URL of the video file
- * ext Will be calculated from url if missing
+ * ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
Calculated from the format_id, width, height.
@@ -111,11 +120,8 @@ class InfoExtractor(object):
(quality takes higher priority)
-1 for default (order by other properties),
-2 or smaller for less than default.
- * http_method HTTP method to use for the download.
* http_headers A dictionary of additional HTTP headers
to add to the request.
- * http_post_data Additional data to send with a POST
- request.
* stretched_ratio If given and not 1, indicates that the
video's pixels are not square.
width : height ratio as float.
@@ -146,17 +152,26 @@ class InfoExtractor(object):
description: Full video description.
uploader: Full name of the video uploader.
creator: The main artist who created the video.
+ release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
location: Physical location where the video was filmed.
- subtitles: The subtitle file contents as a dictionary in the format
- {language: subtitles}.
+ subtitles: The available subtitles as a dictionary in the format
+ {language: subformats}. "subformats" is a list sorted from
+ lower to higher preference, each element is a dictionary
+ with the "ext" entry and one of:
+ * "data": The subtitles file contents
+ * "url": A URL pointing to the subtitles file
+ "ext" will be calculated from URL if missing
+ automatic_captions: Like 'subtitles', used by the YoutubeIE for
+ automatically generated captions
duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
+ repost_count: Number of reposts of the video
average_rating: Average rating give by users, the scale used depends on the webpage
comment_count: Number of comments on the video
comments: A list of comments, each with one or more of the following
@@ -171,13 +186,18 @@ class InfoExtractor(object):
Set to "root" to indicate that this is a
comment to the original video.
age_limit: Age restriction for the video, as an integer (years)
- webpage_url: The url to the video webpage, if given to youtube-dl it
+ webpage_url: The URL to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
+ tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
+ start_time: Time in seconds where the reproduction should start, as
+ specified in the URL.
+ end_time: Time in seconds where the reproduction should end, as
+ specified in the URL.
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -188,8 +208,8 @@ class InfoExtractor(object):
There must be a key "entries", which is a list, an iterable, or a PagedList
object, each element of which is a valid dictionary by this specification.
- Additionally, playlists can have "title" and "id" attributes with the same
- semantics as videos (see above).
+ Additionally, playlists can have "title", "description" and "id" attributes
+ with the same semantics as videos (see above).
_type "multi_video" indicates that there are multiple videos that
@@ -290,11 +310,11 @@ class InfoExtractor(object):
@classmethod
def ie_key(cls):
"""A string for getting the InfoExtractor with get_info_extractor"""
- return cls.__name__[:-2]
+ return compat_str(cls.__name__[:-2])
@property
def IE_NAME(self):
- return type(self).__name__[:-2]
+ return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the response handle """
@@ -319,7 +339,7 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
@@ -329,14 +349,11 @@ class InfoExtractor(object):
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
return (content, urlh)
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
- content_type = urlh.headers.get('Content-Type', '')
- webpage_bytes = urlh.read()
- if prefix is not None:
- webpage_bytes = prefix + webpage_bytes
+ @staticmethod
+ def _guess_encoding_from_content(content_type, webpage_bytes):
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
@@ -349,6 +366,16 @@ class InfoExtractor(object):
encoding = 'utf-16'
else:
encoding = 'utf-8'
+
+ return encoding
+
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ content_type = urlh.headers.get('Content-Type', '')
+ webpage_bytes = urlh.read()
+ if prefix is not None:
+ webpage_bytes = prefix + webpage_bytes
+ if not encoding:
+ encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
if self._downloader.params.get('dump_intermediate_pages', False):
try:
url = url_or_request.get_full_url()
@@ -392,16 +419,26 @@ class InfoExtractor(object):
if blocked_iframe:
msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
+ if '<title>The URL you requested has been blocked</title>' in content[:512]:
+ msg = (
+ 'Access to this webpage has been blocked by Indian censorship. '
+ 'Use a VPN or proxy server (with --proxy) to route around it.')
+ block_msg = self._html_search_regex(
+ r'</h1><p>(.*?)</p>',
+ content, 'block message', default=None)
+ if block_msg:
+ msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+ raise ExtractorError(msg, expected=True)
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
""" Returns the data of the page as a string """
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
@@ -416,23 +453,24 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True):
+ transform_source=None, fatal=True, encoding=None):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal)
+ url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
if xml_string is False:
return xml_string
if transform_source:
xml_string = transform_source(xml_string)
- return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
errnote='Unable to download JSON metadata',
transform_source=None,
- fatal=True):
+ fatal=True, encoding=None):
json_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal)
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding)
if (not fatal) and json_string is False:
return None
return self._parse_json(
@@ -475,16 +513,30 @@ class InfoExtractor(object):
"""Report attempt to log in."""
self.to_screen('Logging in')
+ @staticmethod
+ def raise_login_required(msg='This video is only available for registered users'):
+ raise ExtractorError(
+ '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+ expected=True)
+
+ @staticmethod
+ def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+ raise ExtractorError(
+ '%s. You might want to use --proxy to workaround.' % msg,
+ expected=True)
+
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None):
- """Returns a url that points to a page that should be processed"""
+ def url_result(url, ie=None, video_id=None, video_title=None):
+ """Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
if video_id is not None:
video_info['id'] = video_id
+ if video_title is not None:
+ video_info['title'] = video_title
return video_info
@staticmethod
@@ -500,7 +552,7 @@ class InfoExtractor(object):
video_info['description'] = playlist_description
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -526,16 +578,15 @@ class InfoExtractor(object):
return next(g for g in mobj.groups() if g is not None)
else:
return mobj.group(group)
- elif default is not _NO_DEFAULT:
+ elif default is not NO_DEFAULT:
return default
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning('unable to extract %s; '
- 'please report this issue on http://yt-dl.org/bug' % _name)
+ self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
@@ -547,7 +598,7 @@ class InfoExtractor(object):
def _get_login_info(self):
"""
- Get the the login info as (username, password)
+ Get the login info as (username, password)
It will look in the netrc file using the _NETRC_MACHINE value
If there's no info available, return (None, None)
"""
@@ -575,7 +626,7 @@ class InfoExtractor(object):
return (username, password)
- def _get_tfa_info(self):
+ def _get_tfa_info(self, note='two-factor verification code'):
"""
Get the two-factor authentication info
TODO - asking the user will be required for sms/phone verify
@@ -589,19 +640,26 @@ class InfoExtractor(object):
if downloader_params.get('twofactor', None) is not None:
return downloader_params['twofactor']
- return None
+ return compat_getpass('Type %s and press [Return]: ' % note)
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
- property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+ property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+ % {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
template % (content_re, property_re),
]
+ @staticmethod
+ def _meta_regex(prop):
+ return r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
@@ -611,7 +669,7 @@ class InfoExtractor(object):
return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
- return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+ return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
@@ -632,9 +690,7 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ self._meta_regex(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -665,7 +721,7 @@ class InfoExtractor(object):
return RATING_TABLE.get(rating.lower(), None)
def _family_friendly_search(self, html):
- # See http://schema.org/VideoObj
+ # See http://schema.org/VideoObject
family_friendly = self._html_search_meta('isFamilyFriendly', html)
if not family_friendly:
@@ -683,7 +739,29 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
- def _sort_formats(self, formats):
+ @staticmethod
+ def _hidden_inputs(html):
+ html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+ hidden_inputs = {}
+ for input in re.findall(r'(?i)<input([^>]+)>', html):
+ if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+ continue
+ name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+ if not name:
+ continue
+ value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+ if not value:
+ continue
+ hidden_inputs[name.group('value')] = value.group('value')
+ return hidden_inputs
+
+ def _form_hidden_inputs(self, form_id, html):
+ form = self._search_regex(
+ r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+ html, '%s form' % form_id, group='form')
+ return self._hidden_inputs(form)
+
+ def _sort_formats(self, formats, field_preference=None):
if not formats:
raise ExtractorError('No video formats found')
@@ -693,6 +771,9 @@ class InfoExtractor(object):
if not f.get('ext') and 'url' in f:
f['ext'] = determine_ext(f['url'])
+ if isinstance(field_preference, (list, tuple)):
+ return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
preference = f.get('preference')
if preference is None:
proto = f.get('protocol')
@@ -729,6 +810,7 @@ class InfoExtractor(object):
f.get('language_preference') if f.get('language_preference') is not None else -1,
f.get('quality') if f.get('quality') is not None else -1,
f.get('tbr') if f.get('tbr') is not None else -1,
+ f.get('filesize') if f.get('filesize') is not None else -1,
f.get('vbr') if f.get('vbr') is not None else -1,
f.get('height') if f.get('height') is not None else -1,
f.get('width') if f.get('width') is not None else -1,
@@ -736,10 +818,9 @@ class InfoExtractor(object):
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
f.get('fps') if f.get('fps') is not None else -1,
- f.get('filesize') if f.get('filesize') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
f.get('source_preference') if f.get('source_preference') is not None else -1,
- f.get('format_id'),
+ f.get('format_id') if f.get('format_id') is not None else '',
)
formats.sort(key=_formats_key)
@@ -752,15 +833,17 @@ class InfoExtractor(object):
formats)
def _is_valid_url(self, url, video_id, item='video'):
+ url = self._proto_relative_url(url, scheme='http:')
+ # For now assume non HTTP(S) URLs always valid
+ if not (url.startswith('http://') or url.startswith('https://')):
+ return True
try:
- self._request_webpage(
- HEADRequest(url), video_id,
- 'Checking %s URL' % item)
+ self._request_webpage(url, video_id, 'Checking %s URL' % item)
return True
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
- self.report_warning(
- '%s URL is invalid, skipping' % item, video_id)
+ if isinstance(e.cause, compat_urllib_error.URLError):
+ self.to_screen(
+ '%s: %s URL is invalid, skipping' % (video_id, item))
return False
raise
@@ -788,10 +871,19 @@ class InfoExtractor(object):
self.to_screen(msg)
time.sleep(timeout)
- def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
- 'Unable to download f4m manifest')
+ 'Unable to download f4m manifest',
+ # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+ # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+ transform_source=transform_source,
+ fatal=fatal)
+
+ if manifest is False:
+ return manifest
formats = []
manifest_version = '1.0'
@@ -799,13 +891,32 @@ class InfoExtractor(object):
if not media_nodes:
manifest_version = '2.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+ base_url = xpath_text(
+ manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+ 'base URL', default=None)
+ if base_url:
+ base_url = base_url.strip()
for i, media_el in enumerate(media_nodes):
if manifest_version == '2.0':
- manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
- + (media_el.attrib.get('href') or media_el.attrib.get('url')))
+ media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ if not media_url:
+ continue
+ manifest_url = (
+ media_url if media_url.startswith('http://') or media_url.startswith('https://')
+ else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
+ # If media_url is itself a f4m manifest do the recursive extraction
+ # since bitrates in parent manifest (this one) and media_url manifest
+ # may differ leading to inability to resolve the format by requested
+ # bitrate in f4m downloader
+ if determine_ext(manifest_url) == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
+ manifest_url, video_id, preference, f4m_id, fatal=fatal)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+ continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
- 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
+ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
'url': manifest_url,
'ext': 'flv',
'tbr': tbr,
@@ -819,14 +930,15 @@ class InfoExtractor(object):
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
- m3u8_id=None):
+ m3u8_id=None, note=None, errnote=None,
+ fatal=True):
formats = [{
- 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
+ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
- 'preference': -1,
+ 'preference': preference - 1 if preference else -1,
'resolution': 'multiple',
'format_note': 'Quality selection URL',
}]
@@ -836,11 +948,17 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- m3u8_doc = self._download_webpage(
+ res = self._download_webpage_handle(
m3u8_url, video_id,
- note='Downloading m3u8 information',
- errnote='Failed to download m3u8 information')
+ note=note or 'Downloading m3u8 information',
+ errnote=errnote or 'Failed to download m3u8 information',
+ fatal=fatal)
+ if res is False:
+ return res
+ m3u8_doc, urlh = res
+ m3u8_url = urlh.geturl()
last_info = None
+ last_media = None
kv_rex = re.compile(
r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
for line in m3u8_doc.splitlines():
@@ -851,6 +969,13 @@ class InfoExtractor(object):
if v.startswith('"'):
v = v[1:-1]
last_info[m.group('key')] = v
+ elif line.startswith('#EXT-X-MEDIA:'):
+ last_media = {}
+ for m in kv_rex.finditer(line):
+ v = m.group('val')
+ if v.startswith('"'):
+ v = v[1:-1]
+ last_media[m.group('key')] = v
elif line.startswith('#') or not line.strip():
continue
else:
@@ -858,8 +983,13 @@ class InfoExtractor(object):
formats.append({'url': format_url(line)})
continue
tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+ format_id = []
+ if m3u8_id:
+ format_id.append(m3u8_id)
+ last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+ format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
f = {
- 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
+ 'format_id': '-'.join(format_id),
'url': format_url(line.strip()),
'tbr': tbr,
'ext': ext,
@@ -879,57 +1009,246 @@ class InfoExtractor(object):
width_str, height_str = resolution.split('x')
f['width'] = int(width_str)
f['height'] = int(height_str)
+ if last_media is not None:
+ f['m3u8_media'] = last_media
+ last_media = None
formats.append(f)
last_info = {}
self._sort_formats(formats)
return formats
- # TODO: improve extraction
- def _extract_smil_formats(self, smil_url, video_id, fatal=True):
- smil = self._download_xml(
- smil_url, video_id, 'Downloading SMIL file',
- 'Unable to download SMIL file', fatal=fatal)
+ @staticmethod
+ def _xpath_ns(path, namespace=None):
+ if not namespace:
+ return path
+ out = []
+ for c in path.split('/'):
+ if not c or c == '.':
+ out.append(c)
+ else:
+ out.append('{%s}%s' % (namespace, c))
+ return '/'.join(out)
+
+ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
if smil is False:
assert not fatal
return []
- base = smil.find('./head/meta').get('base')
+ namespace = self._parse_smil_namespace(smil)
+
+ return self._parse_smil_formats(
+ smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+ def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal)
+ if smil is False:
+ return {}
+ return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+ def _download_smil(self, smil_url, video_id, fatal=True):
+ return self._download_xml(
+ smil_url, video_id, 'Downloading SMIL file',
+ 'Unable to download SMIL file', fatal=fatal)
+
+ def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+ namespace = self._parse_smil_namespace(smil)
+
+ formats = self._parse_smil_formats(
+ smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+ subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+ video_id = os.path.splitext(url_basename(smil_url))[0]
+ title = None
+ description = None
+ upload_date = None
+ for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+ name = meta.attrib.get('name')
+ content = meta.attrib.get('content')
+ if not name or not content:
+ continue
+ if not title and name == 'title':
+ title = content
+ elif not description and name in ('description', 'abstract'):
+ description = content
+ elif not upload_date and name == 'date':
+ upload_date = unified_strdate(content)
+
+ thumbnails = [{
+ 'id': image.get('type'),
+ 'url': image.get('src'),
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
+
+ return {
+ 'id': video_id,
+ 'title': title or video_id,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _parse_smil_namespace(self, smil):
+ return self._search_regex(
+ r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ base = smil_url
+ for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+ b = meta.get('base') or meta.get('httpBase')
+ if b:
+ base = b
+ break
formats = []
rtmp_count = 0
- for video in smil.findall('./body/switch/video'):
+ http_count = 0
+
+ videos = smil.findall(self._xpath_ns('.//video', namespace))
+ for video in videos:
src = video.get('src')
if not src:
continue
- bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+
+ bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ filesize = int_or_none(video.get('size') or video.get('fileSize'))
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
proto = video.get('proto')
- if not proto:
- if base:
- if base.startswith('rtmp'):
- proto = 'rtmp'
- elif base.startswith('http'):
- proto = 'http'
ext = video.get('ext')
- if proto == 'm3u8':
- formats.extend(self._extract_m3u8_formats(src, video_id, ext))
- elif proto == 'rtmp':
+ src_ext = determine_ext(src)
+ streamer = video.get('streamer') or base
+
+ if proto == 'rtmp' or streamer.startswith('rtmp'):
rtmp_count += 1
- streamer = video.get('streamer') or base
formats.append({
'url': streamer,
'play_path': src,
'ext': 'flv',
'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
'tbr': bitrate,
+ 'filesize': filesize,
+ 'width': width,
+ 'height': height,
+ })
+ if transform_rtmp_url:
+ streamer, src = transform_rtmp_url(streamer, src)
+ formats[-1].update({
+ 'url': streamer,
+ 'play_path': src,
+ })
+ continue
+
+ src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+ if proto == 'm3u8' or src_ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ continue
+
+ if src_ext == 'f4m':
+ f4m_url = src_url
+ if not f4m_params:
+ f4m_params = {
+ 'hdcore': '3.2.0',
+ 'plugin': 'flowplayer-3.2.0.1',
+ }
+ f4m_url += '&' if '?' in f4m_url else '?'
+ f4m_url += compat_urllib_parse.urlencode(f4m_params)
+ f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+ continue
+
+ if src_url.startswith('http') and self._is_valid_url(src, video_id):
+ http_count += 1
+ formats.append({
+ 'url': src_url,
+ 'ext': ext or src_ext or 'flv',
+ 'format_id': 'http-%d' % (bitrate or http_count),
+ 'tbr': bitrate,
+ 'filesize': filesize,
'width': width,
'height': height,
})
+ continue
+
self._sort_formats(formats)
return formats
+ def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+ subtitles = {}
+ for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+ src = textstream.get('src')
+ if not src:
+ continue
+ ext = textstream.get('ext') or determine_ext(src)
+ if not ext:
+ type_ = textstream.get('type')
+ SUBTITLES_TYPES = {
+ 'text/vtt': 'vtt',
+ 'text/srt': 'srt',
+ 'application/smptett+xml': 'tt',
+ }
+ if type_ in SUBTITLES_TYPES:
+ ext = SUBTITLES_TYPES[type_]
+ lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+ subtitles.setdefault(lang, []).append({
+ 'url': src,
+ 'ext': ext,
+ })
+ return subtitles
+
+ def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+ xspf = self._download_xml(
+ playlist_url, playlist_id, 'Downloading xpsf playlist',
+ 'Unable to download xspf manifest', fatal=fatal)
+ if xspf is False:
+ return []
+ return self._parse_xspf(xspf, playlist_id)
+
+ def _parse_xspf(self, playlist, playlist_id):
+ NS_MAP = {
+ 'xspf': 'http://xspf.org/ns/0/',
+ 's1': 'http://static.streamone.nl/player/ns/0',
+ }
+
+ entries = []
+ for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+ title = xpath_text(
+ track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+ description = xpath_text(
+ track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+ thumbnail = xpath_text(
+ track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+ duration = float_or_none(
+ xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+ formats = [{
+ 'url': location.text,
+ 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+ 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+ 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+ } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+ self._sort_formats(formats)
+
+ entries.append({
+ 'id': playlist_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ })
+ return entries
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
@@ -964,6 +1283,12 @@ class InfoExtractor(object):
None, '/', True, False, expire_time, '', None, None, None)
self._downloader.cookiejar.set_cookie(cookie)
+ def _get_cookies(self, url):
+ """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+ req = sanitized_Request(url)
+ self._downloader.cookiejar.add_cookie_header(req)
+ return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
def get_testcases(self, include_onlymatching=False):
t = getattr(self, '_TEST', None)
if t:
@@ -993,11 +1318,46 @@ class InfoExtractor(object):
any_restricted = any_restricted or is_restricted
return not any_restricted
+ def extract_subtitles(self, *args, **kwargs):
+ if (self._downloader.params.get('writesubtitles', False) or
+ self._downloader.params.get('listsubtitles')):
+ return self._get_subtitles(*args, **kwargs)
+ return {}
+
+ def _get_subtitles(self, *args, **kwargs):
+ raise NotImplementedError("This method must be implemented by subclasses")
+
+ @staticmethod
+ def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+ """ Merge subtitle items for one language. Items with duplicated URLs
+ will be dropped. """
+ list1_urls = set([item['url'] for item in subtitle_list1])
+ ret = list(subtitle_list1)
+ ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+ return ret
+
+ @classmethod
+ def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+ """ Merge two subtitle dictionaries, language by language. """
+ ret = dict(subtitle_dict1)
+ for lang in subtitle_dict2:
+ ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+ return ret
+
+ def extract_automatic_captions(self, *args, **kwargs):
+ if (self._downloader.params.get('writeautomaticsub', False) or
+ self._downloader.params.get('listsubtitles')):
+ return self._get_automatic_captions(*args, **kwargs)
+ return {}
+
+ def _get_automatic_captions(self, *args, **kwargs):
+ raise NotImplementedError("This method must be implemented by subclasses")
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
- They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+ They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
Instances should define _SEARCH_KEY and _MAX_RESULTS.
"""