aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--youtube_dl/extractor/common.py58
1 files changed, 50 insertions, 8 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index ec988fc90..2a5a85dc6 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,6 +14,7 @@ from ..utils import (
clean_html,
compiled_regex_type,
ExtractorError,
+ unescapeHTML,
)
class InfoExtractor(object):
@@ -34,6 +35,8 @@ class InfoExtractor(object):
title: Video title, unescaped.
ext: Video filename extension.
+ Instead of url and ext, formats can also specified.
+
The following fields are optional:
format: The video format, defaults to ext (used for --get-format)
@@ -46,12 +49,25 @@ class InfoExtractor(object):
uploader_id: Nickname or id of the video uploader.
location: Physical location of the video.
player_url: SWF Player URL (used for rtmpdump).
- subtitles: The subtitle file contents.
+ subtitles: The subtitle file contents as a dictionary in the format
+ {language: subtitles}.
view_count: How many users have watched the video on the platform.
urlhandle: [internal] The urlHandle to be used to download the file,
like returned by urllib.request.urlopen
-
- The fields should all be Unicode strings.
+ age_limit: Age restriction for the video, as an integer (years)
+ formats: A list of dictionaries for each format available, it must
+ be ordered from worst to best quality. Potential fields:
+ * url Mandatory. The URL of the video file
+ * ext Will be calculated from url if missing
+ * format A human-readable description of the format
+ ("mp4 container with h264/opus").
+ Calculated from width and height if missing.
+ * format_id A short description of the format
+ ("mp4_h264_opus" or "19")
+ * width Width of the video, if known
+ * height Height of the video, if known
+
+ Unless mentioned otherwise, the fields should be Unicode strings.
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp.
@@ -76,7 +92,13 @@ class InfoExtractor(object):
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url) is not None
+
+ # This does not use has/getattr intentionally - we want to know whether
+ # we have cached the regexp for *this* class, whereas getattr would also
+ # match the superclass
+ if '_VALID_URL_RE' not in cls.__dict__:
+ cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+ return cls._VALID_URL_RE.match(url) is not None
@classmethod
def working(cls):
@@ -106,6 +128,11 @@ class InfoExtractor(object):
"""Real extraction process. Redefine in subclasses."""
pass
+ @classmethod
+ def ie_key(cls):
+ """A string for getting the InfoExtractor with get_info_extractor"""
+ return cls.__name__[:-2]
+
@property
def IE_NAME(self):
return type(self).__name__[:-2]
@@ -121,7 +148,7 @@ class InfoExtractor(object):
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+ raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
@@ -132,12 +159,17 @@ class InfoExtractor(object):
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
+ webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
else:
- encoding = 'utf-8'
- webpage_bytes = urlh.read()
+ m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
+ webpage_bytes[:1024])
+ if m:
+ encoding = m.group(1).decode('ascii')
+ else:
+ encoding = 'utf-8'
if self._downloader.params.get('dump_intermediate_pages', False):
try:
url = url_or_request.get_full_url()
@@ -270,7 +302,8 @@ class InfoExtractor(object):
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
- return self._html_search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+ escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+ return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
@@ -286,6 +319,15 @@ class InfoExtractor(object):
self._og_regex('video')],
html, name, **kargs)
+ def _rta_search(self, html):
+ # See http://www.rtalabel.org/index.php?content=howtofaq#single
+ if re.search(r'(?ix)<meta\s+name="rating"\s+'
+ r' content="RTA-5042-1996-1400-1577-RTA"',
+ html):
+ return 18
+ return 0
+
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.