aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--youtube_dl/extractor/common.py90
1 files changed, 60 insertions, 30 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 1e7db8a9b..655207447 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,20 +10,17 @@ import re
import socket
import sys
import time
-import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
compat_cookies,
compat_getpass,
- compat_HTTPError,
compat_http_client,
compat_urllib_error,
compat_urllib_parse,
- compat_urllib_parse_urlparse,
- compat_urllib_request,
compat_urlparse,
compat_str,
+ compat_etree_fromstring,
)
from ..utils import (
NO_DEFAULT,
@@ -32,16 +29,20 @@ from ..utils import (
clean_html,
compiled_regex_type,
determine_ext,
+ error_to_compat_str,
ExtractorError,
fix_xml_ampersands,
float_or_none,
int_or_none,
RegexNotFoundError,
sanitize_filename,
+ sanitized_Request,
unescapeHTML,
+ unified_strdate,
url_basename,
xpath_text,
xpath_with_ns,
+ determine_protocol,
)
@@ -152,6 +153,7 @@ class InfoExtractor(object):
description: Full video description.
uploader: Full name of the video uploader.
creator: The main artist who created the video.
+ release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
@@ -163,12 +165,14 @@ class InfoExtractor(object):
with the "ext" entry and one of:
* "data": The subtitles file contents
* "url": A URL pointing to the subtitles file
+ "ext" will be calculated from URL if missing
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
- duration: Length of the video in seconds, as an integer.
+ duration: Length of the video in seconds, as an integer or float.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
+ repost_count: Number of reposts of the video
average_rating: Average rating give by users, the scale used depends on the webpage
comment_count: Number of comments on the video
comments: A list of comments, each with one or more of the following
@@ -307,11 +311,11 @@ class InfoExtractor(object):
@classmethod
def ie_key(cls):
"""A string for getting the InfoExtractor with get_info_extractor"""
- return cls.__name__[:-2]
+ return compat_str(cls.__name__[:-2])
@property
def IE_NAME(self):
- return type(self).__name__[:-2]
+ return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the response handle """
@@ -329,7 +333,8 @@ class InfoExtractor(object):
return False
if errnote is None:
errnote = 'Unable to download webpage'
- errmsg = '%s: %s' % (errnote, compat_str(err))
+
+ errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
else:
@@ -458,7 +463,7 @@ class InfoExtractor(object):
return xml_string
if transform_source:
xml_string = transform_source(xml_string)
- return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
@@ -619,7 +624,7 @@ class InfoExtractor(object):
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+ self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
return (username, password)
@@ -642,8 +647,9 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
- property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+ property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+ % {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
@@ -772,14 +778,12 @@ class InfoExtractor(object):
preference = f.get('preference')
if preference is None:
- proto = f.get('protocol')
- if proto is None:
- proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
- preference = 0 if proto in ['http', 'https'] else -0.1
+ preference = 0
if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
preference -= 0.5
+ proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
if f.get('vcodec') == 'none': # audio only
if self._downloader.params.get('prefer_free_formats'):
ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -810,6 +814,7 @@ class InfoExtractor(object):
f.get('vbr') if f.get('vbr') is not None else -1,
f.get('height') if f.get('height') is not None else -1,
f.get('width') if f.get('width') is not None else -1,
+ proto_preference,
ext_preference,
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
@@ -837,7 +842,7 @@ class InfoExtractor(object):
self._request_webpage(url, video_id, 'Checking %s URL' % item)
return True
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
+ if isinstance(e.cause, compat_urllib_error.URLError):
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, item))
return False
@@ -868,13 +873,18 @@ class InfoExtractor(object):
time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
- transform_source=lambda s: fix_xml_ampersands(s).strip()):
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
# Some manifests may be malformed, e.g. prosiebensat1 generated manifests
# (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
- transform_source=transform_source)
+ transform_source=transform_source,
+ fatal=fatal)
+
+ if manifest is False:
+ return []
formats = []
manifest_version = '1.0'
@@ -882,6 +892,11 @@ class InfoExtractor(object):
if not media_nodes:
manifest_version = '2.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+ base_url = xpath_text(
+ manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+ 'base URL', default=None)
+ if base_url:
+ base_url = base_url.strip()
for i, media_el in enumerate(media_nodes):
if manifest_version == '2.0':
media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -889,13 +904,14 @@ class InfoExtractor(object):
continue
manifest_url = (
media_url if media_url.startswith('http://') or media_url.startswith('https://')
- else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+ else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
# If media_url is itself a f4m manifest do the recursive extraction
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
# bitrate in f4m downloader
if determine_ext(manifest_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+ formats.extend(self._extract_f4m_formats(
+ manifest_url, video_id, preference, f4m_id, fatal=fatal))
continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
@@ -931,13 +947,15 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- m3u8_doc = self._download_webpage(
+ res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)
- if m3u8_doc is False:
- return m3u8_doc
+ if res is False:
+ return []
+ m3u8_doc, urlh = res
+ m3u8_url = urlh.geturl()
last_info = None
last_media = None
kv_rex = re.compile(
@@ -1043,6 +1061,7 @@ class InfoExtractor(object):
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
description = None
+ upload_date = None
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
name = meta.attrib.get('name')
content = meta.attrib.get('content')
@@ -1052,11 +1071,22 @@ class InfoExtractor(object):
title = content
elif not description and name in ('description', 'abstract'):
description = content
+ elif not upload_date and name == 'date':
+ upload_date = unified_strdate(content)
+
+ thumbnails = [{
+ 'id': image.get('type'),
+ 'url': image.get('src'),
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
return {
'id': video_id,
'title': title or video_id,
'description': description,
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
'formats': formats,
'subtitles': subtitles,
}
@@ -1083,7 +1113,7 @@ class InfoExtractor(object):
if not src:
continue
- bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
filesize = int_or_none(video.get('size') or video.get('fileSize'))
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
@@ -1116,7 +1146,7 @@ class InfoExtractor(object):
if proto == 'm3u8' or src_ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+ src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
continue
if src_ext == 'f4m':
@@ -1128,10 +1158,10 @@ class InfoExtractor(object):
}
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse.urlencode(f4m_params)
- formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+ formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
continue
- if src_url.startswith('http'):
+ if src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
'url': src_url,
@@ -1250,7 +1280,7 @@ class InfoExtractor(object):
def _get_cookies(self, url):
""" Return a compat_cookies.SimpleCookie with the cookies for the url """
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(url)
self._downloader.cookiejar.add_cookie_header(req)
return compat_cookies.SimpleCookie(req.get_header('Cookie'))