aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--youtube_dl/extractor/common.py63
1 files changed, 32 insertions, 31 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 10c0d5d1f..655207447 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,7 +10,6 @@ import re
import socket
import sys
import time
-import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
@@ -19,10 +18,9 @@ from ..compat import (
compat_http_client,
compat_urllib_error,
compat_urllib_parse,
- compat_urllib_parse_urlparse,
- compat_urllib_request,
compat_urlparse,
compat_str,
+ compat_etree_fromstring,
)
from ..utils import (
NO_DEFAULT,
@@ -31,17 +29,20 @@ from ..utils import (
clean_html,
compiled_regex_type,
determine_ext,
+ error_to_compat_str,
ExtractorError,
fix_xml_ampersands,
float_or_none,
int_or_none,
RegexNotFoundError,
sanitize_filename,
+ sanitized_Request,
unescapeHTML,
unified_strdate,
url_basename,
xpath_text,
xpath_with_ns,
+ determine_protocol,
)
@@ -167,7 +168,7 @@ class InfoExtractor(object):
"ext" will be calculated from URL if missing
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
- duration: Length of the video in seconds, as an integer.
+ duration: Length of the video in seconds, as an integer or float.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
@@ -310,11 +311,11 @@ class InfoExtractor(object):
@classmethod
def ie_key(cls):
"""A string for getting the InfoExtractor with get_info_extractor"""
- return cls.__name__[:-2]
+ return compat_str(cls.__name__[:-2])
@property
def IE_NAME(self):
- return type(self).__name__[:-2]
+ return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the response handle """
@@ -332,7 +333,8 @@ class InfoExtractor(object):
return False
if errnote is None:
errnote = 'Unable to download webpage'
- errmsg = '%s: %s' % (errnote, compat_str(err))
+
+ errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
else:
@@ -461,7 +463,7 @@ class InfoExtractor(object):
return xml_string
if transform_source:
xml_string = transform_source(xml_string)
- return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
@@ -622,7 +624,7 @@ class InfoExtractor(object):
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+ self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
return (username, password)
@@ -776,14 +778,12 @@ class InfoExtractor(object):
preference = f.get('preference')
if preference is None:
- proto = f.get('protocol')
- if proto is None:
- proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
- preference = 0 if proto in ['http', 'https'] else -0.1
+ preference = 0
if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
preference -= 0.5
+ proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
if f.get('vcodec') == 'none': # audio only
if self._downloader.params.get('prefer_free_formats'):
ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -814,6 +814,7 @@ class InfoExtractor(object):
f.get('vbr') if f.get('vbr') is not None else -1,
f.get('height') if f.get('height') is not None else -1,
f.get('width') if f.get('width') is not None else -1,
+ proto_preference,
ext_preference,
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
@@ -883,7 +884,7 @@ class InfoExtractor(object):
fatal=fatal)
if manifest is False:
- return manifest
+ return []
formats = []
manifest_version = '1.0'
@@ -891,6 +892,11 @@ class InfoExtractor(object):
if not media_nodes:
manifest_version = '2.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+ base_url = xpath_text(
+ manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+ 'base URL', default=None)
+ if base_url:
+ base_url = base_url.strip()
for i, media_el in enumerate(media_nodes):
if manifest_version == '2.0':
media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -898,16 +904,14 @@ class InfoExtractor(object):
continue
manifest_url = (
media_url if media_url.startswith('http://') or media_url.startswith('https://')
- else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+ else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
# If media_url is itself a f4m manifest do the recursive extraction
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
# bitrate in f4m downloader
if determine_ext(manifest_url) == 'f4m':
- f4m_formats = self._extract_f4m_formats(
- manifest_url, video_id, preference, f4m_id, fatal=fatal)
- if f4m_formats:
- formats.extend(f4m_formats)
+ formats.extend(self._extract_f4m_formats(
+ manifest_url, video_id, preference, f4m_id, fatal=fatal))
continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
@@ -943,13 +947,14 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- m3u8_doc, urlh = self._download_webpage_handle(
+ res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)
- if m3u8_doc is False:
- return m3u8_doc
+ if res is False:
+ return []
+ m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
last_info = None
last_media = None
@@ -1140,10 +1145,8 @@ class InfoExtractor(object):
src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
if proto == 'm3u8' or src_ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
- if m3u8_formats:
- formats.extend(m3u8_formats)
+ formats.extend(self._extract_m3u8_formats(
+ src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
continue
if src_ext == 'f4m':
@@ -1155,9 +1158,7 @@ class InfoExtractor(object):
}
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse.urlencode(f4m_params)
- f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
- if f4m_formats:
- formats.extend(f4m_formats)
+ formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
continue
if src_url.startswith('http') and self._is_valid_url(src, video_id):
@@ -1279,7 +1280,7 @@ class InfoExtractor(object):
def _get_cookies(self, url):
""" Return a compat_cookies.SimpleCookie with the cookies for the url """
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(url)
self._downloader.cookiejar.add_cookie_header(req)
return compat_cookies.SimpleCookie(req.get_header('Cookie'))