about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/utils.py')
-rw-r--r-- youtube_dl/utils.py 116
1 files changed, 76 insertions, 40 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 1dc3153fd..d7b737e21 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -3,6 +3,7 @@
from __future__ import unicode_literals
+import base64
import calendar
import codecs
import contextlib
@@ -35,6 +36,7 @@ import zlib
from .compat import (
compat_basestring,
compat_chr,
+ compat_etree_fromstring,
compat_html_entities,
compat_http_client,
compat_kwargs,
@@ -177,10 +179,19 @@ def xpath_with_ns(path, ns_map):
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
- if sys.version_info < (2, 7): # Crazy 2.6
- xpath = xpath.encode('ascii')
+ def _find_xpath(xpath):
+ if sys.version_info < (2, 7): # Crazy 2.6
+ xpath = xpath.encode('ascii')
+ return node.find(xpath)
+
+ if isinstance(xpath, (str, compat_str)):
+ n = _find_xpath(xpath)
+ else:
+ for xp in xpath:
+ n = _find_xpath(xp)
+ if n is not None:
+ break
- n = node.find(xpath)
if n is None:
if default is not NO_DEFAULT:
return default
@@ -355,13 +366,20 @@ def sanitize_path(s):
if drive_or_unc:
norm_path.pop(0)
sanitized_path = [
- path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
+ path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
for path_part in norm_path]
if drive_or_unc:
sanitized_path.insert(0, drive_or_unc + os.path.sep)
return os.path.join(*sanitized_path)
+# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
+# unwanted failures due to missing protocol
+def sanitized_Request(url, *args, **kwargs):
+ return compat_urllib_request.Request(
+ 'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
+
+
def orderedSet(iterable):
""" Remove all duplicates from the input iterable """
res = []
@@ -385,10 +403,14 @@ def _htmlentity_transform(entity):
numstr = '0%s' % numstr
else:
base = 10
- return compat_chr(int(numstr, base))
+ # See https://github.com/rg3/youtube-dl/issues/7518
+ try:
+ return compat_chr(int(numstr, base))
+ except ValueError:
+ pass
# Unknown entity in name, return its literal representation
- return ('&%s;' % entity)
+ return '&%s;' % entity
def unescapeHTML(s):
@@ -813,9 +835,11 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
if date_str is None:
return None
+ date_str = re.sub(r'\.[0-9]+', '', date_str)
+
if timezone is None:
m = re.search(
- r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+ r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
date_str)
if not m:
timezone = datetime.timedelta()
@@ -828,9 +852,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
timezone = datetime.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
- date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
- dt = datetime.datetime.strptime(date_str, date_format) - timezone
- return calendar.timegm(dt.timetuple())
+ try:
+ date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+ dt = datetime.datetime.strptime(date_str, date_format) - timezone
+ return calendar.timegm(dt.timetuple())
+ except ValueError:
+ pass
def unified_strdate(date_str, day_first=True):
@@ -895,7 +922,8 @@ def unified_strdate(date_str, day_first=True):
timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
- return upload_date
+ if upload_date is not None:
+ return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
@@ -904,6 +932,21 @@ def determine_ext(url, default_ext='unknown_video'):
guess = url.partition('?')[0].rpartition('.')[2]
if re.match(r'^[A-Za-z0-9]+$', guess):
return guess
+ elif guess.rstrip('/') in (
+ 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+ 'flv', 'f4v', 'f4a', 'f4b',
+ 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+ 'mkv', 'mka', 'mk3d',
+ 'avi', 'divx',
+ 'mov',
+ 'asf', 'wmv', 'wma',
+ '3gp', '3g2',
+ 'mp3',
+ 'flac',
+ 'ape',
+ 'wav',
+ 'f4f', 'f4m', 'm3u8', 'smil'):
+ return guess.rstrip('/')
else:
return default_ext
@@ -1371,7 +1414,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
v = getattr(v, get_attr, None)
if v == '':
v = None
- return default if v is None else (int(v) * invscale // scale)
+ if v is None:
+ return default
+ try:
+ return int(v) * invscale // scale
+ except ValueError:
+ return default
def str_or_none(v, default=None):
@@ -1387,7 +1435,12 @@ def str_to_int(int_str):
def float_or_none(v, scale=1, invscale=1, default=None):
- return default if v is None else (float(v) * invscale / scale)
+ if v is None:
+ return default
+ try:
+ return float(v) * invscale / scale
+ except ValueError:
+ return default
def parse_duration(s):
@@ -1637,30 +1690,9 @@ def urlencode_postdata(*args, **kargs):
def encode_dict(d, encoding='utf-8'):
- return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
-
-
-try:
- etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError: # Python <=2.6
- etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
- class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
- def doctype(self, name, pubid, system):
- pass # Ignore doctypes
-
- parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
- kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
- tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
- # Fix up XML parser in Python 2.x
- if sys.version_info < (3, 0):
- for n in etree_iter(tree):
- if n.text is not None:
- if not isinstance(n.text, compat_str):
- n.text = n.text.decode('utf-8')
- return tree
+ def encode(v):
+ return v.encode(encoding) if isinstance(v, compat_basestring) else v
+ return dict((encode(k), encode(v)) for k, v in d.items())
US_RATINGS = {
@@ -1690,8 +1722,8 @@ def js_to_json(code):
if v in ('true', 'false', 'null'):
return v
if v.startswith('"'):
- return v
- if v.startswith("'"):
+ v = re.sub(r"\\'", "'", v[1:-1])
+ elif v.startswith("'"):
v = v[1:-1]
v = re.sub(r"\\\\|\\'|\"", lambda m: {
'\\\\': '\\\\',
@@ -1785,6 +1817,10 @@ def urlhandle_detect_ext(url_handle):
return mimetype2ext(getheader('Content-Type'))
+def encode_data_uri(data, mime_type):
+ return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
+
+
def age_restricted(content_limit, age_limit):
""" Returns True iff the content should be blocked """
@@ -1959,7 +1995,7 @@ def dfxp2srt(dfxp_data):
return out
- dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+ dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')