From 1e399778ee870ee583135e65458268cd7c0fb923 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 20:03:05 +0800 Subject: [letv] Fix extraction Using data URIs for passing the decrypted M3U8 manifest, which is supported by ffmpeg only. --- youtube_dl/utils.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7dbe25661..db5b3698e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -1795,6 +1796,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ -- cgit v1.2.3 From d01949dc89feb2441f251e42e8a6bfa4711b9715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 20 Oct 2015 23:09:51 +0600 Subject: [utils:js_to_json] Fix bad escape in double quoted strings --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index db5b3698e..a61e47646 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1701,8 +1701,8 @@ def js_to_json(code): if v in ('true', 'false', 'null'): return v if v.startswith('"'): - return v - if v.startswith("'"): + v = re.sub(r"\\'", "'", v[1:-1]) + elif v.startswith("'"): v = v[1:-1] v = re.sub(r"\\\\|\\'|\"", lambda m: { '\\\\': '\\\\', -- cgit v1.2.3 From 36e6f62cd0883f0f486d1666d010e5d9e6d515bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 25 Oct 2015 20:04:55 +0100 Subject: Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178) Attributes aren't 
unicode objects, so they couldn't be directly used in info_dict fields (for example '--write-description' doesn't work with bytes). --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a61e47646..7d846d680 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -36,6 +36,7 @@ import zlib from .compat import ( compat_basestring, compat_chr, + compat_etree_fromstring, compat_html_entities, compat_http_client, compat_kwargs, @@ -1974,7 +1975,7 @@ def dfxp2srt(dfxp_data): return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') -- cgit v1.2.3 From f78546272cf7c4b10c8003870728ab69bec982fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 26 Oct 2015 16:41:24 +0100 Subject: [compat] compat_etree_fromstring: also decode the text attribute Deletes parse_xml from utils, because it also does it. 
--- youtube_dl/utils.py | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7d846d680..c761ea22a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'): return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes - - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree - - US_RATINGS = { 'G': 0, 'PG': 10, -- cgit v1.2.3 From 52c3a6e49d2cbc1932992d816d28bbed629daadc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 28 Oct 2015 21:40:22 +0600 Subject: [utils] Improve parse_iso8601 --- youtube_dl/utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a61e47646..558c9c7d5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -814,9 +814,11 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): if date_str is None: return None + date_str = re.sub(r'\.[0-9]+', '', date_str) + if timezone is None: m = re.search( - r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', + r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', date_str) if not m: timezone = datetime.timedelta() @@ -829,9 +831,12 @@ def 
parse_iso8601(date_str, delimiter='T', timezone=None): timezone = datetime.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) + try: + date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + dt = datetime.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt.timetuple()) + except ValueError: + pass def unified_strdate(date_str, day_first=True): -- cgit v1.2.3 From 578c074575f45ffdfd032d7b84f6fe449614f511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 31 Oct 2015 22:39:44 +0600 Subject: [utils] Support list of xpath in xpath_element --- youtube_dl/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 558c9c7d5..89c88a4d3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -178,10 +178,19 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') + def _find_xpath(xpath): + if sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + return node.find(xpath) + + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break - n = node.find(xpath) if n is None: if default is not NO_DEFAULT: return default -- cgit v1.2.3 From ae12bc3ebb4cb377c2b4337ec255e652b36f5143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 31 Oct 2015 23:07:37 +0600 Subject: [utils] Make unified_strdate always return unicode string --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py 
b/youtube_dl/utils.py index 89c88a4d3..764a89cca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -910,7 +910,7 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return upload_date + return compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): -- cgit v1.2.3 From c90d16cf36d8edf03f4dc923ee9dbeadca910844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 Nov 2015 04:26:20 +0600 Subject: [utils:sanitize_path] Disallow trailing whitespace in path segment (Closes #7332) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index efd5f4ae1..7b3f79141 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -366,7 +366,7 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) -- cgit v1.2.3 From 6a750402787dfc1f39a9ad347f2d78ae1c94c52c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 2 Nov 2015 14:08:38 +0100 Subject: [utils] unified_strdate: Return None if the date format can't be recognized (fixes #7340) This issue was introduced with ae12bc3ebb4cb377c2b4337ec255e652b36f5143, it returned 'None'. 
--- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7b3f79141..d39f313a4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -911,7 +911,8 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return compat_str(upload_date) + if upload_date is not None: + return compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): -- cgit v1.2.3 From 7aefc49c4013efb5056b2c1237e22c52cb5d3c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 16 Nov 2015 20:20:16 +0600 Subject: [utils] Skip invalid/non HTML entities (Closes #7518) --- youtube_dl/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d39f313a4..b7013a6aa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -396,7 +396,11 @@ def _htmlentity_transform(entity): numstr = '0%s' % numstr else: base = 10 - return compat_chr(int(numstr, base)) + # See https://github.com/rg3/youtube-dl/issues/7518 + try: + return compat_chr(int(numstr, base)) + except ValueError: + pass # Unknown entity in name, return its literal representation return ('&%s;' % entity) -- cgit v1.2.3 From 7a3f0c00ad138eef396ae8fd1583fe29b4c4c684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 16 Nov 2015 20:24:09 +0600 Subject: [utils] Style --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b7013a6aa..d00b14b86 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -403,7 +403,7 @@ def _htmlentity_transform(entity): pass # Unknown entity in name, return its literal representation - return 
('&%s;' % entity) + return '&%s;' % entity def unescapeHTML(s): -- cgit v1.2.3 From 7e1f5447e76e57af58bf45ce565742c813c80b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 21 Nov 2015 20:45:50 +0600 Subject: [utils] Improve encode_dict --- youtube_dl/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d00b14b86..bff59eb73 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1668,7 +1668,9 @@ def urlencode_postdata(*args, **kargs): def encode_dict(d, encoding='utf-8'): - return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) + def encode(v): + return v.encode(encoding) if isinstance(v, compat_basestring) else v + return dict((encode(k), encode(v)) for k, v in d.items()) US_RATINGS = { -- cgit v1.2.3 From 3e12bc583af9d5abf5f144ed6e092c59f4b83fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Nov 2015 06:29:39 +0600 Subject: [utils] Improve determine_ext (Closes #7593) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bff59eb73..7dab60bb8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -922,7 +922,7 @@ def unified_strdate(date_str, day_first=True): def determine_ext(url, default_ext='unknown_video'): if url is None: return default_ext - guess = url.partition('?')[0].rpartition('.')[2] + guess = url.partition('?')[0].rpartition('.')[2].rstrip('/') if re.match(r'^[A-Za-z0-9]+$', guess): return guess else: -- cgit v1.2.3 From 9cb9a5df7794579c38efff1c4b1451a7d13da3c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Nov 2015 17:27:13 +0600 Subject: [utils] Check ext with trailing slash against the list of known extensions --- youtube_dl/utils.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) 
(limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7dab60bb8..c0325f054 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -922,9 +922,24 @@ def unified_strdate(date_str, day_first=True): def determine_ext(url, default_ext='unknown_video'): if url is None: return default_ext - guess = url.partition('?')[0].rpartition('.')[2].rstrip('/') + guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): return guess + elif guess.rstrip('/') in ( + 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', + 'flv', 'f4v', 'f4a', 'f4b', + 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', + 'mkv', 'mka', 'mk3d', + 'avi', 'divx', + 'mov', + 'asf', 'wmv', 'wma', + '3gp', '3g2', + 'mp3', + 'flac', + 'ape', + 'wav', + 'f4f', 'f4m', 'm3u8', 'smil'): + return guess.rstrip('/') else: return default_ext -- cgit v1.2.3 From 67dda51722f1ce12b956782d43047b3fff390115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 20 Nov 2015 20:33:49 +0600 Subject: Rename compat_urllib_request_Request to sanitized_Request and move to utils --- youtube_dl/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c0325f054..d7b737e21 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -373,6 +373,13 @@ def sanitize_path(s): return os.path.join(*sanitized_path) +# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of +# unwanted failures due to missing protocol +def sanitized_Request(url, *args, **kwargs): + return compat_urllib_request.Request( + 'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] -- cgit v1.2.3 From 87f0e62d94e0486598d123e26db3173e6f1d18e6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 29 Nov 2015 12:42:50 +0800 Subject: [utils] Separate codes 
for handling Youtubedl-* headers --- youtube_dl/utils.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d7b737e21..653a49055 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -663,6 +663,15 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): return hc +def handle_youtubedl_headers(headers): + if 'Youtubedl-no-compression' in headers: + filtered_headers = dict((k, v) for k, v in headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + return filtered_headers + + return headers + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -731,10 +740,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # The dict keys are capitalized because of this bug by urllib if h.capitalize() not in req.headers: req.add_header(h, v) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] + + req.headers = handle_youtubedl_headers(req.headers) if sys.version_info < (2, 7) and '#' in req.get_full_url(): # Python 2.6 is brain-dead when it comes to fragments -- cgit v1.2.3 From 0424ec307bb920a2a7c217a741241f3d2af84efa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 29 Nov 2015 12:46:04 +0800 Subject: [utils] Correct docstring of YoutubeDLHandler --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 653a49055..c43e9e3a1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -679,7 +679,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): the standard headers to every HTTP request and handles gzipped and deflated responses from web servers. 
If compression is to be avoided in a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be + to include the HTTP header "Youtubedl-no-compression", which will be removed before making the real request. Part of this code was copied from: -- cgit v1.2.3 From 992fc9d6e124b910ff3d720e252ef9aad99b2a8b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 29 Nov 2015 12:58:29 +0800 Subject: [utils] Refactor handle_youtubedl_headers for future extension --- youtube_dl/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c43e9e3a1..d0606b4bc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -664,12 +664,13 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): def handle_youtubedl_headers(headers): - if 'Youtubedl-no-compression' in headers: - filtered_headers = dict((k, v) for k, v in headers.items() if k.lower() != 'accept-encoding') + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') del filtered_headers['Youtubedl-no-compression'] - return filtered_headers - return headers + return filtered_headers class YoutubeDLHandler(compat_urllib_request.HTTPHandler): -- cgit v1.2.3