From bae611f216ac7b1f1a24a506da6dffc518d09d5b Mon Sep 17 00:00:00 2001 From: Arvydas Sidorenko Date: Sun, 1 Jul 2012 18:21:27 +0200 Subject: Simplified preferredencoding() Not sure what is the point to use yield to return encoding, thus it will simplify the whole function. Signed-off-by: Arvydas Sidorenko --- youtube_dl/utils.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2853ba50f..7faa046c8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -32,15 +32,13 @@ def preferredencoding(): Returns the best encoding scheme for the system, based on locale.getpreferredencoding() and some further tweaks. """ - def yield_preferredencoding(): - try: - pref = locale.getpreferredencoding() - u'TEST'.encode(pref) - except: - pref = 'UTF-8' - while True: - yield pref - return yield_preferredencoding().next() + try: + pref = locale.getpreferredencoding() + u'TEST'.encode(pref) + except: + pref = 'UTF-8' + + return pref def htmlentity_transform(matchobj): -- cgit v1.2.3 From 51937c086943a3bdbf6f707c75d041ed3b0ba743 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 26 Nov 2012 04:05:54 +0100 Subject: Add some parentheses around print for #180 --- youtube_dl/utils.py | 1 - 1 file changed, 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 56d046145..6f53337d4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -83,7 +83,6 @@ class IDParser(HTMLParser.HTMLParser): HTMLParser.HTMLParser.__init__(self) def error(self, message): - #print >> sys.stderr, self.getpos() if self.error_count > 10 or self.started: raise HTMLParser.HTMLParseError(message, self.getpos()) self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line -- cgit v1.2.3 From 92b91c18780938283c505f5662c458e049bf3567 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 26 Nov 2012 
04:23:20 +0100 Subject: Use character instead of byte strings --- youtube_dl/utils.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6f53337d4..658fd2686 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -26,6 +26,11 @@ std_headers = { 'Accept-Language': 'en-us,en;q=0.5', } +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + def preferredencoding(): """Get preferred encoding. -- cgit v1.2.3 From 1c469a9480e9d8bea45950898eb46e07b0c58290 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 26 Nov 2012 23:58:46 +0100 Subject: New optoin --restrict-filenames --- youtube_dl/utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 658fd2686..55f2fe02c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -194,18 +194,22 @@ def timeconvert(timestr): if timetuple is not None: timestamp = email.utils.mktime_tz(timetuple) return timestamp - -def sanitize_filename(s): - """Sanitizes a string so it could be used as part of a filename.""" + +def sanitize_filename(s, restricted=False): + """Sanitizes a string so it could be used as part of a filename. + If restricted is set, use a stricter subset of allowed characters. + """ def replace_insane(char): if char == '?' 
or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': - return '\'' + return '' if restricted else 'FOO\'' elif char == ':': - return ' -' + return '_-' if restricted else ' -' elif char in '\\/|*<>': return '-' + if restricted and (char in '&\'' or char.isspace()): + return '_' return char result = u''.join(map(replace_insane, s)) -- cgit v1.2.3 From 240089e5df640a12b1d300da05932c2f74ff8c69 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 00:14:12 +0100 Subject: remove accidental remnants --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 55f2fe02c..1f60d34ae 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -203,7 +203,7 @@ def sanitize_filename(s, restricted=False): if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': - return '' if restricted else 'FOO\'' + return '' if restricted else '\'' elif char == ':': return '_-' if restricted else ' -' elif char in '\\/|*<>': -- cgit v1.2.3 From 56781d3d2e476e2e109d0907d89548fd4da05058 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 12:46:09 +0100 Subject: Switch back to underline for invalid characters, and make restricted ASCII-only --- youtube_dl/utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1f60d34ae..3339f56ec 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -207,15 +207,20 @@ def sanitize_filename(s, restricted=False): elif char == ':': return '_-' if restricted else ' -' elif char in '\\/|*<>': - return '-' + return '_' if restricted and (char in '&\'' or char.isspace()): return '_' + if restricted and ord(char) > 127: + return '_' return char result = u''.join(map(replace_insane, s)) - while '--' in result: - result = result.replace('--', '-') - return 
result.strip('-') + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + if not result: + result = '_' + return result def orderedSet(iterable): """ Remove all duplicates from the input iterable """ -- cgit v1.2.3 From 46cbda0be4bed00122a5cf43e640808e6c32222d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 15:07:10 +0100 Subject: Minor filename encoding improvement in a common case --- youtube_dl/utils.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3339f56ec..4ace22c2f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -218,6 +218,9 @@ def sanitize_filename(s, restricted=False): while '__' in result: result = result.replace('__', '_') result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] if not result: result = '_' return result -- cgit v1.2.3 From dffe658bac71531dc6aa72088c4d98f6a556bfbf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 17:15:33 +0100 Subject: Remove exclamation mark in --restrict-filenames mode --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4ace22c2f..a0c41081a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -208,7 +208,7 @@ def sanitize_filename(s, restricted=False): return '_-' if restricted else ' -' elif char in '\\/|*<>': return '_' - if restricted and (char in '&\'' or char.isspace()): + if restricted and (char in '!&\'' or char.isspace()): return '_' if restricted and ord(char) > 127: return '_' @@ -316,7 +316,7 @@ class ContentTooShortError(Exception): class Trouble(Exception): """Trouble helper exception - + This is an exception to be handled with FileDownloader.trouble """ -- cgit v1.2.3 From 
0969bdd3053fc95c5f545caabdbc77628afec4b5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 18:49:18 +0100 Subject: unify spacing --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 40d6823a0..45582ca11 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -27,9 +27,9 @@ std_headers = { } try: - compat_str = unicode # Python 2 + compat_str = unicode # Python 2 except NameError: - compat_str = str + compat_str = str def preferredencoding(): """Get preferred encoding. -- cgit v1.2.3 From b514df2034a7291971427588977dd612c4b5581f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 18:55:35 +0100 Subject: Clean up with the help of pep8 --- youtube_dl/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 45582ca11..68a6fae1b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -126,8 +126,10 @@ class IDParser(HTMLParser.HTMLParser): handle_decl = handle_pi = unknown_decl = find_startpos def get_result(self): - if self.result == None: return None - if len(self.result) != 3: return None + if self.result is None: + return None + if len(self.result) != 3: + return None lines = self.html.split('\n') lines = lines[self.result[1][0]-1:self.result[2][0]] lines[0] = lines[0][self.result[1][1]:] -- cgit v1.2.3 From dd109dee8ebd51a019529fa494e56651162ad6d9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 19:02:37 +0100 Subject: Remove mentions of unicode --- youtube_dl/utils.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 68a6fae1b..bde446bcb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -27,9 +27,9 @@ std_headers = { } try: - 
compat_str = unicode # Python 2 + u = unicode # Python 2 except NameError: - compat_str = str + u = str def preferredencoding(): """Get preferred encoding. @@ -47,7 +47,7 @@ def preferredencoding(): def htmlentity_transform(matchobj): - """Transforms an HTML entity to a Unicode character. + """Transforms an HTML entity to a character. This function receives a match object and is intended to be used with the re.sub() function. @@ -58,7 +58,6 @@ def htmlentity_transform(matchobj): if entity in htmlentitydefs.name2codepoint: return unichr(htmlentitydefs.name2codepoint[entity]) - # Unicode character mobj = re.match(ur'(?u)#(x?\d+)', entity) if mobj is not None: numstr = mobj.group(1) @@ -67,7 +66,7 @@ def htmlentity_transform(matchobj): numstr = u'0%s' % numstr else: base = 10 - return unichr(long(numstr, base)) + return unichr(int(numstr, base)) # Unknown entity in name, return its literal representation return (u'&%s;' % entity) @@ -235,7 +234,7 @@ def orderedSet(iterable): def unescapeHTML(s): """ - @param s a string (of type unicode) + @param s a string """ assert type(s) == type(u'') @@ -244,7 +243,7 @@ def unescapeHTML(s): def encodeFilename(s): """ - @param s The name of the file (of type unicode) + @param s The name of the file """ assert type(s) == type(u'') -- cgit v1.2.3 From 96731798dbdd5a8878ac5cf29b69c6c7c821311b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 23:29:18 +0100 Subject: Rename util.u to util.compat_str --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bde446bcb..4d098a377 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -27,9 +27,9 @@ std_headers = { } try: - u = unicode # Python 2 + compat_str = unicode # Python 2 except NameError: - u = str + compat_str = str def preferredencoding(): """Get preferred encoding. 
-- cgit v1.2.3 From e08bee320e7c2933590d108ff4f8546b4dff935f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 23:31:55 +0100 Subject: Use except .. as everywhere (#180) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4d098a377..29e1b0e97 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -177,7 +177,7 @@ def sanitize_open(filename, open_mode): return (sys.stdout, filename) stream = open(encodeFilename(filename), open_mode) return (stream, filename) - except (IOError, OSError), err: + except (IOError, OSError) as err: # In case of error, try to remove win32 forbidden chars filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) -- cgit v1.2.3 From 01ba00ca42899436c13439226ec61651a6ea6af0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 23:54:09 +0100 Subject: Prepare urllib references for 2/3 compatibility --- youtube_dl/utils.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 29e1b0e97..12e32be98 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -9,7 +9,6 @@ import os import re import sys import zlib -import urllib2 import email.utils import json @@ -31,6 +30,26 @@ try: except NameError: compat_str = str +try: + import urllib.request as compat_urllib_request +except ImportError: # Python 2 + import urllib2 as compat_urllib_request + +try: + import urllib.error as compat_urllib_error +except ImportError: # Python 2 + import urllib2 as compat_urllib_error + +try: + import urllib.parse as compat_urllib_parse +except ImportError: # Python 2 + import urllib2 as compat_urllib_parse + +try: + import http.cookiejar as compat_cookiejar +except ImportError: # Python 2 + import cookielib as compat_cookiejar + def preferredencoding(): """Get preferred 
encoding. @@ -320,7 +339,7 @@ class Trouble(Exception): FileDownloader.trouble """ -class YoutubeDLHandler(urllib2.HTTPHandler): +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -347,9 +366,9 @@ class YoutubeDLHandler(urllib2.HTTPHandler): @staticmethod def addinfourl_wrapper(stream, headers, url, code): - if hasattr(urllib2.addinfourl, 'getcode'): - return urllib2.addinfourl(stream, headers, url, code) - ret = urllib2.addinfourl(stream, headers, url) + if hasattr(compat_urllib_request.addinfourl, 'getcode'): + return compat_urllib_request.addinfourl(stream, headers, url, code) + ret = compat_urllib_request.addinfourl(stream, headers, url) ret.code = code return ret -- cgit v1.2.3 From 89fb51dd2d4d7464b919f17b9d5d24a448319dfc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 23:56:10 +0100 Subject: Remove ur references for Python 3.3 support --- youtube_dl/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 12e32be98..ccefc66a0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -77,7 +77,7 @@ def htmlentity_transform(matchobj): if entity in htmlentitydefs.name2codepoint: return unichr(htmlentitydefs.name2codepoint[entity]) - mobj = re.match(ur'(?u)#(x?\d+)', entity) + mobj = re.match(u'(?u)#(x?\\d+)', entity) if mobj is not None: numstr = mobj.group(1) if numstr.startswith(u'x'): @@ -198,7 +198,7 @@ def sanitize_open(filename, open_mode): return (stream, filename) except (IOError, OSError) as err: # In case of error, try to remove win32 forbidden chars - filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) + filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename) # An exception here should be caught in the caller stream = open(encodeFilename(filename), open_mode) @@ -257,7 +257,7 @@ def 
unescapeHTML(s): """ assert type(s) == type(u'') - result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s) + result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) return result def encodeFilename(s): -- cgit v1.2.3 From da779b4924eb3078233c7f5730e26cc73dd91a4a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Nov 2012 23:58:47 +0100 Subject: Fall back to urllib instead of urllib2 for Python 3 urllib.parse --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ccefc66a0..ac7e161af 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -43,7 +43,7 @@ except ImportError: # Python 2 try: import urllib.parse as compat_urllib_parse except ImportError: # Python 2 - import urllib2 as compat_urllib_parse + import urllib as compat_urllib_parse try: import http.cookiejar as compat_cookiejar -- cgit v1.2.3 From 3e669f369f886dff8fa8272f3bfa37be6360a0ba Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:02:55 +0100 Subject: Py3 compat for unichr and htmlentitydefs --- youtube_dl/utils.py | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ac7e161af..668338270 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import gzip -import htmlentitydefs import HTMLParser import locale import os @@ -17,19 +16,6 @@ try: except ImportError: import StringIO -std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate', - 'Accept-Language': 'en-us,en;q=0.5', -} - -try: - compat_str = unicode # Python 2 -except NameError: - 
compat_str = str - try: import urllib.request as compat_urllib_request except ImportError: # Python 2 @@ -50,6 +36,29 @@ try: except ImportError: # Python 2 import cookielib as compat_cookiejar +try: + import html.entities as compat_html_entities +except NameError: # Python 2 + import htmlentitydefs as compat_html_entities + +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + compat_chr = unichr # Python 2 +except NameError: + compat_chr = chr + + +std_headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + 'Accept-Language': 'en-us,en;q=0.5', +} def preferredencoding(): """Get preferred encoding. @@ -74,8 +83,8 @@ def htmlentity_transform(matchobj): entity = matchobj.group(1) # Known non-numeric HTML entity - if entity in htmlentitydefs.name2codepoint: - return unichr(htmlentitydefs.name2codepoint[entity]) + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) mobj = re.match(u'(?u)#(x?\\d+)', entity) if mobj is not None: @@ -85,7 +94,7 @@ def htmlentity_transform(matchobj): numstr = u'0%s' % numstr else: base = 10 - return unichr(int(numstr, base)) + return compat_chr(int(numstr, base)) # Unknown entity in name, return its literal representation return (u'&%s;' % entity) -- cgit v1.2.3 From a8156c1d2e4b2a7ac5e034c247c6fccaca15a21d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:06:28 +0100 Subject: Python 3 version of HTMLParser --- youtube_dl/utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 668338270..c4917012b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,7 +2,6 @@ # -*- coding: 
utf-8 -*- import gzip -import HTMLParser import locale import os import re @@ -41,6 +40,11 @@ try: except NameError: # Python 2 import htmlentitydefs as compat_html_entities +try: + import html.parser as compat_html_parser +except NameError: # Python 2 + import HTMLParser as compat_html_parser + try: compat_str = unicode # Python 2 except NameError: @@ -99,8 +103,8 @@ def htmlentity_transform(matchobj): # Unknown entity in name, return its literal representation return (u'&%s;' % entity) -HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(HTMLParser.HTMLParser): +compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix +class IDParser(compat_html_parser.HTMLParser): """Modified HTMLParser that isolates a tag with the specified id""" def __init__(self, id): self.id = id @@ -110,11 +114,11 @@ class IDParser(HTMLParser.HTMLParser): self.html = None self.watch_startpos = False self.error_count = 0 - HTMLParser.HTMLParser.__init__(self) + compat_html_parser.HTMLParser.__init__(self) def error(self, message): if self.error_count > 10 or self.started: - raise HTMLParser.HTMLParseError(message, self.getpos()) + raise compat_html_parser.HTMLParseError(message, self.getpos()) self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line self.error_count += 1 self.goahead(1) @@ -170,7 +174,7 @@ def get_element_by_id(id, html): parser = IDParser(id) try: parser.loads(html) - except HTMLParser.HTMLParseError: + except compat_html_parser.HTMLParseError: pass return parser.get_result() -- cgit v1.2.3 From 03f9daab34605f538294fdffb141ef5d9fc670e6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:09:17 +0100 Subject: Use 
io.BytesIO instead of StringIO --- youtube_dl/utils.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c4917012b..ebff2e8f2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import gzip +import io import locale import os import re @@ -10,11 +11,6 @@ import zlib import email.utils import json -try: - import cStringIO as StringIO -except ImportError: - import StringIO - try: import urllib.request as compat_urllib_request except ImportError: # Python 2 @@ -400,12 +396,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': - gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') + gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg # deflate if resp.headers.get('Content-encoding', '') == 'deflate': - gz = StringIO.StringIO(self.deflate(resp.read())) + gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg return resp -- cgit v1.2.3 From 348d0a7a18fd4aa2512418b8be48f5011fbd3f99 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:13:00 +0100 Subject: Py2/3 compatibility for http.client --- youtube_dl/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ebff2e8f2..370567705 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -41,6 +41,12 @@ try: except NameError: # Python 2 import HTMLParser as compat_html_parser +try: + import http.client as compat_html_client +except NameError: # Python 2 + import httplib as compat_html_client + + try: compat_str = unicode # Python 2 except 
NameError: -- cgit v1.2.3 From 9f37a9594147b71b1ce796219f75fc3a2bb544d3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:17:12 +0100 Subject: Py2/3 parse_qs compatibility --- youtube_dl/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 370567705..a27ac77dd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -33,19 +33,23 @@ except ImportError: # Python 2 try: import html.entities as compat_html_entities -except NameError: # Python 2 +except ImportError: # Python 2 import htmlentitydefs as compat_html_entities try: import html.parser as compat_html_parser -except NameError: # Python 2 +except ImportError: # Python 2 import HTMLParser as compat_html_parser try: import http.client as compat_html_client -except NameError: # Python 2 +except ImportError: # Python 2 import httplib as compat_html_client +try: + from urllib.parse.parse_qs import parse_qs as compat_parse_qs +except ImportError: # Python 2 + from urlparse import parse_qs as compat_parse_qs try: compat_str = unicode # Python 2 -- cgit v1.2.3 From 73dce4b2e4cb6eea951dbd682a92ad7508c957b0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:17:59 +0100 Subject: Import from the correct module --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a27ac77dd..41e6b8550 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -47,7 +47,7 @@ except ImportError: # Python 2 import httplib as compat_html_client try: - from urllib.parse.parse_qs import parse_qs as compat_parse_qs + from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 from urlparse import parse_qs as compat_parse_qs -- cgit v1.2.3 From 5bd9cc7a6a93fbd2b3b06cbdfbaaf8b7e89e8b7b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister 
Date: Wed, 28 Nov 2012 00:22:55 +0100 Subject: typo --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 41e6b8550..3fcb0927f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -42,9 +42,9 @@ except ImportError: # Python 2 import HTMLParser as compat_html_parser try: - import http.client as compat_html_client + import http.client as compat_http_client except ImportError: # Python 2 - import httplib as compat_html_client + import httplib as compat_http_client try: from urllib.parse import parse_qs as compat_parse_qs -- cgit v1.2.3 From 8cd10ac4efb5168f3ceb18ec94338bce73e166a7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:46:21 +0100 Subject: Fix printing title etc. --- youtube_dl/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3fcb0927f..9db7b9d9d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -61,7 +61,6 @@ try: except NameError: compat_chr = chr - std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', @@ -83,6 +82,12 @@ def preferredencoding(): return pref +if sys.version_info < (3,0): + def compat_print(s): + print(s.encode(preferredencoding(), 'xmlcharrefreplace')) +else: + def compat_print(s): + print(s) def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. 
-- cgit v1.2.3 From e6137fd61d47ba8624e9baa84701adf929e38849 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:53:09 +0100 Subject: Remove superfluous encodings --- youtube_dl/utils.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9db7b9d9d..0aa350e64 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -87,6 +87,7 @@ if sys.version_info < (3,0): print(s.encode(preferredencoding(), 'xmlcharrefreplace')) else: def compat_print(s): + assert type(s) == type(u'') print(s) def htmlentity_transform(matchobj): -- cgit v1.2.3 From 0f00efed4c06fefcd4da7294cb3c92bccf081eaa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 00:56:20 +0100 Subject: Woooohooo! python3 youtube_dl BaW_jenozKc -t works! --- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0aa350e64..a5df62bf8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -292,6 +292,10 @@ def encodeFilename(s): assert type(s) == type(u'') + # Python 3 has a Unicode API + if sys.version_info >= (3, 0): + return s + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: # Pass u'' directly to use Unicode APIs on Windows 2000 and up # (Detecting Windows NT 4 is tricky because 'major >= 4' would -- cgit v1.2.3 From 40b35b4aa6040ecc3ff7b3c9c8b908249633d86e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 02:01:09 +0100 Subject: hack for apparently broken parse_qs in python2 --- youtube_dl/utils.py | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a5df62bf8..cf78e9dc8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -49,7 +49,81 @@ except ImportError: # Python 2 try: from 
urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 - from urlparse import parse_qs as compat_parse_qs + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _unquote(string, encoding='utf-8', errors='replace'): + if string == '': + return string + res = string.split('%') + if len(res) == 1: + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + pct_sequence = b'' + string = res[0] + for item in res[1:]: + try: + if not item: + raise ValueError + pct_sequence += item[:2].decode('hex') + rest = item[2:] + if not rest: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + continue + except ValueError: + rest = '%' + item + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. 
+ string += pct_sequence.decode(encoding, errors) + rest + pct_sequence = b'' + if pct_sequence: + # Flush the final pct_sequence + string += pct_sequence.decode(encoding, errors) + return string + + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, unicode + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError("bad query field: %r" % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = _unquote(name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = _unquote(value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result try: compat_str = unicode # Python 2 -- cgit v1.2.3 From 59ae15a507cbf93de13f8fda8444d2d9a251747a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 02:04:46 +0100 Subject: Convert all tabs to 4 spaces (PEP8) --- youtube_dl/utils.py | 788 ++++++++++++++++++++++++++-------------------------- 1 file changed, 394 insertions(+), 394 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index cf78e9dc8..836138277 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,490 
+12,490 @@ import email.utils import json try: - import urllib.request as compat_urllib_request + import urllib.request as compat_urllib_request except ImportError: # Python 2 - import urllib2 as compat_urllib_request + import urllib2 as compat_urllib_request try: - import urllib.error as compat_urllib_error + import urllib.error as compat_urllib_error except ImportError: # Python 2 - import urllib2 as compat_urllib_error + import urllib2 as compat_urllib_error try: - import urllib.parse as compat_urllib_parse + import urllib.parse as compat_urllib_parse except ImportError: # Python 2 - import urllib as compat_urllib_parse + import urllib as compat_urllib_parse try: - import http.cookiejar as compat_cookiejar + import http.cookiejar as compat_cookiejar except ImportError: # Python 2 - import cookielib as compat_cookiejar + import cookielib as compat_cookiejar try: - import html.entities as compat_html_entities + import html.entities as compat_html_entities except ImportError: # Python 2 - import htmlentitydefs as compat_html_entities + import htmlentitydefs as compat_html_entities try: - import html.parser as compat_html_parser + import html.parser as compat_html_parser except ImportError: # Python 2 - import HTMLParser as compat_html_parser + import HTMLParser as compat_html_parser try: - import http.client as compat_http_client + import http.client as compat_http_client except ImportError: # Python 2 - import httplib as compat_http_client + import httplib as compat_http_client try: - from urllib.parse import parse_qs as compat_parse_qs + from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. 
- # Python 2's version is apparently totally broken - def _unquote(string, encoding='utf-8', errors='replace'): - if string == '': - return string - res = string.split('%') - if len(res) == 1: - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. - string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, unicode - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError("bad query field: %r" % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = _unquote(name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = _unquote(value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', 
errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _unquote(string, encoding='utf-8', errors='replace'): + if string == '': + return string + res = string.split('%') + if len(res) == 1: + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + pct_sequence = b'' + string = res[0] + for item in res[1:]: + try: + if not item: + raise ValueError + pct_sequence += item[:2].decode('hex') + rest = item[2:] + if not rest: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + continue + except ValueError: + rest = '%' + item + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. 
+ string += pct_sequence.decode(encoding, errors) + rest + pct_sequence = b'' + if pct_sequence: + # Flush the final pct_sequence + string += pct_sequence.decode(encoding, errors) + return string + + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, unicode + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError("bad query field: %r" % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = _unquote(name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = _unquote(value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result try: - compat_str = unicode # Python 2 + compat_str = unicode # Python 2 except NameError: - compat_str = str + compat_str = str try: - compat_chr = unichr # Python 2 + compat_chr = unichr # Python 2 except NameError: - compat_chr = chr + compat_chr = chr std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate', - 'Accept-Language': 
'en-us,en;q=0.5', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', + 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + 'Accept-Language': 'en-us,en;q=0.5', } def preferredencoding(): - """Get preferred encoding. + """Get preferred encoding. - Returns the best encoding scheme for the system, based on - locale.getpreferredencoding() and some further tweaks. - """ - try: - pref = locale.getpreferredencoding() - u'TEST'.encode(pref) - except: - pref = 'UTF-8' + Returns the best encoding scheme for the system, based on + locale.getpreferredencoding() and some further tweaks. + """ + try: + pref = locale.getpreferredencoding() + u'TEST'.encode(pref) + except: + pref = 'UTF-8' - return pref + return pref if sys.version_info < (3,0): - def compat_print(s): - print(s.encode(preferredencoding(), 'xmlcharrefreplace')) + def compat_print(s): + print(s.encode(preferredencoding(), 'xmlcharrefreplace')) else: - def compat_print(s): - assert type(s) == type(u'') - print(s) + def compat_print(s): + assert type(s) == type(u'') + print(s) def htmlentity_transform(matchobj): - """Transforms an HTML entity to a character. - - This function receives a match object and is intended to be used with - the re.sub() function. - """ - entity = matchobj.group(1) - - # Known non-numeric HTML entity - if entity in compat_html_entities.name2codepoint: - return compat_chr(compat_html_entities.name2codepoint[entity]) - - mobj = re.match(u'(?u)#(x?\\d+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith(u'x'): - base = 16 - numstr = u'0%s' % numstr - else: - base = 10 - return compat_chr(int(numstr, base)) - - # Unknown entity in name, return its literal representation - return (u'&%s;' % entity) + """Transforms an HTML entity to a character. 
+ + This function receives a match object and is intended to be used with + the re.sub() function. + """ + entity = matchobj.group(1) + + # Known non-numeric HTML entity + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) + + mobj = re.match(u'(?u)#(x?\\d+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith(u'x'): + base = 16 + numstr = u'0%s' % numstr + else: + base = 10 + return compat_chr(int(numstr, base)) + + # Unknown entity in name, return its literal representation + return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix class IDParser(compat_html_parser.HTMLParser): - """Modified HTMLParser that isolates a tag with the specified id""" - def __init__(self, id): - self.id = id - self.result = None - self.started = False - self.depth = {} - self.html = None - self.watch_startpos = False - self.error_count = 0 - compat_html_parser.HTMLParser.__init__(self) - - def error(self, message): - if self.error_count > 10 or self.started: - raise compat_html_parser.HTMLParseError(message, self.getpos()) - self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line - self.error_count += 1 - self.goahead(1) - - def loads(self, html): - self.html = html - self.feed(html) - self.close() - - def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - if self.started: - self.find_startpos(None) - if 'id' in attrs and attrs['id'] == self.id: - self.result = [tag] - self.started = True - self.watch_startpos = True - if self.started: - if not tag in self.depth: self.depth[tag] = 0 - self.depth[tag] += 1 - - def handle_endtag(self, tag): - if self.started: - if tag in self.depth: self.depth[tag] -= 1 - if self.depth[self.result[0]] == 0: - self.started = False - 
self.result.append(self.getpos()) - - def find_startpos(self, x): - """Needed to put the start position of the result (self.result[1]) - after the opening tag with the requested id""" - if self.watch_startpos: - self.watch_startpos = False - self.result.append(self.getpos()) - handle_entityref = handle_charref = handle_data = handle_comment = \ - handle_decl = handle_pi = unknown_decl = find_startpos - - def get_result(self): - if self.result is None: - return None - if len(self.result) != 3: - return None - lines = self.html.split('\n') - lines = lines[self.result[1][0]-1:self.result[2][0]] - lines[0] = lines[0][self.result[1][1]:] - if len(lines) == 1: - lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] - lines[-1] = lines[-1][:self.result[2][1]] - return '\n'.join(lines).strip() + """Modified HTMLParser that isolates a tag with the specified id""" + def __init__(self, id): + self.id = id + self.result = None + self.started = False + self.depth = {} + self.html = None + self.watch_startpos = False + self.error_count = 0 + compat_html_parser.HTMLParser.__init__(self) + + def error(self, message): + if self.error_count > 10 or self.started: + raise compat_html_parser.HTMLParseError(message, self.getpos()) + self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line + self.error_count += 1 + self.goahead(1) + + def loads(self, html): + self.html = html + self.feed(html) + self.close() + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if self.started: + self.find_startpos(None) + if 'id' in attrs and attrs['id'] == self.id: + self.result = [tag] + self.started = True + self.watch_startpos = True + if self.started: + if not tag in self.depth: self.depth[tag] = 0 + self.depth[tag] += 1 + + def handle_endtag(self, tag): + if self.started: + if tag in self.depth: self.depth[tag] -= 1 + if self.depth[self.result[0]] == 0: + self.started = False + self.result.append(self.getpos()) + + def find_startpos(self, x): + 
"""Needed to put the start position of the result (self.result[1]) + after the opening tag with the requested id""" + if self.watch_startpos: + self.watch_startpos = False + self.result.append(self.getpos()) + handle_entityref = handle_charref = handle_data = handle_comment = \ + handle_decl = handle_pi = unknown_decl = find_startpos + + def get_result(self): + if self.result is None: + return None + if len(self.result) != 3: + return None + lines = self.html.split('\n') + lines = lines[self.result[1][0]-1:self.result[2][0]] + lines[0] = lines[0][self.result[1][1]:] + if len(lines) == 1: + lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] + lines[-1] = lines[-1][:self.result[2][1]] + return '\n'.join(lines).strip() def get_element_by_id(id, html): - """Return the content of the tag with the specified id in the passed HTML document""" - parser = IDParser(id) - try: - parser.loads(html) - except compat_html_parser.HTMLParseError: - pass - return parser.get_result() + """Return the content of the tag with the specified id in the passed HTML document""" + parser = IDParser(id) + try: + parser.loads(html) + except compat_html_parser.HTMLParseError: + pass + return parser.get_result() def clean_html(html): - """Clean an HTML snippet into a readable string""" - # Newline vs
- html = html.replace('\n', ' ') - html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) - # Strip html tags - html = re.sub('<.*?>', '', html) - # Replace html entities - html = unescapeHTML(html) - return html + """Clean an HTML snippet into a readable string""" + # Newline vs
+ html = html.replace('\n', ' ') + html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) + # Strip html tags + html = re.sub('<.*?>', '', html) + # Replace html entities + html = unescapeHTML(html) + return html def sanitize_open(filename, open_mode): - """Try to open the given filename, and slightly tweak it if this fails. - - Attempts to open the given filename. If this fails, it tries to change - the filename slightly, step by step, until it's either able to open it - or it fails and raises a final exception, like the standard open() - function. - - It returns the tuple (stream, definitive_file_name). - """ - try: - if filename == u'-': - if sys.platform == 'win32': - import msvcrt - msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - return (sys.stdout, filename) - stream = open(encodeFilename(filename), open_mode) - return (stream, filename) - except (IOError, OSError) as err: - # In case of error, try to remove win32 forbidden chars - filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename) - - # An exception here should be caught in the caller - stream = open(encodeFilename(filename), open_mode) - return (stream, filename) + """Try to open the given filename, and slightly tweak it if this fails. + + Attempts to open the given filename. If this fails, it tries to change + the filename slightly, step by step, until it's either able to open it + or it fails and raises a final exception, like the standard open() + function. + + It returns the tuple (stream, definitive_file_name). 
+ """ + try: + if filename == u'-': + if sys.platform == 'win32': + import msvcrt + msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) + return (sys.stdout, filename) + stream = open(encodeFilename(filename), open_mode) + return (stream, filename) + except (IOError, OSError) as err: + # In case of error, try to remove win32 forbidden chars + filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename) + + # An exception here should be caught in the caller + stream = open(encodeFilename(filename), open_mode) + return (stream, filename) def timeconvert(timestr): - """Convert RFC 2822 defined time string into system timestamp""" - timestamp = None - timetuple = email.utils.parsedate_tz(timestr) - if timetuple is not None: - timestamp = email.utils.mktime_tz(timetuple) - return timestamp + """Convert RFC 2822 defined time string into system timestamp""" + timestamp = None + timetuple = email.utils.parsedate_tz(timestr) + if timetuple is not None: + timestamp = email.utils.mktime_tz(timetuple) + return timestamp def sanitize_filename(s, restricted=False): - """Sanitizes a string so it could be used as part of a filename. - If restricted is set, use a stricter subset of allowed characters. - """ - def replace_insane(char): - if char == '?' or ord(char) < 32 or ord(char) == 127: - return '' - elif char == '"': - return '' if restricted else '\'' - elif char == ':': - return '_-' if restricted else ' -' - elif char in '\\/|*<>': - return '_' - if restricted and (char in '!&\'' or char.isspace()): - return '_' - if restricted and ord(char) > 127: - return '_' - return char - - result = u''.join(map(replace_insane, s)) - while '__' in result: - result = result.replace('__', '_') - result = result.strip('_') - # Common case of "Foreign band name - English song title" - if restricted and result.startswith('-_'): - result = result[2:] - if not result: - result = '_' - return result + """Sanitizes a string so it could be used as part of a filename. 
+ If restricted is set, use a stricter subset of allowed characters. + """ + def replace_insane(char): + if char == '?' or ord(char) < 32 or ord(char) == 127: + return '' + elif char == '"': + return '' if restricted else '\'' + elif char == ':': + return '_-' if restricted else ' -' + elif char in '\\/|*<>': + return '_' + if restricted and (char in '!&\'' or char.isspace()): + return '_' + if restricted and ord(char) > 127: + return '_' + return char + + result = u''.join(map(replace_insane, s)) + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if not result: + result = '_' + return result def orderedSet(iterable): - """ Remove all duplicates from the input iterable """ - res = [] - for el in iterable: - if el not in res: - res.append(el) - return res + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res def unescapeHTML(s): - """ - @param s a string - """ - assert type(s) == type(u'') + """ + @param s a string + """ + assert type(s) == type(u'') - result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) - return result + result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) + return result def encodeFilename(s): - """ - @param s The name of the file - """ + """ + @param s The name of the file + """ - assert type(s) == type(u'') + assert type(s) == type(u'') - # Python 3 has a Unicode API - if sys.version_info >= (3, 0): - return s + # Python 3 has a Unicode API + if sys.version_info >= (3, 0): + return s - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # Pass u'' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
- return s - else: - return s.encode(sys.getfilesystemencoding(), 'ignore') + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: + # Pass u'' directly to use Unicode APIs on Windows 2000 and up + # (Detecting Windows NT 4 is tricky because 'major >= 4' would + # match Windows 9x series as well. Besides, NT 4 is obsolete.) + return s + else: + return s.encode(sys.getfilesystemencoding(), 'ignore') class DownloadError(Exception): - """Download Error exception. + """Download Error exception. - This exception may be thrown by FileDownloader objects if they are not - configured to continue on errors. They will contain the appropriate - error message. - """ - pass + This exception may be thrown by FileDownloader objects if they are not + configured to continue on errors. They will contain the appropriate + error message. + """ + pass class SameFileError(Exception): - """Same File exception. + """Same File exception. - This exception will be thrown by FileDownloader objects if they detect - multiple files would have to be downloaded to the same file on disk. - """ - pass + This exception will be thrown by FileDownloader objects if they detect + multiple files would have to be downloaded to the same file on disk. + """ + pass class PostProcessingError(Exception): - """Post Processing exception. + """Post Processing exception. - This exception may be raised by PostProcessor's .run() method to - indicate an error in the postprocessing task. - """ - pass + This exception may be raised by PostProcessor's .run() method to + indicate an error in the postprocessing task. + """ + pass class MaxDownloadsReached(Exception): - """ --max-downloads limit has been reached. """ - pass + """ --max-downloads limit has been reached. """ + pass class UnavailableVideoError(Exception): - """Unavailable Format exception. + """Unavailable Format exception. - This exception will be thrown when a video is requested - in a format that is not available for that video. 
- """ - pass + This exception will be thrown when a video is requested + in a format that is not available for that video. + """ + pass class ContentTooShortError(Exception): - """Content Too Short exception. + """Content Too Short exception. - This exception may be raised by FileDownloader objects when a file they - download is too small for what the server announced first, indicating - the connection was probably interrupted. - """ - # Both in bytes - downloaded = None - expected = None + This exception may be raised by FileDownloader objects when a file they + download is too small for what the server announced first, indicating + the connection was probably interrupted. + """ + # Both in bytes + downloaded = None + expected = None - def __init__(self, downloaded, expected): - self.downloaded = downloaded - self.expected = expected + def __init__(self, downloaded, expected): + self.downloaded = downloaded + self.expected = expected class Trouble(Exception): - """Trouble helper exception + """Trouble helper exception - This is an exception to be handled with - FileDownloader.trouble - """ + This is an exception to be handled with + FileDownloader.trouble + """ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): - """Handler for HTTP requests and responses. - - This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped and - deflated responses from web servers. If compression is to be avoided in - a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be - removed before making the real request. - - Part of this code was copied from: - - http://techknack.net/python-urllib2-handlers/ - - Andrew Rowls, the author of that code, agreed to release it to the - public domain. 
- """ - - @staticmethod - def deflate(data): - try: - return zlib.decompress(data, -zlib.MAX_WBITS) - except zlib.error: - return zlib.decompress(data) - - @staticmethod - def addinfourl_wrapper(stream, headers, url, code): - if hasattr(compat_urllib_request.addinfourl, 'getcode'): - return compat_urllib_request.addinfourl(stream, headers, url, code) - ret = compat_urllib_request.addinfourl(stream, headers, url) - ret.code = code - return ret - - def http_request(self, req): - for h in std_headers: - if h in req.headers: - del req.headers[h] - req.add_header(h, std_headers[h]) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] - return req - - def http_response(self, req, resp): - old_resp = resp - # gzip - if resp.headers.get('Content-encoding', '') == 'gzip': - gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - # deflate - if resp.headers.get('Content-encoding', '') == 'deflate': - gz = io.BytesIO(self.deflate(resp.read())) - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - return resp + """Handler for HTTP requests and responses. + + This class, when installed with an OpenerDirector, automatically adds + the standard headers to every HTTP request and handles gzipped and + deflated responses from web servers. If compression is to be avoided in + a particular request, the original request in the program code only has + to include the HTTP header "Youtubedl-No-Compression", which will be + removed before making the real request. + + Part of this code was copied from: + + http://techknack.net/python-urllib2-handlers/ + + Andrew Rowls, the author of that code, agreed to release it to the + public domain. 
+ """ + + @staticmethod + def deflate(data): + try: + return zlib.decompress(data, -zlib.MAX_WBITS) + except zlib.error: + return zlib.decompress(data) + + @staticmethod + def addinfourl_wrapper(stream, headers, url, code): + if hasattr(compat_urllib_request.addinfourl, 'getcode'): + return compat_urllib_request.addinfourl(stream, headers, url, code) + ret = compat_urllib_request.addinfourl(stream, headers, url) + ret.code = code + return ret + + def http_request(self, req): + for h in std_headers: + if h in req.headers: + del req.headers[h] + req.add_header(h, std_headers[h]) + if 'Youtubedl-no-compression' in req.headers: + if 'Accept-encoding' in req.headers: + del req.headers['Accept-encoding'] + del req.headers['Youtubedl-no-compression'] + return req + + def http_response(self, req, resp): + old_resp = resp + # gzip + if resp.headers.get('Content-encoding', '') == 'gzip': + gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') + resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + # deflate + if resp.headers.get('Content-encoding', '') == 'deflate': + gz = io.BytesIO(self.deflate(resp.read())) + resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + return resp -- cgit v1.2.3 From 799c0763845dfb82d53ecae0080b276f447144de Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 04:51:27 +0100 Subject: collegehumor: able to download a single f4f file (not yet playable) --- youtube_dl/utils.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 836138277..4600dc967 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -26,6 +26,11 @@ try: except ImportError: # Python 2 import urllib as compat_urllib_parse +try: + from urllib.parse import urlparse as compat_urllib_parse_urlparse +except ImportError: # Python 2 + from urlparse import 
urlparse as compat_urllib_parse_urlparse + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 -- cgit v1.2.3 From 627dcfff398b00429c8b310f1c1775f9be6f0268 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Nov 2012 12:59:27 +0100 Subject: Restrict more characters (Closes #566) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4600dc967..7f73b8476 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -330,7 +330,7 @@ def sanitize_filename(s, restricted=False): return '_-' if restricted else ' -' elif char in '\\/|*<>': return '_' - if restricted and (char in '!&\'' or char.isspace()): + if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()): return '_' if restricted and ord(char) > 127: return '_' -- cgit v1.2.3 From 796173d08b514182eedc704541eb55d5c9e1dc0d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 3 Dec 2012 15:36:24 +0100 Subject: Keep video IDs verbatim if possible (Closes #571) --- youtube_dl/utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7f73b8476..4dcf18991 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -317,9 +317,10 @@ def timeconvert(timestr): timestamp = email.utils.mktime_tz(timetuple) return timestamp -def sanitize_filename(s, restricted=False): +def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. + Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): if char == '?' 
or ord(char) < 32 or ord(char) == 127: @@ -337,14 +338,15 @@ def sanitize_filename(s, restricted=False): return char result = u''.join(map(replace_insane, s)) - while '__' in result: - result = result.replace('__', '_') - result = result.strip('_') - # Common case of "Foreign band name - English song title" - if restricted and result.startswith('-_'): - result = result[2:] - if not result: - result = '_' + if not is_id: + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if not result: + result = '_' return result def orderedSet(iterable): -- cgit v1.2.3 From 0f8d03f81cb20ba0f2a4358b8111146b589d1c5d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 7 Dec 2012 00:39:44 +0100 Subject: Let YoutubeDLHandler (transparent gzip) handle HTTPS URLs as well (Needed for #579) --- youtube_dl/utils.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4dcf18991..44f939053 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -506,3 +506,6 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg return resp + + https_request = http_request + https_response = http_response -- cgit v1.2.3 From 5910e210f48826c6d078ef3744b25d209535c3ae Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 16 Dec 2012 12:29:03 +0100 Subject: Fix --extract-audio on Python 3 --- youtube_dl/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 44f939053..25b67db06 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -51,6 +51,12 @@ try: except ImportError: # Python 2 import httplib as compat_http_client +try: + from 
subprocess import DEVNULL + compat_subprocess_get_DEVNULL = lambda: DEVNULL +except ImportError: + compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 -- cgit v1.2.3 From f4bfd65ff2bfce77a6953281c037ca8e516b7648 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 20 Dec 2012 13:13:24 +0100 Subject: Correct JSON writing (Closes #596) --- youtube_dl/utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 25b67db06..4e64f327a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ import gzip import io +import json import locale import os import re @@ -175,6 +176,18 @@ else: assert type(s) == type(u'') print(s) +# In Python 2.x, json.dump expects a bytestream. +# In Python 3.x, it writes to a character stream +if sys.version_info < (3,0): + def write_json_file(obj, fn): + with open(fn, 'wb') as f: + json.dump(obj, f) +else: + def write_json_file(obj, fn): + with open(fn, 'w', encoding='utf-8') as f: + json.dump(obj, f) + + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. 
-- cgit v1.2.3 From 43e8fafd49f94ebf4776c84697e4b815750ec701 Mon Sep 17 00:00:00 2001 From: Nick Daniels Date: Wed, 19 Dec 2012 14:21:14 +0000 Subject: Refactor IDParser to search for elements by any attribute not just ID --- youtube_dl/utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4e64f327a..a5196b0ae 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -214,10 +214,11 @@ def htmlentity_transform(matchobj): return (u'&%s;' % entity) compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(compat_html_parser.HTMLParser): - """Modified HTMLParser that isolates a tag with the specified id""" - def __init__(self, id): - self.id = id +class AttrParser(compat_html_parser.HTMLParser): + """Modified HTMLParser that isolates a tag with the specified attribute""" + def __init__(self, attribute, value): + self.attribute = attribute + self.value = value self.result = None self.started = False self.depth = {} @@ -242,7 +243,7 @@ class IDParser(compat_html_parser.HTMLParser): attrs = dict(attrs) if self.started: self.find_startpos(None) - if 'id' in attrs and attrs['id'] == self.id: + if self.attribute in attrs and attrs[self.attribute] == self.value: self.result = [tag] self.started = True self.watch_startpos = True @@ -280,8 +281,12 @@ class IDParser(compat_html_parser.HTMLParser): return '\n'.join(lines).strip() def get_element_by_id(id, html): - """Return the content of the tag with the specified id in the passed HTML document""" - parser = IDParser(id) + """Return the content of the tag with the specified ID in the passed HTML document""" + return get_element_by_attribute("id", id, html) + +def get_element_by_attribute(attribute, value, html): + """Return the content of 
the tag with the specified attribute in the passed HTML document""" + parser = AttrParser(attribute, value) try: parser.loads(html) except compat_html_parser.HTMLParseError: -- cgit v1.2.3 From 056d857571158264aefb8d9f7d47c0dad768be63 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Thu, 20 Dec 2012 11:26:38 +0100 Subject: refactor YouTube subtitles code, it was ugly (my bad) --- youtube_dl/utils.py | 8 -------- 1 file changed, 8 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a5196b0ae..c18c9beed 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -465,14 +465,6 @@ class ContentTooShortError(Exception): self.downloaded = downloaded self.expected = expected - -class Trouble(Exception): - """Trouble helper exception - - This is an exception to be handled with - FileDownloader.trouble - """ - class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. -- cgit v1.2.3 From 6b3aef80ceba9b4715065be924dcb1f83ec36655 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Thu, 20 Dec 2012 16:30:55 +0100 Subject: better Vimeo tests; fixed a couple of VimeoIE fields --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c18c9beed..463804e18 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -298,7 +298,8 @@ def clean_html(html): """Clean an HTML snippet into a readable string""" # Newline vs <br />
html = html.replace('\n', ' ') - html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities -- cgit v1.2.3 From cb6ff87fbb05e421f77b57a79699c647866ceb09 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Wed, 26 Dec 2012 23:22:49 +0100 Subject: The new updates system, relies on gh-pages, secured by RSA, uses external web servers --- youtube_dl/utils.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 463804e18..7d6041929 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -410,6 +410,34 @@ def encodeFilename(s): else: return s.encode(sys.getfilesystemencoding(), 'ignore') +def rsa_verify(message, signature, key): + from struct import pack + from hashlib import sha256 + from sys import version_info + def b(x): + if version_info[0] == 2: return x + else: return x.encode('latin1') + assert(type(message) == type(b(''))) + block_size = 0 + n = key[0] + while n: + block_size += 1 + n >>= 8 + signature = pow(int(signature, 16), key[1], key[0]) + raw_bytes = [] + while signature: + raw_bytes.insert(0, pack("B", signature & 0xFF)) + signature >>= 8 + signature = (block_size - len(raw_bytes)) * b('\x00') + b('').join(raw_bytes) + if signature[0:2] != b('\x00\x01'): return False + signature = signature[2:] + if not b('\x00') in signature: return False + signature = signature[signature.index(b('\x00'))+1:] + if not signature.startswith(b('\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20')): return False + signature = signature[19:] + if signature != sha256(message).digest(): return False + return True + class DownloadError(Exception): """Download Error exception. 
-- cgit v1.2.3 From f427df17abc9508f88af9d904ac0520d610c0e9c Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 30 Dec 2012 18:22:36 +0100 Subject: some fixes, pulled the codename from the code --- youtube_dl/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7d6041929..9784abb24 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -154,6 +154,7 @@ std_headers = { 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', } + def preferredencoding(): """Get preferred encoding. @@ -187,6 +188,11 @@ else: with open(fn, 'w', encoding='utf-8') as f: json.dump(obj, f) +# Some library functions return bytestring on 2.X and unicode on 3.X +def enforce_unicode(s, encoding='utf-8'): + if type(s) != type(u''): + return s.decode(encoding) + return s def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. -- cgit v1.2.3 From d5ed35b664628f40945959e5c07bfbf4c0bd134a Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 30 Dec 2012 19:49:14 +0100 Subject: moved updating code to update.py --- youtube_dl/utils.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9784abb24..e9336bcdd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -188,12 +188,6 @@ else: with open(fn, 'w', encoding='utf-8') as f: json.dump(obj, f) -# Some library functions return bytestring on 2.X and unicode on 3.X -def enforce_unicode(s, encoding='utf-8'): - if type(s) != type(u''): - return s.decode(encoding) - return s - def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. 
@@ -416,34 +410,6 @@ def encodeFilename(s): else: return s.encode(sys.getfilesystemencoding(), 'ignore') -def rsa_verify(message, signature, key): - from struct import pack - from hashlib import sha256 - from sys import version_info - def b(x): - if version_info[0] == 2: return x - else: return x.encode('latin1') - assert(type(message) == type(b(''))) - block_size = 0 - n = key[0] - while n: - block_size += 1 - n >>= 8 - signature = pow(int(signature, 16), key[1], key[0]) - raw_bytes = [] - while signature: - raw_bytes.insert(0, pack("B", signature & 0xFF)) - signature >>= 8 - signature = (block_size - len(raw_bytes)) * b('\x00') + b('').join(raw_bytes) - if signature[0:2] != b('\x00\x01'): return False - signature = signature[2:] - if not b('\x00') in signature: return False - signature = signature[signature.index(b('\x00'))+1:] - if not signature.startswith(b('\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20')): return False - signature = signature[19:] - if signature != sha256(message).digest(): return False - return True - class DownloadError(Exception): """Download Error exception. 
-- cgit v1.2.3 From 1c256f7047051bf351ed5aedb95d8e705685a06b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 1 Jan 2013 20:27:53 +0100 Subject: ExtractorError for errors during extraction --- youtube_dl/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e9336bcdd..91e180326 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -410,6 +410,17 @@ def encodeFilename(s): else: return s.encode(sys.getfilesystemencoding(), 'ignore') + +class ExtractorError(Exception): + """Error during info extraction.""" + def __init__(self, msg, tb=None): + """ tb is the original traceback (so that it can be printed out) """ + super(ExtractorError, self).__init__(msg) + if tb is None: + tb = sys.exc_info()[2] + self.traceback = tb + + class DownloadError(Exception): """Download Error exception. -- cgit v1.2.3 From 01951dda7a27d3bd2331f22ded5d33876cf1dad9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 3 Jan 2013 15:39:55 +0100 Subject: Make ExtractorError usable for other causes --- youtube_dl/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'youtube_dl/utils.py') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 91e180326..8f856ee8c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -8,6 +8,7 @@ import locale import os import re import sys +import traceback import zlib import email.utils import json @@ -414,12 +415,15 @@ def encodeFilename(s): class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): - """ tb is the original traceback (so that it can be printed out) """ + """ tb, if given, is the original traceback (so that it can be printed out). 
""" super(ExtractorError, self).__init__(msg) - if tb is None: - tb = sys.exc_info()[2] self.traceback = tb + def format_traceback(self): + if self.traceback is None: + return None + return u''.join(traceback.format_tb(self.traceback)) + class DownloadError(Exception): """Download Error exception. -- cgit v1.2.3