diff options
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 120 | 
1 file changed, 77 insertions, 43 deletions
| diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7dbe25661..d0606b4bc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@  from __future__ import unicode_literals +import base64  import calendar  import codecs  import contextlib @@ -35,6 +36,7 @@ import zlib  from .compat import (      compat_basestring,      compat_chr, +    compat_etree_fromstring,      compat_html_entities,      compat_http_client,      compat_kwargs, @@ -177,10 +179,19 @@ def xpath_with_ns(path, ns_map):  def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): -    if sys.version_info < (2, 7):  # Crazy 2.6 -        xpath = xpath.encode('ascii') +    def _find_xpath(xpath): +        if sys.version_info < (2, 7):  # Crazy 2.6 +            xpath = xpath.encode('ascii') +        return node.find(xpath) + +    if isinstance(xpath, (str, compat_str)): +        n = _find_xpath(xpath) +    else: +        for xp in xpath: +            n = _find_xpath(xp) +            if n is not None: +                break -    n = node.find(xpath)      if n is None:          if default is not NO_DEFAULT:              return default @@ -355,13 +366,20 @@ def sanitize_path(s):      if drive_or_unc:          norm_path.pop(0)      sanitized_path = [ -        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) +        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)          for path_part in norm_path]      if drive_or_unc:          sanitized_path.insert(0, drive_or_unc + os.path.sep)      return os.path.join(*sanitized_path) +# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of +# unwanted failures due to missing protocol +def sanitized_Request(url, *args, **kwargs): +    return compat_urllib_request.Request( +        'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + +  def orderedSet(iterable):      """ Remove all 
duplicates from the input iterable """      res = [] @@ -385,10 +403,14 @@ def _htmlentity_transform(entity):              numstr = '0%s' % numstr          else:              base = 10 -        return compat_chr(int(numstr, base)) +        # See https://github.com/rg3/youtube-dl/issues/7518 +        try: +            return compat_chr(int(numstr, base)) +        except ValueError: +            pass      # Unknown entity in name, return its literal representation -    return ('&%s;' % entity) +    return '&%s;' % entity  def unescapeHTML(s): @@ -641,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):      return hc +def handle_youtubedl_headers(headers): +    filtered_headers = headers + +    if 'Youtubedl-no-compression' in filtered_headers: +        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') +        del filtered_headers['Youtubedl-no-compression'] + +    return filtered_headers + +  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):      """Handler for HTTP requests and responses. @@ -648,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):      the standard headers to every HTTP request and handles gzipped and      deflated responses from web servers. If compression is to be avoided in      a particular request, the original request in the program code only has -    to include the HTTP header "Youtubedl-No-Compression", which will be +    to include the HTTP header "Youtubedl-no-compression", which will be      removed before making the real request.      
Part of this code was copied from: @@ -709,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):              # The dict keys are capitalized because of this bug by urllib              if h.capitalize() not in req.headers:                  req.add_header(h, v) -        if 'Youtubedl-no-compression' in req.headers: -            if 'Accept-encoding' in req.headers: -                del req.headers['Accept-encoding'] -            del req.headers['Youtubedl-no-compression'] + +        req.headers = handle_youtubedl_headers(req.headers)          if sys.version_info < (2, 7) and '#' in req.get_full_url():              # Python 2.6 is brain-dead when it comes to fragments @@ -813,9 +843,11 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):      if date_str is None:          return None +    date_str = re.sub(r'\.[0-9]+', '', date_str) +      if timezone is None:          m = re.search( -            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', +            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',              date_str)          if not m:              timezone = datetime.timedelta() @@ -828,9 +860,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):                  timezone = datetime.timedelta(                      hours=sign * int(m.group('hours')),                      minutes=sign * int(m.group('minutes'))) -    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) -    dt = datetime.datetime.strptime(date_str, date_format) - timezone -    return calendar.timegm(dt.timetuple()) +    try: +        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) +        dt = datetime.datetime.strptime(date_str, date_format) - timezone +        return calendar.timegm(dt.timetuple()) +    except ValueError: +        pass  def unified_strdate(date_str, day_first=True): @@ -895,7 +930,8 @@ def unified_strdate(date_str, day_first=True):          timetuple = 
email.utils.parsedate_tz(date_str)          if timetuple:              upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') -    return upload_date +    if upload_date is not None: +        return compat_str(upload_date)  def determine_ext(url, default_ext='unknown_video'): @@ -904,6 +940,21 @@ def determine_ext(url, default_ext='unknown_video'):      guess = url.partition('?')[0].rpartition('.')[2]      if re.match(r'^[A-Za-z0-9]+$', guess):          return guess +    elif guess.rstrip('/') in ( +            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', +            'flv', 'f4v', 'f4a', 'f4b', +            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', +            'mkv', 'mka', 'mk3d', +            'avi', 'divx', +            'mov', +            'asf', 'wmv', 'wma', +            '3gp', '3g2', +            'mp3', +            'flac', +            'ape', +            'wav', +            'f4f', 'f4m', 'm3u8', 'smil'): +        return guess.rstrip('/')      else:          return default_ext @@ -1647,30 +1698,9 @@ def urlencode_postdata(*args, **kargs):  def encode_dict(d, encoding='utf-8'): -    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) - - -try: -    etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError:  # Python <=2.6 -    etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): -    class TreeBuilder(xml.etree.ElementTree.TreeBuilder): -        def doctype(self, name, pubid, system): -            pass  # Ignore doctypes - -    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) -    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} -    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) -    # Fix up XML parser in Python 2.x -    if sys.version_info < (3, 0): -        for n in etree_iter(tree): -            if n.text is not None: -                if not isinstance(n.text, compat_str): -                    n.text = n.text.decode('utf-8') -    return 
tree +    def encode(v): +        return v.encode(encoding) if isinstance(v, compat_basestring) else v +    return dict((encode(k), encode(v)) for k, v in d.items())  US_RATINGS = { @@ -1700,8 +1730,8 @@ def js_to_json(code):          if v in ('true', 'false', 'null'):              return v          if v.startswith('"'): -            return v -        if v.startswith("'"): +            v = re.sub(r"\\'", "'", v[1:-1]) +        elif v.startswith("'"):              v = v[1:-1]              v = re.sub(r"\\\\|\\'|\"", lambda m: {                  '\\\\': '\\\\', @@ -1795,6 +1825,10 @@ def urlhandle_detect_ext(url_handle):      return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): +    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + +  def age_restricted(content_limit, age_limit):      """ Returns True iff the content should be blocked """ @@ -1969,7 +2003,7 @@ def dfxp2srt(dfxp_data):          return out -    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) +    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))      out = []      paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') | 
