diff options
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 123 | 
1 files changed, 84 insertions, 39 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 16bc7408a..b644f4e92 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -280,6 +280,11 @@ if sys.version_info >= (2, 7):          return node.find(expr)  else:      def find_xpath_attr(node, xpath, key, val): +        # Here comes the crazy part: In 2.6, if the xpath is a unicode, +        # .//node does not match if a node is a direct child of . ! +        if isinstance(xpath, unicode): +            xpath = xpath.encode('ascii') +          for f in node.findall(xpath):              if f.attrib.get(key) == val:                  return f @@ -298,30 +303,20 @@ def xpath_with_ns(path, ns_map):              replaced.append('{%s}%s' % (ns_map[ns], tag))      return '/'.join(replaced) -def htmlentity_transform(matchobj): -    """Transforms an HTML entity to a character. -    This function receives a match object and is intended to be used with -    the re.sub() function. -    """ -    entity = matchobj.group(1) +def xpath_text(node, xpath, name=None, fatal=False): +    if sys.version_info < (2, 7):  # Crazy 2.6 +        xpath = xpath.encode('ascii') -    # Known non-numeric HTML entity -    if entity in compat_html_entities.name2codepoint: -        return compat_chr(compat_html_entities.name2codepoint[entity]) - -    mobj = re.match(u'(?u)#(x?\\d+)', entity) -    if mobj is not None: -        numstr = mobj.group(1) -        if numstr.startswith(u'x'): -            base = 16 -            numstr = u'0%s' % numstr +    n = node.find(xpath) +    if n is None: +        if fatal: +            name = xpath if name is None else name +            raise ExtractorError('Could not find XML element %s' % name)          else: -            base = 10 -        return compat_chr(int(numstr, base)) +            return None +    return n.text -    # Unknown entity in name, return its literal representation -    return (u'&%s;' % entity)  compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix  class BaseHTMLParser(compat_html_parser.HTMLParser): @@ -543,13 +538,33 @@ def orderedSet(iterable):      return res +def _htmlentity_transform(entity): +    """Transforms an HTML entity to a character.""" +    # Known non-numeric HTML entity +    if entity in compat_html_entities.name2codepoint: +        return compat_chr(compat_html_entities.name2codepoint[entity]) + +    mobj = re.match(r'#(x?[0-9]+)', entity) +    if mobj is not None: +        numstr = mobj.group(1) +        if numstr.startswith(u'x'): +            base = 16 +            numstr = u'0%s' % numstr +        else: +            base = 10 +        return compat_chr(int(numstr, base)) + +    # Unknown entity in name, return its literal representation +    return (u'&%s;' % entity) + +  def unescapeHTML(s):      if s is None:          return None      assert type(s) == compat_str -    result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s) -    return result +    return re.sub( +        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)  def encodeFilename(s, for_subprocess=False): @@ -621,7 +636,7 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):                      self.sock = sock                      self._tunnel()                  try: -                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3) +                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)                  except ssl.SSLError:                      self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) @@ -629,8 +644,14 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):              def https_open(self, req):                  return self.do_open(HTTPSConnectionV3, req)          return HTTPSHandlerV3(**kwargs) -    else: -        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3) +    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4 +        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) +        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3 +        if opts_no_check_certificate: +            context.verify_mode = ssl.CERT_NONE +        return compat_urllib_request.HTTPSHandler(context=context, **kwargs) +    else:  # Python < 3.4 +        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)          context.verify_mode = (ssl.CERT_NONE                                 if opts_no_check_certificate                                 else ssl.CERT_REQUIRED) @@ -766,10 +787,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):          return ret      def http_request(self, req): -        for h,v in std_headers.items(): -            if h in req.headers: -                del req.headers[h] -            req.add_header(h, v) +        for h, v in std_headers.items(): +            if h not in req.headers: +                req.add_header(h, v)          if 'Youtubedl-no-compression' in req.headers:              if 'Accept-encoding' in req.headers:                  del req.headers['Accept-encoding'] @@ -1081,12 +1101,6 @@ def intlist_to_bytes(xs):          return bytes(xs) -def get_cachedir(params={}): -    cache_root = os.environ.get('XDG_CACHE_HOME', -                                os.path.expanduser('~/.cache')) -    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl')) - -  # Cross-platform file locking  if sys.platform == 'win32':      import ctypes.wintypes @@ -1146,10 +1160,10 @@ else:      import fcntl      def _lock_file(f, exclusive): -        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) +        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)      def _unlock_file(f): -        fcntl.lockf(f, fcntl.LOCK_UN) +        fcntl.flock(f, fcntl.LOCK_UN)  class locked_file(object): @@ -1323,9 +1337,10 @@ def str_or_none(v, default=None):  def str_to_int(int_str): +    """ A more relaxed version of int_or_none """      if int_str is None:          return None -    int_str = re.sub(r'[,\.]', u'', int_str) +    int_str = re.sub(r'[,\.\+]', u'', int_str)      return int(int_str) @@ -1337,8 +1352,10 @@ def parse_duration(s):      if s is None:          return None +    s = s.strip() +      m = re.match( -        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?(?P<ms>\.[0-9]+)?$', s) +        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)      if not m:          return None      res = int(m.group('secs')) @@ -1420,6 +1437,24 @@ def uppercase_escape(s):          lambda m: unicode_escape(m.group(0))[0],          s) + +def escape_rfc3986(s): +    """Escape non-ASCII characters as suggested by RFC 3986""" +    if sys.version_info < (3, 0) and isinstance(s, unicode): +        s = s.encode('utf-8') +    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]") + + +def escape_url(url): +    """Escape URL as suggested by RFC 3986""" +    url_parsed = compat_urllib_parse_urlparse(url) +    return url_parsed._replace( +        path=escape_rfc3986(url_parsed.path), +        params=escape_rfc3986(url_parsed.params), +        query=escape_rfc3986(url_parsed.query), +        fragment=escape_rfc3986(url_parsed.fragment) +    ).geturl() +  try:      struct.pack(u'!I', 0)  except TypeError: @@ -1554,3 +1589,13 @@ except AttributeError:          if ret:              raise subprocess.CalledProcessError(ret, p.args, output=output)          return output + + +def limit_length(s, length): +    """ Add ellipses to overly long strings """ +    if s is None: +        return None +    ELLIPSES = '...' +    if len(s) > length: +        return s[:length - len(ELLIPSES)] + ELLIPSES +    return s  | 
