diff options
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 273 | 
1 files changed, 255 insertions, 18 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index cf2ea654e..3e81c308b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,19 +1,21 @@  #!/usr/bin/env python  # -*- coding: utf-8 -*- +import datetime +import email.utils  import errno  import gzip  import io  import json  import locale  import os +import pipes +import platform  import re +import socket  import sys  import traceback  import zlib -import email.utils -import socket -import datetime  try:      import urllib.request as compat_urllib_request @@ -61,6 +63,17 @@ except ImportError: # Python 2      import httplib as compat_http_client  try: +    from urllib.error import HTTPError as compat_HTTPError +except ImportError:  # Python 2 +    from urllib2 import HTTPError as compat_HTTPError + +try: +    from urllib.request import urlretrieve as compat_urlretrieve +except ImportError:  # Python 2 +    from urllib import urlretrieve as compat_urlretrieve + + +try:      from subprocess import DEVNULL      compat_subprocess_get_DEVNULL = lambda: DEVNULL  except ImportError: @@ -163,7 +176,7 @@ def compat_ord(c):  compiled_regex_type = type(re.compile(''))  std_headers = { -    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', +    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',      'Accept-Encoding': 'gzip, deflate', @@ -207,7 +220,7 @@ if sys.version_info >= (2,7):      def find_xpath_attr(node, xpath, key, val):          """ Find the xpath xpath[@key=val] """          assert re.match(r'^[a-zA-Z]+$', key) -        assert re.match(r'^[a-zA-Z@]*$', val) +        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)          expr = xpath + u"[@%s='%s']" % (key, val)          return node.find(expr)  else: @@ -217,6 +230,19 @@ else:                  return f          return None +# On python2.6 the xml.etree.ElementTree.Element methods don't support +# the namespace parameter +def xpath_with_ns(path, ns_map): +    components = [c.split(':') for c in path.split('/')] +    replaced = [] +    for c in components: +        if len(c) == 1: +            replaced.append(c[0]) +        else: +            ns, tag = c +            replaced.append('{%s}%s' % (ns_map[ns], tag)) +    return '/'.join(replaced) +  def htmlentity_transform(matchobj):      """Transforms an HTML entity to a character. @@ -243,7 +269,17 @@ def htmlentity_transform(matchobj):      return (u'&%s;' % entity)  compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class AttrParser(compat_html_parser.HTMLParser): +class BaseHTMLParser(compat_html_parser.HTMLParser): +    def __init(self): +        compat_html_parser.HTMLParser.__init__(self) +        self.html = None + +    def loads(self, html): +        self.html = html +        self.feed(html) +        self.close() + +class AttrParser(BaseHTMLParser):      """Modified HTMLParser that isolates a tag with the specified attribute"""      def __init__(self, attribute, value):          self.attribute = attribute @@ -251,10 +287,9 @@ class AttrParser(compat_html_parser.HTMLParser):          self.result = None          self.started = False          self.depth = {} -        self.html = None          self.watch_startpos = False          self.error_count = 0 -        compat_html_parser.HTMLParser.__init__(self) +        BaseHTMLParser.__init__(self)      def error(self, message):          if self.error_count > 10 or self.started: @@ -263,11 +298,6 @@ class AttrParser(compat_html_parser.HTMLParser):          self.error_count += 1          self.goahead(1) -    def loads(self, html): -        self.html = html -        self.feed(html) -        self.close() -      def handle_starttag(self, tag, attrs):          attrs = dict(attrs)          if self.started: @@ -328,6 +358,38 @@ def get_element_by_attribute(attribute, value, html):          pass      return parser.get_result() +class MetaParser(BaseHTMLParser): +    """ +    Modified HTMLParser that isolates a meta tag with the specified name  +    attribute. +    """ +    def __init__(self, name): +        BaseHTMLParser.__init__(self) +        self.name = name +        self.content = None +        self.result = None + +    def handle_starttag(self, tag, attrs): +        if tag != 'meta': +            return +        attrs = dict(attrs) +        if attrs.get('name') == self.name: +            self.result = attrs.get('content') + +    def get_result(self): +        return self.result + +def get_meta_content(name, html): +    """ +    Return the content attribute from the meta tag with the given name attribute. +    """ +    parser = MetaParser(name) +    try: +        parser.loads(html) +    except compat_html_parser.HTMLParseError: +        pass +    return parser.get_result() +  def clean_html(html):      """Clean an HTML snippet into a readable string""" @@ -489,7 +551,7 @@ def make_HTTPS_handler(opts):  class ExtractorError(Exception):      """Error during info extraction.""" -    def __init__(self, msg, tb=None, expected=False): +    def __init__(self, msg, tb=None, expected=False, cause=None):          """ tb, if given, is the original traceback (so that it can be printed out).          If expected is set, this is a normal error message and most likely not a bug in youtube-dl.          """ @@ -497,11 +559,12 @@ class ExtractorError(Exception):          if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):              expected = True          if not expected: -            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output.' +            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'          super(ExtractorError, self).__init__(msg)          self.traceback = tb          self.exc_info = sys.exc_info()  # preserve original exception +        self.cause = cause      def format_traceback(self):          if self.traceback is None: @@ -622,8 +685,23 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):          old_resp = resp          # gzip          if resp.headers.get('Content-encoding', '') == 'gzip': -            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') -            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) +            content = resp.read() +            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') +            try: +                uncompressed = io.BytesIO(gz.read()) +            except IOError as original_ioerror: +                # There may be junk add the end of the file +                # See http://stackoverflow.com/q/4928560/35070 for details +                for i in range(1, 1024): +                    try: +                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') +                        uncompressed = io.BytesIO(gz.read()) +                    except IOError: +                        continue +                    break +                else: +                    raise original_ioerror +            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)              resp.msg = old_resp.msg          # deflate          if resp.headers.get('Content-encoding', '') == 'deflate': @@ -642,7 +720,17 @@ def unified_strdate(date_str):      date_str = date_str.replace(',',' ')      # %z (UTC offset) is only supported in python>=3.2      date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) -    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] +    format_expressions = [ +        '%d %B %Y', +        '%B %d %Y', +        '%b %d %Y', +        '%Y-%m-%d', +        '%d/%m/%Y', +        '%Y/%m/%d %H:%M:%S', +        '%d.%m.%Y %H:%M', +        '%Y-%m-%dT%H:%M:%SZ', +        '%Y-%m-%dT%H:%M:%S', +    ]      for expression in format_expressions:          try:              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -657,6 +745,9 @@ def determine_ext(url, default_ext=u'unknown_video'):      else:          return default_ext +def subtitles_filename(filename, sub_lang, sub_format): +    return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format +  def date_from_str(date_str):      """      Return a datetime object from a string in the format YYYYMMDD or @@ -708,3 +799,149 @@ class DateRange(object):          return self.start <= date <= self.end      def __str__(self):          return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) + + +def platform_name(): +    """ Returns the platform name as a compat_str """ +    res = platform.platform() +    if isinstance(res, bytes): +        res = res.decode(preferredencoding()) + +    assert isinstance(res, compat_str) +    return res + + +def write_string(s, out=None): +    if out is None: +        out = sys.stderr +    assert type(s) == type(u'') + +    if ('b' in getattr(out, 'mode', '') or +            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr +        s = s.encode(preferredencoding(), 'ignore') +    out.write(s) +    out.flush() + + +def bytes_to_intlist(bs): +    if not bs: +        return [] +    if isinstance(bs[0], int):  # Python 3 +        return list(bs) +    else: +        return [ord(c) for c in bs] + + +def intlist_to_bytes(xs): +    if not xs: +        return b'' +    if isinstance(chr(0), bytes):  # Python 2 +        return ''.join([chr(x) for x in xs]) +    else: +        return bytes(xs) + + +def get_cachedir(params={}): +    cache_root = os.environ.get('XDG_CACHE_HOME', +                                os.path.expanduser('~/.cache')) +    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl')) + + +# Cross-platform file locking +if sys.platform == 'win32': +    import ctypes.wintypes +    import msvcrt + +    class OVERLAPPED(ctypes.Structure): +        _fields_ = [ +            ('Internal', ctypes.wintypes.LPVOID), +            ('InternalHigh', ctypes.wintypes.LPVOID), +            ('Offset', ctypes.wintypes.DWORD), +            ('OffsetHigh', ctypes.wintypes.DWORD), +            ('hEvent', ctypes.wintypes.HANDLE), +        ] + +    kernel32 = ctypes.windll.kernel32 +    LockFileEx = kernel32.LockFileEx +    LockFileEx.argtypes = [ +        ctypes.wintypes.HANDLE,     # hFile +        ctypes.wintypes.DWORD,      # dwFlags +        ctypes.wintypes.DWORD,      # dwReserved +        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow +        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh +        ctypes.POINTER(OVERLAPPED)  # Overlapped +    ] +    LockFileEx.restype = ctypes.wintypes.BOOL +    UnlockFileEx = kernel32.UnlockFileEx +    UnlockFileEx.argtypes = [ +        ctypes.wintypes.HANDLE,     # hFile +        ctypes.wintypes.DWORD,      # dwReserved +        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow +        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh +        ctypes.POINTER(OVERLAPPED)  # Overlapped +    ] +    UnlockFileEx.restype = ctypes.wintypes.BOOL +    whole_low = 0xffffffff +    whole_high = 0x7fffffff + +    def _lock_file(f, exclusive): +        overlapped = OVERLAPPED() +        overlapped.Offset = 0 +        overlapped.OffsetHigh = 0 +        overlapped.hEvent = 0 +        f._lock_file_overlapped_p = ctypes.pointer(overlapped) +        handle = msvcrt.get_osfhandle(f.fileno()) +        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0, +                          whole_low, whole_high, f._lock_file_overlapped_p): +            raise OSError('Locking file failed: %r' % ctypes.FormatError()) + +    def _unlock_file(f): +        assert f._lock_file_overlapped_p +        handle = msvcrt.get_osfhandle(f.fileno()) +        if not UnlockFileEx(handle, 0, +                            whole_low, whole_high, f._lock_file_overlapped_p): +            raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) + +else: +    import fcntl + +    def _lock_file(f, exclusive): +        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + +    def _unlock_file(f): +        fcntl.lockf(f, fcntl.LOCK_UN) + + +class locked_file(object): +    def __init__(self, filename, mode, encoding=None): +        assert mode in ['r', 'a', 'w'] +        self.f = io.open(filename, mode, encoding=encoding) +        self.mode = mode + +    def __enter__(self): +        exclusive = self.mode != 'r' +        try: +            _lock_file(self.f, exclusive) +        except IOError: +            self.f.close() +            raise +        return self + +    def __exit__(self, etype, value, traceback): +        try: +            _unlock_file(self.f) +        finally: +            self.f.close() + +    def __iter__(self): +        return iter(self.f) + +    def write(self, *args): +        return self.f.write(*args) + +    def read(self, *args): +        return self.f.read(*args) + + +def shell_quote(args): +    return ' '.join(map(pipes.quote, args))  | 
