diff options
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 250 | 
1 files changed, 206 insertions, 44 deletions
| diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 94b496dd0..079e8d2c3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -41,6 +41,7 @@ from .compat import (      compat_urllib_parse_urlparse,      compat_urllib_request,      compat_urlparse, +    shlex_quote,  ) @@ -55,6 +56,7 @@ std_headers = {      'Accept-Language': 'en-us,en;q=0.5',  } +  def preferredencoding():      """Get preferred encoding. @@ -71,10 +73,10 @@ def preferredencoding():  def write_json_file(obj, fn): -    """ Encode obj as JSON and write it to fn, atomically """ +    """ Encode obj as JSON and write it to fn, atomically if possible """      fn = encodeFilename(fn) -    if sys.version_info < (3, 0): +    if sys.version_info < (3, 0) and sys.platform != 'win32':          encoding = get_filesystem_encoding()          # os.path.basename returns a bytes object, but NamedTemporaryFile          # will fail if the filename contains non ascii characters unless we @@ -108,6 +110,13 @@ def write_json_file(obj, fn):      try:          with tf:              json.dump(obj, tf) +        if sys.platform == 'win32': +            # Need to remove existing file on Windows, else os.rename raises +            # WindowsError or FileExistsError. +            try: +                os.unlink(fn) +            except OSError: +                pass          os.rename(tf.name, fn)      except:          try: @@ -122,7 +131,7 @@ if sys.version_info >= (2, 7):          """ Find the xpath xpath[@key=val] """          assert re.match(r'^[a-zA-Z-]+$', key)          assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) -        expr = xpath + u"[@%s='%s']" % (key, val) +        expr = xpath + "[@%s='%s']" % (key, val)          return node.find(expr)  else:      def find_xpath_attr(node, xpath, key, val): @@ -138,6 +147,8 @@ else:  # On python2.6 the xml.etree.ElementTree.Element methods don't support  # the namespace parameter + +  def xpath_with_ns(path, ns_map):      components = [c.split(':') for c in path.split('/')]      replaced = [] @@ -155,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):          xpath = xpath.encode('ascii')      n = node.find(xpath) -    if n is None: +    if n is None or n.text is None:          if fatal:              name = xpath if name is None else name              raise ExtractorError('Could not find XML element %s' % name) @@ -194,6 +205,10 @@ def get_element_by_attribute(attribute, value, html):  def clean_html(html):      """Clean an HTML snippet into a readable string""" + +    if html is None:  # Convenience for sanitizing descriptions etc. +        return html +      # Newline vs <br />      html = html.replace('\n', ' ')      html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) @@ -229,9 +244,9 @@ def sanitize_open(filename, open_mode):          # In case of error, try to remove win32 forbidden chars          alt_filename = os.path.join( -                        re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) -                        for path_part in os.path.split(filename) -                       ) +            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part) +            for path_part in os.path.split(filename) +        )          if alt_filename == filename:              raise          else: @@ -248,6 +263,7 @@ def timeconvert(timestr):          timestamp = email.utils.mktime_tz(timetuple)      return timestamp +  def sanitize_filename(s, restricted=False, is_id=False):      """Sanitizes a string so it could be used as part of a filename.      If restricted is set, use a stricter subset of allowed characters. @@ -280,6 +296,7 @@ def sanitize_filename(s, restricted=False, is_id=False):              result = '_'      return result +  def orderedSet(iterable):      """ Remove all duplicates from the input iterable """      res = [] @@ -350,7 +367,7 @@ def encodeArgument(s):      if not isinstance(s, compat_str):          # Legacy code that uses byte strings          # Uncomment the following line after fixing all post processors -        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) +        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))          s = s.decode('ascii')      return encodeFilename(s, True) @@ -364,6 +381,7 @@ def decodeOption(optval):      assert isinstance(optval, compat_str)      return optval +  def formatSeconds(secs):      if secs > 3600:          return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) @@ -374,6 +392,17 @@ def formatSeconds(secs):  def make_HTTPS_handler(opts_no_check_certificate, **kwargs): +    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9 +        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) +        if opts_no_check_certificate: +            context.verify_mode = ssl.CERT_NONE +        try: +            return compat_urllib_request.HTTPSHandler(context=context, **kwargs) +        except TypeError: +            # Python 2.7.8 +            # (create_default_context present but HTTPSHandler has no context=) +            pass +      if sys.version_info < (3, 2):          import httplib @@ -395,26 +424,18 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):              def https_open(self, req):                  return self.do_open(HTTPSConnectionV3, req)          return HTTPSHandlerV3(**kwargs) -    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4 -        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) -        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3 -        if opts_no_check_certificate: -            context.verify_mode = ssl.CERT_NONE -        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)      else:  # Python < 3.4          context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)          context.verify_mode = (ssl.CERT_NONE                                 if opts_no_check_certificate                                 else ssl.CERT_REQUIRED)          context.set_default_verify_paths() -        try: -            context.load_default_certs() -        except AttributeError: -            pass  # Python < 3.4          return compat_urllib_request.HTTPSHandler(context=context, **kwargs) +  class ExtractorError(Exception):      """Error during info extraction.""" +      def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):          """ tb, if given, is the original traceback (so that it can be printed out).          If expected is set, this is a normal error message and most likely not a bug in youtube-dl. @@ -427,7 +448,13 @@ class ExtractorError(Exception):          if cause:              msg += ' (caused by %r)' % cause          if not expected: -            msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.' +            if ytdl_is_updateable(): +                update_cmd = 'type  youtube-dl -U  to update' +            else: +                update_cmd = 'see  https://yt-dl.org/update  on how to update' +            msg += '; please report this issue on https://yt-dl.org/bug .' +            msg += ' Make sure you are using the latest version; %s.' % update_cmd +            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'          super(ExtractorError, self).__init__(msg)          self.traceback = tb @@ -441,6 +468,13 @@ class ExtractorError(Exception):          return ''.join(traceback.format_tb(self.traceback)) +class UnsupportedError(ExtractorError): +    def __init__(self, url): +        super(UnsupportedError, self).__init__( +            'Unsupported URL: %s' % url, expected=True) +        self.url = url + +  class RegexNotFoundError(ExtractorError):      """Error when a regex didn't match"""      pass @@ -453,6 +487,7 @@ class DownloadError(Exception):      configured to continue on errors. They will contain the appropriate      error message.      """ +      def __init__(self, msg, exc_info=None):          """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """          super(DownloadError, self).__init__(msg) @@ -474,9 +509,11 @@ class PostProcessingError(Exception):      This exception may be raised by PostProcessor's .run() method to      indicate an error in the postprocessing task.      """ +      def __init__(self, msg):          self.msg = msg +  class MaxDownloadsReached(Exception):      """ --max-downloads limit has been reached. """      pass @@ -506,6 +543,7 @@ class ContentTooShortError(Exception):          self.downloaded = downloaded          self.expected = expected +  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):      """Handler for HTTP requests and responses. @@ -618,17 +656,19 @@ def parse_iso8601(date_str, delimiter='T'):      return calendar.timegm(dt.timetuple()) -def unified_strdate(date_str): +def unified_strdate(date_str, day_first=True):      """Return a string with the date in the format YYYYMMDD"""      if date_str is None:          return None -      upload_date = None -    #Replace commas +    # Replace commas      date_str = date_str.replace(',', ' ')      # %z (UTC offset) is only supported in python>=3.2      date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) +    # Remove AM/PM + timezone +    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str) +      format_expressions = [          '%d %B %Y',          '%d %b %Y', @@ -643,7 +683,6 @@ def unified_strdate(date_str):          '%d/%m/%Y',          '%d/%m/%y',          '%Y/%m/%d %H:%M:%S', -        '%d/%m/%Y %H:%M:%S',          '%Y-%m-%d %H:%M:%S',          '%Y-%m-%d %H:%M:%S.%f',          '%d.%m.%Y %H:%M', @@ -655,6 +694,14 @@ def unified_strdate(date_str):          '%Y-%m-%dT%H:%M:%S.%f',          '%Y-%m-%dT%H:%M',      ] +    if day_first: +        format_expressions.extend([ +            '%d/%m/%Y %H:%M:%S', +        ]) +    else: +        format_expressions.extend([ +            '%m/%d/%Y %H:%M:%S', +        ])      for expression in format_expressions:          try:              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -666,6 +713,7 @@ def unified_strdate(date_str):              upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')      return upload_date +  def determine_ext(url, default_ext='unknown_video'):      if url is None:          return default_ext @@ -675,16 +723,20 @@ def determine_ext(url, default_ext='unknown_video'):      else:          return default_ext +  def subtitles_filename(filename, sub_lang, sub_format):      return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format +  def date_from_str(date_str):      """      Return a datetime object from a string in the format YYYYMMDD or      (now|today)[+-][0-9](day|week|month|year)(s)?"""      today = datetime.date.today() -    if date_str == 'now'or date_str == 'today': +    if date_str in ('now', 'today'):          return today +    if date_str == 'yesterday': +        return today - datetime.timedelta(days=1)      match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)      if match is not None:          sign = match.group('sign') @@ -692,7 +744,7 @@ def date_from_str(date_str):          if sign == '-':              time = -time          unit = match.group('unit') -        #A bad aproximation? +        # A bad aproximation?          if unit == 'month':              unit = 'day'              time *= 30 @@ -703,7 +755,8 @@ def date_from_str(date_str):          delta = datetime.timedelta(**{unit: time})          return today + delta      return datetime.datetime.strptime(date_str, "%Y%m%d").date() -     + +  def hyphenate_date(date_str):      """      Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format""" @@ -713,8 +766,10 @@ def hyphenate_date(date_str):      else:          return date_str +  class DateRange(object):      """Represents a time interval between two dates""" +      def __init__(self, start=None, end=None):          """start and end must be strings in the format accepted by date"""          if start is not None: @@ -727,17 +782,20 @@ class DateRange(object):              self.end = datetime.datetime.max.date()          if self.start > self.end:              raise ValueError('Date range: "%s" , the start date must be before the end date' % self) +      @classmethod      def day(cls, day):          """Returns a range that only contains the given day""" -        return cls(day,day) +        return cls(day, day) +      def __contains__(self, date):          """Check if the date is in the range"""          if not isinstance(date, datetime.date):              date = date_from_str(date)          return self.start <= date <= self.end +      def __str__(self): -        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat()) +        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())  def platform_name(): @@ -773,22 +831,22 @@ def _windows_write_string(s, out):      GetStdHandle = ctypes.WINFUNCTYPE(          ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( -        ("GetStdHandle", ctypes.windll.kernel32)) +        (b"GetStdHandle", ctypes.windll.kernel32))      h = GetStdHandle(WIN_OUTPUT_IDS[fileno])      WriteConsoleW = ctypes.WINFUNCTYPE(          ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,          ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), -        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32)) +        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))      written = ctypes.wintypes.DWORD(0) -    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32)) +    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))      FILE_TYPE_CHAR = 0x0002      FILE_TYPE_REMOTE = 0x8000      GetConsoleMode = ctypes.WINFUNCTYPE(          ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,          ctypes.POINTER(ctypes.wintypes.DWORD))( -        ("GetConsoleMode", ctypes.windll.kernel32)) +        (b"GetConsoleMode", ctypes.windll.kernel32))      INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value      def not_a_console(handle): @@ -989,7 +1047,7 @@ def smuggle_url(url, data):  def unsmuggle_url(smug_url, default=None): -    if not '#__youtubedl_smuggle' in smug_url: +    if '#__youtubedl_smuggle' not in smug_url:          return smug_url, default      url, _, sdata = smug_url.rpartition('#')      jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0] @@ -1011,6 +1069,60 @@ def format_bytes(bytes):      return '%.2f%s' % (converted, suffix) +def parse_filesize(s): +    if s is None: +        return None + +    # The lower-case forms are of course incorrect and inofficial, +    # but we support those too +    _UNIT_TABLE = { +        'B': 1, +        'b': 1, +        'KiB': 1024, +        'KB': 1000, +        'kB': 1024, +        'Kb': 1000, +        'MiB': 1024 ** 2, +        'MB': 1000 ** 2, +        'mB': 1024 ** 2, +        'Mb': 1000 ** 2, +        'GiB': 1024 ** 3, +        'GB': 1000 ** 3, +        'gB': 1024 ** 3, +        'Gb': 1000 ** 3, +        'TiB': 1024 ** 4, +        'TB': 1000 ** 4, +        'tB': 1024 ** 4, +        'Tb': 1000 ** 4, +        'PiB': 1024 ** 5, +        'PB': 1000 ** 5, +        'pB': 1024 ** 5, +        'Pb': 1000 ** 5, +        'EiB': 1024 ** 6, +        'EB': 1000 ** 6, +        'eB': 1024 ** 6, +        'Eb': 1000 ** 6, +        'ZiB': 1024 ** 7, +        'ZB': 1000 ** 7, +        'zB': 1024 ** 7, +        'Zb': 1000 ** 7, +        'YiB': 1024 ** 8, +        'YB': 1000 ** 8, +        'yB': 1024 ** 8, +        'Yb': 1000 ** 8, +    } + +    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE) +    m = re.match( +        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) +    if not m: +        return None + +    num_str = m.group('num').replace(',', '.') +    mult = _UNIT_TABLE[m.group('unit')] +    return int(float(num_str) * mult) + +  def get_term_width():      columns = compat_getenv('COLUMNS', None)      if columns: @@ -1117,25 +1229,36 @@ def parse_duration(s):      m = re.match(          r'''(?ix)T? +        (?: +            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*| +            (?P<only_hours>[0-9.]+)\s*(?:hours?)| +              (?:                  (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?                  (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*              )? -            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s) +            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)? +        )$''', s)      if not m:          return None -    res = int(m.group('secs')) +    res = 0 +    if m.group('only_mins'): +        return float_or_none(m.group('only_mins'), invscale=60) +    if m.group('only_hours'): +        return float_or_none(m.group('only_hours'), invscale=60 * 60) +    if m.group('secs'): +        res += int(m.group('secs'))      if m.group('mins'):          res += int(m.group('mins')) * 60 -        if m.group('hours'): -            res += int(m.group('hours')) * 60 * 60 +    if m.group('hours'): +        res += int(m.group('hours')) * 60 * 60      if m.group('ms'):          res += float(m.group('ms'))      return res  def prepend_extension(filename, ext): -    name, real_ext = os.path.splitext(filename)  +    name, real_ext = os.path.splitext(filename)      return '{0}.{1}{2}'.format(name, ext, real_ext) @@ -1150,18 +1273,25 @@ def check_executable(exe, args=[]):  def get_exe_version(exe, args=['--version'], -                    version_re=r'version\s+([0-9._-a-zA-Z]+)', -                    unrecognized='present'): +                    version_re=None, unrecognized='present'):      """ Returns the version of the specified executable,      or False if the executable is not present """      try: -        out, err = subprocess.Popen( +        out, _ = subprocess.Popen(              [exe] + args,              stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()      except OSError:          return False -    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore') -    m = re.search(version_re, firstline) +    if isinstance(out, bytes):  # Python 2.x +        out = out.decode('ascii', 'ignore') +    return detect_exe_version(out, version_re, unrecognized) + + +def detect_exe_version(output, version_re=None, unrecognized='present'): +    assert isinstance(output, compat_str) +    if version_re is None: +        version_re = r'version\s+([-0-9._a-zA-Z]+)' +    m = re.search(version_re, output)      if m:          return m.group(1)      else: @@ -1402,7 +1532,7 @@ def limit_length(s, length):  def version_tuple(v): -    return [int(e) for e in v.split('.')] +    return tuple(int(e) for e in re.split(r'[-.]', v))  def is_outdated_version(version, limit, assume_new=True): @@ -1412,3 +1542,35 @@ def is_outdated_version(version, limit, assume_new=True):          return version_tuple(version) < version_tuple(limit)      except ValueError:          return not assume_new + + +def ytdl_is_updateable(): +    """ Returns if youtube-dl can be updated with -U """ +    from zipimport import zipimporter + +    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen') + + +def args_to_str(args): +    # Get a short string representation for a subprocess command +    return ' '.join(shlex_quote(a) for a in args) + + +def urlhandle_detect_ext(url_handle): +    try: +        url_handle.headers +        getheader = lambda h: url_handle.headers[h] +    except AttributeError:  # Python < 3 +        getheader = url_handle.info().getheader + +    return getheader('Content-Type').split("/")[1] + + +def age_restricted(content_limit, age_limit): +    """ Returns True iff the content should be blocked """ + +    if age_limit is None:  # No limit set +        return False +    if content_limit is None: +        return False  # Content available for everyone +    return age_limit < content_limit | 
