diff options
Diffstat (limited to 'youtube_dl/utils.py')
| -rw-r--r-- | youtube_dl/utils.py | 89 | 
1 files changed, 76 insertions, 13 deletions
| diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d39f313a4..1737ac5f6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -373,6 +373,13 @@ def sanitize_path(s):      return os.path.join(*sanitized_path) +# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of +# unwanted failures due to missing protocol +def sanitized_Request(url, *args, **kwargs): +    return compat_urllib_request.Request( +        'http:%s' % url if url.startswith('//') else url, *args, **kwargs) + +  def orderedSet(iterable):      """ Remove all duplicates from the input iterable """      res = [] @@ -396,10 +403,14 @@ def _htmlentity_transform(entity):              numstr = '0%s' % numstr          else:              base = 10 -        return compat_chr(int(numstr, base)) +        # See https://github.com/rg3/youtube-dl/issues/7518 +        try: +            return compat_chr(int(numstr, base)) +        except ValueError: +            pass      # Unknown entity in name, return its literal representation -    return ('&%s;' % entity) +    return '&%s;' % entity  def unescapeHTML(s): @@ -652,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):      return hc +def handle_youtubedl_headers(headers): +    filtered_headers = headers + +    if 'Youtubedl-no-compression' in filtered_headers: +        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') +        del filtered_headers['Youtubedl-no-compression'] + +    return filtered_headers + +  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):      """Handler for HTTP requests and responses. @@ -659,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):      the standard headers to every HTTP request and handles gzipped and      deflated responses from web servers. If compression is to be avoided in      a particular request, the original request in the program code only has -    to include the HTTP header "Youtubedl-No-Compression", which will be +    to include the HTTP header "Youtubedl-no-compression", which will be      removed before making the real request.      Part of this code was copied from: @@ -720,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):              # The dict keys are capitalized because of this bug by urllib              if h.capitalize() not in req.headers:                  req.add_header(h, v) -        if 'Youtubedl-no-compression' in req.headers: -            if 'Accept-encoding' in req.headers: -                del req.headers['Accept-encoding'] -            del req.headers['Youtubedl-no-compression'] + +        req.headers = handle_youtubedl_headers(req.headers)          if sys.version_info < (2, 7) and '#' in req.get_full_url():              # Python 2.6 is brain-dead when it comes to fragments @@ -921,6 +940,21 @@ def determine_ext(url, default_ext='unknown_video'):      guess = url.partition('?')[0].rpartition('.')[2]      if re.match(r'^[A-Za-z0-9]+$', guess):          return guess +    elif guess.rstrip('/') in ( +            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', +            'flv', 'f4v', 'f4a', 'f4b', +            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', +            'mkv', 'mka', 'mk3d', +            'avi', 'divx', +            'mov', +            'asf', 'wmv', 'wma', +            '3gp', '3g2', +            'mp3', +            'flac', +            'ape', +            'wav', +            'f4f', 'f4m', 'm3u8', 'smil'): +        return guess.rstrip('/')      else:          return default_ext @@ -1372,6 +1406,15 @@ def remove_end(s, end):      return s +def remove_quotes(s): +    if s is None or len(s) < 2: +        return s +    for quote in ('"', "'", ): +        if s[0] == quote and s[-1] == quote: +            return s[1:-1] +    return s + +  def url_basename(url):      path = compat_urlparse.urlparse(url).path      return path.strip('/').split('/')[-1] @@ -1664,7 +1707,13 @@ def urlencode_postdata(*args, **kargs):  def encode_dict(d, encoding='utf-8'): -    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) +    def encode(v): +        return v.encode(encoding) if isinstance(v, compat_basestring) else v +    return dict((encode(k), encode(v)) for k, v in d.items()) + + +def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): +    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)  US_RATINGS = { @@ -1761,6 +1810,15 @@ def args_to_str(args):      return ' '.join(shlex_quote(a) for a in args) +def error_to_compat_str(err): +    err_str = str(err) +    # On python 2 error byte string must be decoded with proper +    # encoding rather than ascii +    if sys.version_info[0] < 3: +        err_str = err_str.decode(preferredencoding()) +    return err_str + +  def mimetype2ext(mt):      _, _, res = mt.rpartition('/') @@ -1931,15 +1989,15 @@ def match_filter_func(filter_str):  def parse_dfxp_time_expr(time_expr):      if not time_expr: -        return 0.0 +        return      mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)      if mobj:          return float(mobj.group('time_offset')) -    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr) +    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)      if mobj: -        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) +        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))  def srt_subtitles_timecode(seconds): @@ -1975,10 +2033,15 @@ def dfxp2srt(dfxp_data):          raise ValueError('Invalid dfxp/TTML subtitle')      for para, index in zip(paras, itertools.count(1)): -        begin_time = parse_dfxp_time_expr(para.attrib['begin']) +        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))          end_time = parse_dfxp_time_expr(para.attrib.get('end')) +        dur = parse_dfxp_time_expr(para.attrib.get('dur')) +        if begin_time is None: +            continue          if not end_time: -            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) +            if not dur: +                continue +            end_time = begin_time + dur          out.append('%d\n%s --> %s\n%s\n\n' % (              index,              srt_subtitles_timecode(begin_time), | 
