aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/utils.py')
-rw-r--r--youtube_dl/utils.py443
1 files changed, 52 insertions, 391 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 2864e5142..7c0fb1592 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
import calendar
import codecs
import contextlib
@@ -8,7 +10,6 @@ import ctypes
import datetime
import email.utils
import errno
-import getpass
import gzip
import itertools
import io
@@ -29,254 +30,19 @@ import traceback
import xml.etree.ElementTree
import zlib
-try:
- import urllib.request as compat_urllib_request
-except ImportError: # Python 2
- import urllib2 as compat_urllib_request
-
-try:
- import urllib.error as compat_urllib_error
-except ImportError: # Python 2
- import urllib2 as compat_urllib_error
-
-try:
- import urllib.parse as compat_urllib_parse
-except ImportError: # Python 2
- import urllib as compat_urllib_parse
-
-try:
- from urllib.parse import urlparse as compat_urllib_parse_urlparse
-except ImportError: # Python 2
- from urlparse import urlparse as compat_urllib_parse_urlparse
-
-try:
- import urllib.parse as compat_urlparse
-except ImportError: # Python 2
- import urlparse as compat_urlparse
-
-try:
- import http.cookiejar as compat_cookiejar
-except ImportError: # Python 2
- import cookielib as compat_cookiejar
-
-try:
- import html.entities as compat_html_entities
-except ImportError: # Python 2
- import htmlentitydefs as compat_html_entities
-
-try:
- import html.parser as compat_html_parser
-except ImportError: # Python 2
- import HTMLParser as compat_html_parser
-
-try:
- import http.client as compat_http_client
-except ImportError: # Python 2
- import httplib as compat_http_client
-
-try:
- from urllib.error import HTTPError as compat_HTTPError
-except ImportError: # Python 2
- from urllib2 import HTTPError as compat_HTTPError
-
-try:
- from urllib.request import urlretrieve as compat_urlretrieve
-except ImportError: # Python 2
- from urllib import urlretrieve as compat_urlretrieve
-
-
-try:
- from subprocess import DEVNULL
- compat_subprocess_get_DEVNULL = lambda: DEVNULL
-except ImportError:
- compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
-
-try:
- from urllib.parse import unquote as compat_urllib_parse_unquote
-except ImportError:
- def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
- if string == '':
- return string
- res = string.split('%')
- if len(res) == 1:
- return string
- if encoding is None:
- encoding = 'utf-8'
- if errors is None:
- errors = 'replace'
- # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
- pct_sequence = b''
- string = res[0]
- for item in res[1:]:
- try:
- if not item:
- raise ValueError
- pct_sequence += item[:2].decode('hex')
- rest = item[2:]
- if not rest:
- # This segment was just a single percent-encoded character.
- # May be part of a sequence of code units, so delay decoding.
- # (Stored in pct_sequence).
- continue
- except ValueError:
- rest = '%' + item
- # Encountered non-percent-encoded characters. Flush the current
- # pct_sequence.
- string += pct_sequence.decode(encoding, errors) + rest
- pct_sequence = b''
- if pct_sequence:
- # Flush the final pct_sequence
- string += pct_sequence.decode(encoding, errors)
- return string
-
-
-try:
- from urllib.parse import parse_qs as compat_parse_qs
-except ImportError: # Python 2
- # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
- # Python 2's version is apparently totally broken
-
- def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
- encoding='utf-8', errors='replace'):
- qs, _coerce_result = qs, unicode
- pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
- r = []
- for name_value in pairs:
- if not name_value and not strict_parsing:
- continue
- nv = name_value.split('=', 1)
- if len(nv) != 2:
- if strict_parsing:
- raise ValueError("bad query field: %r" % (name_value,))
- # Handle case of a control-name with no equal sign
- if keep_blank_values:
- nv.append('')
- else:
- continue
- if len(nv[1]) or keep_blank_values:
- name = nv[0].replace('+', ' ')
- name = compat_urllib_parse_unquote(
- name, encoding=encoding, errors=errors)
- name = _coerce_result(name)
- value = nv[1].replace('+', ' ')
- value = compat_urllib_parse_unquote(
- value, encoding=encoding, errors=errors)
- value = _coerce_result(value)
- r.append((name, value))
- return r
-
- def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
- encoding='utf-8', errors='replace'):
- parsed_result = {}
- pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
- encoding=encoding, errors=errors)
- for name, value in pairs:
- if name in parsed_result:
- parsed_result[name].append(value)
- else:
- parsed_result[name] = [value]
- return parsed_result
-
-try:
- compat_str = unicode # Python 2
-except NameError:
- compat_str = str
-
-try:
- compat_chr = unichr # Python 2
-except NameError:
- compat_chr = chr
-
-try:
- from xml.etree.ElementTree import ParseError as compat_xml_parse_error
-except ImportError: # Python 2.6
- from xml.parsers.expat import ExpatError as compat_xml_parse_error
-
-try:
- from shlex import quote as shlex_quote
-except ImportError: # Python < 3.3
- def shlex_quote(s):
- return "'" + s.replace("'", "'\"'\"'") + "'"
-
-
-def compat_ord(c):
- if type(c) is int: return c
- else: return ord(c)
-
-
-if sys.version_info >= (3, 0):
- compat_getenv = os.getenv
- compat_expanduser = os.path.expanduser
-else:
- # Environment variables should be decoded with filesystem encoding.
- # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)
-
- def compat_getenv(key, default=None):
- env = os.getenv(key, default)
- if env:
- env = env.decode(get_filesystem_encoding())
- return env
-
- # HACK: The default implementations of os.path.expanduser from cpython do not decode
- # environment variables with filesystem encoding. We will work around this by
- # providing adjusted implementations.
- # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
- # for different platforms with correct environment variables decoding.
-
- if os.name == 'posix':
- def compat_expanduser(path):
- """Expand ~ and ~user constructions. If user or $HOME is unknown,
- do nothing."""
- if not path.startswith('~'):
- return path
- i = path.find('/', 1)
- if i < 0:
- i = len(path)
- if i == 1:
- if 'HOME' not in os.environ:
- import pwd
- userhome = pwd.getpwuid(os.getuid()).pw_dir
- else:
- userhome = compat_getenv('HOME')
- else:
- import pwd
- try:
- pwent = pwd.getpwnam(path[1:i])
- except KeyError:
- return path
- userhome = pwent.pw_dir
- userhome = userhome.rstrip('/')
- return (userhome + path[i:]) or '/'
- elif os.name == 'nt' or os.name == 'ce':
- def compat_expanduser(path):
- """Expand ~ and ~user constructs.
-
- If user or $HOME is unknown, do nothing."""
- if path[:1] != '~':
- return path
- i, n = 1, len(path)
- while i < n and path[i] not in '/\\':
- i = i + 1
-
- if 'HOME' in os.environ:
- userhome = compat_getenv('HOME')
- elif 'USERPROFILE' in os.environ:
- userhome = compat_getenv('USERPROFILE')
- elif not 'HOMEPATH' in os.environ:
- return path
- else:
- try:
- drive = compat_getenv('HOMEDRIVE')
- except KeyError:
- drive = ''
- userhome = os.path.join(drive, compat_getenv('HOMEPATH'))
-
- if i != 1: #~user
- userhome = os.path.join(os.path.dirname(userhome), path[1:i])
-
- return userhome + path[i:]
- else:
- compat_expanduser = os.path.expanduser
+from .compat import (
+ compat_chr,
+ compat_getenv,
+ compat_html_entities,
+ compat_html_parser,
+ compat_parse_qs,
+ compat_str,
+ compat_urllib_error,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+ compat_urlparse,
+)
# This is not clearly defined otherwise
@@ -304,14 +70,6 @@ def preferredencoding():
return pref
-if sys.version_info < (3,0):
- def compat_print(s):
- print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
-else:
- def compat_print(s):
- assert type(s) == type(u'')
- print(s)
-
def write_json_file(obj, fn):
""" Encode obj as JSON and write it to fn, atomically """
@@ -394,127 +152,32 @@ def xpath_text(node, xpath, name=None, fatal=False):
return n.text
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class BaseHTMLParser(compat_html_parser.HTMLParser):
- def __init(self):
- compat_html_parser.HTMLParser.__init__(self)
- self.html = None
-
- def loads(self, html):
- self.html = html
- self.feed(html)
- self.close()
-
-class AttrParser(BaseHTMLParser):
- """Modified HTMLParser that isolates a tag with the specified attribute"""
- def __init__(self, attribute, value):
- self.attribute = attribute
- self.value = value
- self.result = None
- self.started = False
- self.depth = {}
- self.watch_startpos = False
- self.error_count = 0
- BaseHTMLParser.__init__(self)
-
- def error(self, message):
- if self.error_count > 10 or self.started:
- raise compat_html_parser.HTMLParseError(message, self.getpos())
- self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
- self.error_count += 1
- self.goahead(1)
-
- def handle_starttag(self, tag, attrs):
- attrs = dict(attrs)
- if self.started:
- self.find_startpos(None)
- if self.attribute in attrs and attrs[self.attribute] == self.value:
- self.result = [tag]
- self.started = True
- self.watch_startpos = True
- if self.started:
- if not tag in self.depth: self.depth[tag] = 0
- self.depth[tag] += 1
-
- def handle_endtag(self, tag):
- if self.started:
- if tag in self.depth: self.depth[tag] -= 1
- if self.depth[self.result[0]] == 0:
- self.started = False
- self.result.append(self.getpos())
-
- def find_startpos(self, x):
- """Needed to put the start position of the result (self.result[1])
- after the opening tag with the requested id"""
- if self.watch_startpos:
- self.watch_startpos = False
- self.result.append(self.getpos())
- handle_entityref = handle_charref = handle_data = handle_comment = \
- handle_decl = handle_pi = unknown_decl = find_startpos
-
- def get_result(self):
- if self.result is None:
- return None
- if len(self.result) != 3:
- return None
- lines = self.html.split('\n')
- lines = lines[self.result[1][0]-1:self.result[2][0]]
- lines[0] = lines[0][self.result[1][1]:]
- if len(lines) == 1:
- lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
- lines[-1] = lines[-1][:self.result[2][1]]
- return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
- AttrParser.parse_endtag = (lambda self, i:
- i + len("</scr'+'ipt>")
- if self.rawdata[i:].startswith("</scr'+'ipt>")
- else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
def get_element_by_id(id, html):
"""Return the content of the tag with the specified ID in the passed HTML document"""
return get_element_by_attribute("id", id, html)
+
def get_element_by_attribute(attribute, value, html):
"""Return the content of the tag with the specified attribute in the passed HTML document"""
- parser = AttrParser(attribute, value)
- try:
- parser.loads(html)
- except compat_html_parser.HTMLParseError:
- pass
- return parser.get_result()
-class MetaParser(BaseHTMLParser):
- """
- Modified HTMLParser that isolates a meta tag with the specified name
- attribute.
- """
- def __init__(self, name):
- BaseHTMLParser.__init__(self)
- self.name = name
- self.content = None
- self.result = None
-
- def handle_starttag(self, tag, attrs):
- if tag != 'meta':
- return
- attrs = dict(attrs)
- if attrs.get('name') == self.name:
- self.result = attrs.get('content')
+ m = re.search(r'''(?xs)
+ <([a-zA-Z0-9:._-]+)
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ \s+%s=['"]?%s['"]?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ \s*>
+ (?P<content>.*?)
+ </\1>
+ ''' % (re.escape(attribute), re.escape(value)), html)
- def get_result(self):
- return self.result
+ if not m:
+ return None
+ res = m.group('content')
-def get_meta_content(name, html):
- """
- Return the content attribute from the meta tag with the given name attribute.
- """
- parser = MetaParser(name)
- try:
- parser.loads(html)
- except compat_html_parser.HTMLParseError:
- pass
- return parser.get_result()
+ if res.startswith('"') or res.startswith("'"):
+ res = res[1:-1]
+
+ return unescapeHTML(res)
def clean_html(html):
@@ -1472,6 +1135,25 @@ def check_executable(exe, args=[]):
return exe
+def get_exe_version(exe, args=['--version'],
+ version_re=r'version\s+([0-9._-a-zA-Z]+)',
+ unrecognized=u'present'):
+ """ Returns the version of the specified executable,
+ or False if the executable is not present """
+ try:
+ out, err = subprocess.Popen(
+ [exe] + args,
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
+ except OSError:
+ return False
+ firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
+ m = re.search(version_re, firstline)
+ if m:
+ return m.group(1)
+ else:
+ return unrecognized
+
+
class PagedList(object):
def __len__(self):
# This is only useful for tests
@@ -1562,7 +1244,7 @@ def escape_rfc3986(s):
"""Escape non-ASCII characters as suggested by RFC 3986"""
if sys.version_info < (3, 0) and isinstance(s, unicode):
s = s.encode('utf-8')
- return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+ return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
@@ -1636,15 +1318,6 @@ def parse_xml(s):
return tree
-if sys.version_info < (3, 0) and sys.platform == 'win32':
- def compat_getpass(prompt, *args, **kwargs):
- if isinstance(prompt, compat_str):
- prompt = prompt.encode(preferredencoding())
- return getpass.getpass(prompt, *args, **kwargs)
-else:
- compat_getpass = getpass.getpass
-
-
US_RATINGS = {
'G': 0,
'PG': 10,
@@ -1702,18 +1375,6 @@ def qualities(quality_ids):
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
-try:
- subprocess_check_output = subprocess.check_output
-except AttributeError:
- def subprocess_check_output(*args, **kwargs):
- assert 'input' not in kwargs
- p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
- output, _ = p.communicate()
- ret = p.poll()
- if ret:
- raise subprocess.CalledProcessError(ret, p.args, output=output)
- return output
-
def limit_length(s, length):
""" Add ellipses to overly long strings """