diff options
Diffstat (limited to 'youtube_dl/utils.py')
-rw-r--r-- | youtube_dl/utils.py | 54 |
1 files changed, 39 insertions, 15 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9595bcf9f..67a847eba 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -86,6 +86,11 @@ std_headers = { } +USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + NO_DEFAULT = object() ENGLISH_MONTH_NAMES = [ @@ -123,7 +128,13 @@ DATE_FORMATS = ( '%d %B %Y', '%d %b %Y', '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %dth %Y', '%b %d %Y', + '%b %dst %Y', + '%b %dnd %Y', + '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', '%b %dth %Y %I:%M', @@ -132,6 +143,7 @@ DATE_FORMATS = ( '%Y/%m/%d', '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', @@ -496,7 +508,7 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) @@ -1178,7 +1190,7 @@ def date_from_str(date_str): return today if date_str == 'yesterday': return today - datetime.timedelta(days=1) - match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) + match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) if match is not None: sign = match.group('sign') time = int(match.group('time')) @@ -1695,6 +1707,16 @@ def base_url(url): return re.match(r'https?://[^?#&]+/', url).group() +def urljoin(base, path): + if not isinstance(path, compat_str) or not path: + return None + if re.match(r'^(?:https?:)?//', path): + return path + if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base): + return None + return compat_urlparse.urljoin(base, path) + + class HEADRequest(compat_urllib_request.Request): def get_method(self): return 'HEAD' @@ -1751,7 +1773,7 @@ def parse_duration(s): s = s.strip() days, hours, mins, secs, ms = [None] * 5 - m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s) + m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s) if m: days, hours, mins, secs, ms = m.groups() else: @@ -1768,11 +1790,11 @@ def parse_duration(s): )? (?: (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* - )?$''', s) + )?Z?$''', s) if m: days, hours, mins, secs, ms = m.groups() else: - m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s) + m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) if m: hours, mins = m.groups() else: @@ -2081,11 +2103,18 @@ def strip_jsonp(code): def js_to_json(code): + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' + SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) + INTEGER_TABLE = ( + (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), + (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + ) + def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v == ',': return "" if v[0] in ("'", '"'): @@ -2096,11 +2125,6 @@ def js_to_json(code): '\\x': '\\u00', }.get(m.group(0), m.group(0)), v[1:-1]) - INTEGER_TABLE = ( - (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16), - (r'^(0+[0-7]+)\s*:?$', 8), - ) - for regex, base in INTEGER_TABLE: im = re.match(regex, v) if im: @@ -2112,11 +2136,11 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*.*?\*/|,(?=\s*[\]}])| + {comment}|,(?={skip}[\]}}])| [a-zA-Z_][.a-zA-Z_0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| - [0-9]+(?=\s*:) - ''', fix_kv, code) + \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| + [0-9]+(?={skip}:) + '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) def qualities(quality_ids): |