diff options
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x | youtube_dl/YoutubeDL.py | 42 | ||||
-rw-r--r-- | youtube_dl/__init__.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/palcomp3.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/yandexmusic.py | 23 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 61 | ||||
-rw-r--r-- | youtube_dl/jsinterp.py | 99 | ||||
-rw-r--r-- | youtube_dl/options.py | 4 | ||||
-rw-r--r-- | youtube_dl/utils.py | 204 |
8 files changed, 357 insertions, 89 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dad44435f..9e5620eef 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -7,6 +7,7 @@ import collections import copy import datetime import errno +import functools import io import itertools import json @@ -53,6 +54,7 @@ from .compat import ( compat_urllib_request_DataHandler, ) from .utils import ( + _UnsafeExtensionError, age_restricted, args_to_str, bug_reports_message, @@ -129,6 +131,20 @@ if compat_os_name == 'nt': import ctypes +def _catch_unsafe_file_extension(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) + except _UnsafeExtensionError as error: + self.report_error( + '{0} found; to avoid damaging your system, this value is disallowed.' + ' If you believe this is an error{1}'.format( + error_to_compat_str(error), bug_reports_message(','))) + + return wrapper + + class YoutubeDL(object): """YoutubeDL class. @@ -1925,6 +1941,7 @@ class YoutubeDL(object): if self.params.get('forcejson', False): self.to_stdout(json.dumps(self.sanitize_info(info_dict))) + @_catch_unsafe_file_extension def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -2097,18 +2114,26 @@ class YoutubeDL(object): # TODO: Check acodec/vcodec return False - filename_real_ext = os.path.splitext(filename)[1][1:] - filename_wo_ext = ( - os.path.splitext(filename)[0] - if filename_real_ext == info_dict['ext'] - else filename) + exts = [info_dict['ext']] requested_formats = info_dict['requested_formats'] if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): info_dict['ext'] = 'mkv' self.report_warning( 'Requested formats are incompatible for merge and will be merged into mkv.') + exts.append(info_dict['ext']) + # Ensure filename always has a correct extension for successful merge - filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) + def correct_ext(filename, ext=exts[1]): + if filename == '-': + return filename + f_name, f_real_ext = os.path.splitext(filename) + f_real_ext = f_real_ext[1:] + filename_wo_ext = f_name if f_real_ext in exts else filename + if ext is None: + ext = f_real_ext or None + return join_nonempty(filename_wo_ext, ext, delim='.') + + filename = correct_ext(filename) if os.path.exists(encodeFilename(filename)): self.to_screen( '[download] %s has already been downloaded and ' @@ -2118,8 +2143,9 @@ class YoutubeDL(object): new_info = dict(info_dict) new_info.update(f) fname = prepend_extension( - self.prepare_filename(new_info), - 'f%s' % f['format_id'], new_info['ext']) + correct_ext( + self.prepare_filename(new_info), new_info['ext']), + 'f%s' % (f['format_id'],), new_info['ext']) if not ensure_dir_exists(fname): return downloaded.append(fname) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index cc8285eba..06bdfb689 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -21,6 +21,7 @@ from .compat import ( workaround_optparse_bug9161, ) from .utils import ( + _UnsafeExtensionError, DateRange, decodeOption, DEFAULT_OUTTMPL, @@ -173,6 +174,9 @@ def _real_main(argv=None): if opts.ap_mso and opts.ap_mso not in MSO_INFO: parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') + if opts.no_check_extensions: + _UnsafeExtensionError.lenient = True + def parse_retries(retries): if retries in ('inf', 'infinite'): parsed_retries = float('inf') diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py index fb29d83f9..60f7a4d48 100644 --- a/youtube_dl/extractor/palcomp3.py +++ b/youtube_dl/extractor/palcomp3.py @@ -8,7 +8,7 @@ from ..compat import compat_str from ..utils import ( int_or_none, str_or_none, - try_get, + traverse_obj, ) @@ -109,7 +109,7 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE): } name''' - @ classmethod + @classmethod def suitable(cls, url): return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) @@ -118,7 +118,8 @@ class PalcoMP3ArtistIE(PalcoMP3BaseIE): artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] def entries(): - for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): + for music in traverse_obj(artist, ( + 'musics', 'nodes', lambda _, m: m['musicID'])): yield self._parse_music(music) return self.playlist_result( @@ -137,7 +138,7 @@ class PalcoMP3VideoIE(PalcoMP3BaseIE): 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', 'description': 'md5:7043342c09a224598e93546e98e49282', 'upload_date': '20161107', - 'uploader_id': 'maiaramaraisaoficial', + 'uploader_id': '@maiaramaraisaoficial', 'uploader': 'Maiara e Maraisa', } }] diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 84969f8e1..8da5b430f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -106,6 +106,25 @@ class YandexMusicTrackIE(YandexMusicBaseIE): }, { 'url': 'http://music.yandex.com/album/540508/track/4878838', 'only_matching': True, + }, { + 'url': 'https://music.yandex.ru/album/16302456/track/85430762', + 'md5': '11b8d50ab03b57738deeaadf661a0a48', + 'info_dict': { + 'id': '85430762', + 'ext': 'mp3', + 'abr': 128, + 'title': 'Haddadi Von Engst, Phonic Youth, Super Flu - Til The End (Super Flu Remix)', + 'filesize': int, + 'duration': 431.14, + 'track': 'Til The End (Super Flu Remix)', + 'album': 'Til The End', + 'album_artist': 'Haddadi Von Engst, Phonic Youth', + 'artist': 'Haddadi Von Engst, Phonic Youth, Super Flu', + 'release_year': 2021, + 'genre': 'house', + 'disc_number': 1, + 'track_number': 2, + } }] def _real_extract(self, url): @@ -116,10 +135,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'track', tld, url, track_id, 'Downloading track JSON', {'track': '%s:%s' % (track_id, album_id)})['track'] track_title = track['title'] + track_version = track.get('version') + if track_version: + track_title = '%s (%s)' % (track_title, track_version) download_data = self._download_json( 'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id), track_id, 'Downloading track location url JSON', + query={'hq': 1}, headers={'X-Retpath-Y': url}) fd_data = self._download_json( diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 90c16e172..6fe520e9a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1636,7 +1636,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) except ExtractorError as e: - raise ExtractorError('Unable to extract nsig jsi, player_id, func_codefunction code', cause=e) + raise ExtractorError('Unable to extract nsig function code', cause=e) if self.get_param('youtube_print_sig_code'): self.to_screen('Extracted nsig function from {0}:\n{1}\n'.format( player_id, func_code[1])) @@ -1647,7 +1647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except JSInterpreter.Exception as e: self.report_warning( '%s (%s %s)' % ( - 'Unable to decode n-parameter: download likely to be throttled', + 'Unable to decode n-parameter: expect download to be blocked or throttled', error_to_compat_str(e), traceback.format_exc()), video_id=video_id) @@ -1658,13 +1658,52 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_n_function_name(self, jscode): func_name, idx = self._search_regex( - r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z_$][\w$]*)(?:\[(?P<idx>\d+)\])?\([\w$]+\)', - jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) + # new: (b=String.fromCharCode(110),c=a.get(b))&&c=nfunc[idx](c) + # or: (b="nn"[+a.D],c=a.get(b))&&(c=nfunc[idx](c) + # or: (PL(a),b=a.j.n||null)&&(b=nfunc[idx](b) + # or: (b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") + # old: (b=a.get("n"))&&(b=nfunc[idx](b)(?P<c>[a-z])\s*=\s*[a-z]\s* + # older: (b=a.get("n"))&&(b=nfunc(b) + r'''(?x) + \((?:[\w$()\s]+,)*?\s* # ( + (?P<b>[a-z])\s*=\s* # b= + (?: + (?: # expect ,c=a.get(b) (etc) + String\s*\.\s*fromCharCode\s*\(\s*110\s*\)| + "n+"\[\s*\+?s*[\w$.]+\s*] + )\s*(?:,[\w$()\s]+(?=,))*| + (?P<old>[\w$]+) # a (old[er]) + )\s* + (?(old) + # b.get("n") + (?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*? + (?:\.\s*n|\[\s*"n"\s*]|\.\s*get\s*\(\s*"n"\s*\)) + | # ,c=a.get(b) + ,\s*(?P<c>[a-z])\s*=\s*[a-z]\s* + (?:\.\s*[\w$]+\s*|\[\s*[\w$]+\s*]\s*)*? + (?:\[\s*(?P=b)\s*]|\.\s*get\s*\(\s*(?P=b)\s*\)) + ) + # interstitial junk + \s*(?:\|\|\s*null\s*)?(?:\)\s*)?&&\s*(?:\(\s*)? + (?(c)(?P=c)|(?P=b))\s*=\s* # [c|b]= + # nfunc|nfunc[idx] + (?P<nfunc>[a-zA-Z_$][\w$]*)(?:\s*\[(?P<idx>\d+)\])?\s*\(\s*[\w$]+\s*\) + ''', jscode, 'Initial JS player n function name', group=('nfunc', 'idx'), + default=(None, None)) + # thx bashonly: yt-dlp/yt-dlp/pull/10611 + if not func_name: + self.report_warning('Falling back to generic n function search') + return self._search_regex( + r'''(?xs) + (?:(?<=[^\w$])|^) # instead of \b, which ignores $ + (?P<name>(?!\d)[a-zA-Z\d_$]+)\s*=\s*function\((?!\d)[a-zA-Z\d_$]+\) + \s*\{(?:(?!};).)+?["']enhanced_except_ + ''', jscode, 'Initial JS player n function name', group='name') if not idx: return func_name return self._parse_json(self._search_regex( - r'var {0}\s*=\s*(\[.+?\])\s*[,;]'.format(re.escape(func_name)), jscode, + r'var\s+{0}\s*=\s*(\[.+?\])\s*[,;]'.format(re.escape(func_name)), jscode, 'Initial JS player n function list ({0}.{1})'.format(func_name, idx)), func_name, transform_source=js_to_json)[int(idx)] @@ -1679,17 +1718,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): func_name = self._extract_n_function_name(jscode) - # For redundancy - func_code = self._search_regex( - r'''(?xs)%s\s*=\s*function\s*\((?P<var>[\w$]+)\)\s* - # NB: The end of the regex is intentionally kept strict - {(?P<code>.+?}\s*return\ [\w$]+.join\(""\))};''' % func_name, - jscode, 'nsig function', group=('var', 'code'), default=None) - if func_code: - func_code = ([func_code[0]], func_code[1]) - else: - self.write_debug('Extracting nsig function with jsinterp') - func_code = jsi.extract_function_code(func_name) + func_code = jsi.extract_function_code(func_name) self.cache.store('youtube-nsig', player_id, func_code) return jsi, player_id, func_code diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 02adf6678..a616ad070 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -20,7 +20,9 @@ from .compat import ( compat_basestring, compat_chr, compat_collections_chain_map as ChainMap, + compat_filter as filter, compat_itertools_zip_longest as zip_longest, + compat_map as map, compat_str, ) @@ -252,7 +254,7 @@ class Debugger(object): cls.write('=> Raises:', e, '<-|', stmt, level=allow_recursion) raise if cls.ENABLED and stmt.strip(): - if should_ret or not repr(ret) == stmt: + if should_ret or repr(ret) != stmt: cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion) return ret, should_ret return interpret_statement @@ -365,6 +367,8 @@ class JSInterpreter(object): start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, after_op, in_regex_char_group = None, False, True, False skipping = 0 + if skip_delims: + skip_delims = variadic(skip_delims) for idx, char in enumerate(expr): paren_delta = 0 if not in_quote: @@ -391,7 +395,7 @@ class JSInterpreter(object): continue elif pos == 0 and skip_delims: here = expr[idx:] - for s in variadic(skip_delims): + for s in skip_delims: if here.startswith(s) and s: skipping = len(s) - 1 break @@ -412,7 +416,6 @@ class JSInterpreter(object): if delim is None: delim = expr and _MATCHING_PARENS[expr[0]] separated = list(cls._separate(expr, delim, 1)) - if len(separated) < 2: raise cls.Exception('No terminating paren {delim} in {expr!r:.5500}'.format(**locals())) return separated[0][1:].strip(), separated[1].strip() @@ -487,6 +490,7 @@ class JSInterpreter(object): # fails on (eg) if (...) stmt1; else stmt2; sub_statements = list(self._separate(stmt, ';')) or [''] expr = stmt = sub_statements.pop().strip() + for sub_stmt in sub_statements: ret, should_return = self.interpret_statement(sub_stmt, local_vars, allow_recursion) if should_return: @@ -626,8 +630,7 @@ class JSInterpreter(object): if m.group('err'): catch_vars[m.group('err')] = err.error if isinstance(err, JS_Throw) else err catch_vars = local_vars.new_child(m=catch_vars) - err = None - pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion) + err, pending = None, self.interpret_statement(sub_expr, catch_vars, allow_recursion) m = self._FINALLY_RE.match(expr) if m: @@ -801,16 +804,19 @@ class JSInterpreter(object): if op in ('+', '-'): # simplify/adjust consecutive instances of these operators undone = 0 - while len(separated) > 1 and not separated[-1].strip(): + separated = [s.strip() for s in separated] + while len(separated) > 1 and not separated[-1]: undone += 1 separated.pop() if op == '-' and undone % 2 != 0: right_expr = op + right_expr elif op == '+': - while len(separated) > 1 and separated[-1].strip() in self.OP_CHARS: + while len(separated) > 1 and set(separated[-1]) <= self.OP_CHARS: + right_expr = separated.pop() + right_expr + if separated[-1][-1:] in self.OP_CHARS: right_expr = separated.pop() + right_expr # hanging op at end of left => unary + (strip) or - (push right) - left_val = separated[-1] + left_val = separated[-1] if separated else '' for dm_op in ('*', '%', '/', '**'): bodmas = tuple(self._separate(left_val, dm_op, skip_delims=skip_delim)) if len(bodmas) > 1 and not bodmas[-1].strip(): @@ -844,7 +850,7 @@ class JSInterpreter(object): memb = member raise self.Exception('{memb} {msg}'.format(**locals()), expr=expr) - def eval_method(): + def eval_method(variable, member): if (variable, member) == ('console', 'debug'): if Debugger.ENABLED: Debugger.write(self.interpret_expression('[{}]'.format(arg_str), local_vars, allow_recursion)) @@ -852,6 +858,7 @@ class JSInterpreter(object): types = { 'String': compat_str, 'Math': float, + 'Array': list, } obj = local_vars.get(variable) if obj in (JS_Undefined, None): @@ -877,12 +884,29 @@ class JSInterpreter(object): self.interpret_expression(v, local_vars, allow_recursion) for v in self._separate(arg_str)] - if obj == compat_str: + # Fixup prototype call + if isinstance(obj, type): + new_member, rest = member.partition('.')[0::2] + if new_member == 'prototype': + new_member, func_prototype = rest.partition('.')[0::2] + assertion(argvals, 'takes one or more arguments') + assertion(isinstance(argvals[0], obj), 'must bind to type {0}'.format(obj)) + if func_prototype == 'call': + obj = argvals.pop(0) + elif func_prototype == 'apply': + assertion(len(argvals) == 2, 'takes two arguments') + obj, argvals = argvals + assertion(isinstance(argvals, list), 'second argument must be a list') + else: + raise self.Exception('Unsupported Function method ' + func_prototype, expr) + member = new_member + + if obj is compat_str: if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(compat_chr, argvals)) raise self.Exception('Unsupported string method ' + member, expr=expr) - elif obj == float: + elif obj is float: if member == 'pow': assertion(len(argvals) == 2, 'takes two arguments') return argvals[0] ** argvals[1] @@ -901,18 +925,25 @@ class JSInterpreter(object): obj.reverse() return obj elif member == 'slice': - assertion(isinstance(obj, list), 'must be applied on a list') - assertion(len(argvals) == 1, 'takes exactly one argument') - return obj[argvals[0]:] + assertion(isinstance(obj, (list, compat_str)), 'must be applied on a list or string') + # From [1]: + # .slice() - like [:] + # .slice(n) - like [n:] (not [slice(n)] + # .slice(m, n) - like [m:n] or [slice(m, n)] + # [1] https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/slice + assertion(len(argvals) <= 2, 'takes between 0 and 2 arguments') + if len(argvals) < 2: + argvals += (None,) + return obj[slice(*argvals)] elif member == 'splice': assertion(isinstance(obj, list), 'must be applied on a list') assertion(argvals, 'takes one or more arguments') - index, howMany = map(int, (argvals + [len(obj)])[:2]) + index, how_many = map(int, (argvals + [len(obj)])[:2]) if index < 0: index += len(obj) add_items = argvals[2:] res = [] - for i in range(index, min(index + howMany, len(obj))): + for _ in range(index, min(index + how_many, len(obj))): res.append(obj.pop(index)) for i, item in enumerate(add_items): obj.insert(index + i, item) @@ -970,11 +1001,11 @@ class JSInterpreter(object): if remaining: ret, should_abort = self.interpret_statement( - self._named_object(local_vars, eval_method()) + remaining, + self._named_object(local_vars, eval_method(variable, member)) + remaining, local_vars, allow_recursion) return ret, should_return or should_abort else: - return eval_method(), should_return + return eval_method(variable, member), should_return elif md.get('function'): fname = m.group('fname') @@ -1002,28 +1033,25 @@ class JSInterpreter(object): def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} - fields = None - for obj_m in re.finditer( + fields = next(filter(None, ( + obj_m.group('fields') for obj_m in re.finditer( r'''(?xs) {0}\s*\.\s*{1}|{1}\s*=\s*\{{\s* (?P<fields>({2}\s*:\s*function\s*\(.*?\)\s*\{{.*?}}(?:,\s*)?)*) }}\s*; '''.format(_NAME_RE, re.escape(objname), _FUNC_NAME_RE), - self.code): - fields = obj_m.group('fields') - if fields: - break - else: + self.code))), None) + if not fields: raise self.Exception('Could not find object ' + objname) # Currently, it only supports function definitions - fields_m = re.finditer( - r'''(?x) - (?P<key>%s)\s*:\s*function\s*\((?P<args>(?:%s|,)*)\){(?P<code>[^}]+)} - ''' % (_FUNC_NAME_RE, _NAME_RE), - fields) - for f in fields_m: + for f in re.finditer( + r'''(?x) + (?P<key>%s)\s*:\s*function\s*\((?P<args>(?:%s|,)*)\){(?P<code>[^}]+)} + ''' % (_FUNC_NAME_RE, _NAME_RE), + fields): argnames = self.build_arglist(f.group('args')) - obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) + name = remove_quotes(f.group('key')) + obj[name] = function_with_repr(self.build_function(argnames, f.group('code')), 'F<{0}>'.format(name)) return obj @@ -1058,7 +1086,7 @@ class JSInterpreter(object): def extract_function(self, funcname): return function_with_repr( self.extract_function_from_code(*self.extract_function_code(funcname)), - 'F<%s>' % (funcname, )) + 'F<%s>' % (funcname,)) def extract_function_from_code(self, argnames, code, *global_stack): local_vars = {} @@ -1067,7 +1095,7 @@ class JSInterpreter(object): if mobj is None: break start, body_start = mobj.span() - body, remaining = self._separate_at_paren(code[body_start - 1:], '}') + body, remaining = self._separate_at_paren(code[body_start - 1:]) name = self._named_object(local_vars, self.extract_function_from_code( [x.strip() for x in mobj.group('args').split(',')], body, local_vars, *global_stack)) @@ -1095,8 +1123,7 @@ class JSInterpreter(object): argnames = tuple(argnames) def resf(args, kwargs={}, allow_recursion=100): - global_stack[0].update( - zip_longest(argnames, args, fillvalue=None)) + global_stack[0].update(zip_longest(argnames, args, fillvalue=None)) global_stack[0].update(kwargs) var_stack = LocalNameSpace(*global_stack) ret, should_abort = self.interpret_statement(code.replace('\n', ' '), var_stack, allow_recursion - 1) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 434f520d3..61705d1f0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -534,6 +534,10 @@ def parseOpts(overrideArguments=None): action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation') workarounds.add_option( + '--no-check-extensions', + action='store_true', dest='no_check_extensions', default=False, + help='Suppress file extension validation') + workarounds.add_option( '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure', help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 113c913df..ac1e78002 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1717,21 +1717,6 @@ TIMEZONE_NAMES = { 'PST': -8, 'PDT': -7 # Pacific } -KNOWN_EXTENSIONS = ( - 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', - 'flv', 'f4v', 'f4a', 'f4b', - 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', - 'mkv', 'mka', 'mk3d', - 'avi', 'divx', - 'mov', - 'asf', 'wmv', 'wma', - '3gp', '3g2', - 'mp3', - 'flac', - 'ape', - 'wav', - 'f4f', 'f4m', 'm3u8', 'smil') - # needed for sanitizing filenames in restricted mode ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], @@ -3959,19 +3944,22 @@ def parse_duration(s): return duration -def prepend_extension(filename, ext, expected_real_ext=None): +def _change_extension(prepend, filename, ext, expected_real_ext=None): name, real_ext = os.path.splitext(filename) - return ( - '{0}.{1}{2}'.format(name, ext, real_ext) - if not expected_real_ext or real_ext[1:] == expected_real_ext - else '{0}.{1}'.format(filename, ext)) + sanitize_extension = _UnsafeExtensionError.sanitize_extension + if not expected_real_ext or real_ext.partition('.')[0::2] == ('', expected_real_ext): + filename = name + if prepend and real_ext: + sanitize_extension(ext, prepend=prepend) + return ''.join((filename, '.', ext, real_ext)) -def replace_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return '{0}.{1}'.format( - name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, - ext) + # Mitigate path traversal and file impersonation attacks + return '.'.join((filename, sanitize_extension(ext))) + + +prepend_extension = functools.partial(_change_extension, True) +replace_extension = functools.partial(_change_extension, False) def check_executable(exe, args=[]): @@ -6561,3 +6549,169 @@ def join_nonempty(*values, **kwargs): if from_dict is not None: values = (traverse_obj(from_dict, variadic(v)) for v in values) return delim.join(map(compat_str, filter(None, values))) + + +class Namespace(object): + """Immutable namespace""" + + def __init__(self, **kw_attr): + self.__dict__.update(kw_attr) + + def __iter__(self): + return iter(self.__dict__.values()) + + @property + def items_(self): + return self.__dict__.items() + + +MEDIA_EXTENSIONS = Namespace( + common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'), + video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'), + common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'), + audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'), + thumbnails=('jpg', 'png', 'webp'), + # storyboards=('mhtml', ), + subtitles=('srt', 'vtt', 'ass', 'lrc', 'ttml'), + manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'), +) +MEDIA_EXTENSIONS.video = MEDIA_EXTENSIONS.common_video + MEDIA_EXTENSIONS.video +MEDIA_EXTENSIONS.audio = MEDIA_EXTENSIONS.common_audio + MEDIA_EXTENSIONS.audio + +KNOWN_EXTENSIONS = ( + MEDIA_EXTENSIONS.video + MEDIA_EXTENSIONS.audio + + MEDIA_EXTENSIONS.manifests +) + + +class _UnsafeExtensionError(Exception): + """ + Mitigation exception for unwanted file overwrite/path traversal + + Ref: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j + """ + _ALLOWED_EXTENSIONS = frozenset(itertools.chain( + ( # internal + 'description', + 'json', + 'meta', + 'orig', + 'part', + 'temp', + 'uncut', + 'unknown_video', + 'ytdl', + ), + # video + MEDIA_EXTENSIONS.video, ( + 'asx', + 'ismv', + 'm2t', + 'm2ts', + 'm2v', + 'm4s', + 'mng', + 'mp2v', + 'mp4v', + 'mpe', + 'mpeg', + 'mpeg1', + 'mpeg2', + 'mpeg4', + 'mxf', + 'ogm', + 'qt', + 'rm', + 'swf', + 'ts', + 'vob', + 'vp9', + ), + # audio + MEDIA_EXTENSIONS.audio, ( + '3ga', + 'ac3', + 'adts', + 'aif', + 'au', + 'dts', + 'isma', + 'it', + 'mid', + 'mod', + 'mpga', + 'mp1', + 'mp2', + 'mp4a', + 'mpa', + 'ra', + 'shn', + 'xm', + ), + # image + MEDIA_EXTENSIONS.thumbnails, ( + 'avif', + 'bmp', + 'gif', + 'ico', + 'heic', + 'jng', + 'jpeg', + 'jxl', + 'svg', + 'tif', + 'tiff', + 'wbmp', + ), + # subtitle + MEDIA_EXTENSIONS.subtitles, ( + 'dfxp', + 'fs', + 'ismt', + 'json3', + 'sami', + 'scc', + 'srv1', + 'srv2', + 'srv3', + 'ssa', + 'tt', + 'xml', + ), + # others + MEDIA_EXTENSIONS.manifests, + ( + # not used in yt-dl + # *MEDIA_EXTENSIONS.storyboards, + # 'desktop', + # 'ism', + # 'm3u', + # 'sbv', + # 'swp', + # 'url', + # 'webloc', + ))) + + def __init__(self, extension): + super(_UnsafeExtensionError, self).__init__('unsafe file extension: {0!r}'.format(extension)) + self.extension = extension + + # support --no-check-extensions + lenient = False + + @classmethod + def sanitize_extension(cls, extension, **kwargs): + # ... /, *, prepend=False + prepend = kwargs.get('prepend', False) + + if '/' in extension or '\\' in extension: + raise cls(extension) + + if not prepend: + last = extension.rpartition('.')[-1] + if last == 'bin': + extension = last = 'unknown_video' + if not (cls.lenient or last.lower() in cls._ALLOWED_EXTENSIONS): + raise cls(extension) + + return extension |