about summary refs log tree commit diff
path: root/youtube_dl/YoutubeDL.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/YoutubeDL.py')
-rwxr-xr-x youtube_dl/YoutubeDL.py 1358
1 files changed, 979 insertions, 379 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 5036289b0..8367b6e53 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1,13 +1,13 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import absolute_import, unicode_literals
import collections
-import contextlib
+import copy
import datetime
import errno
-import fileinput
+import functools
import io
import itertools
import json
@@ -23,24 +23,41 @@ import sys
import time
import tokenize
import traceback
+import random
+
+try:
+ from ssl import OPENSSL_VERSION
+except ImportError:
+ # Must be Python 2.6, should be built against 1.0.2
+ OPENSSL_VERSION = 'OpenSSL 1.0.2(?)'
+from string import ascii_letters
from .compat import (
compat_basestring,
- compat_cookiejar,
- compat_expanduser,
+ compat_collections_chain_map as ChainMap,
+ compat_filter as filter,
compat_get_terminal_size,
compat_http_client,
+ compat_http_cookiejar_Cookie,
+ compat_http_cookies_SimpleCookie,
+ compat_integer_types,
compat_kwargs,
+ compat_map as map,
+ compat_numeric_types,
+ compat_open as open,
compat_os_name,
compat_str,
compat_tokenize_tokenize,
compat_urllib_error,
+ compat_urllib_parse,
compat_urllib_request,
compat_urllib_request_DataHandler,
)
from .utils import (
+ _UnsafeExtensionError,
age_restricted,
args_to_str,
+ bug_reports_message,
ContentTooShortError,
date_from_str,
DateRange,
@@ -51,12 +68,19 @@ from .utils import (
encode_compat_str,
encodeFilename,
error_to_compat_str,
+ expand_path,
ExtractorError,
format_bytes,
formatSeconds,
+ GeoRestrictedError,
+ int_or_none,
+ ISO3166Utils,
+ join_nonempty,
locked_file,
+ LazyList,
make_HTTPS_handler,
MaxDownloadsReached,
+ orderedSet,
PagedList,
parse_filesize,
PerRequestProxyHandler,
@@ -64,6 +88,7 @@ from .utils import (
PostProcessingError,
preferredencoding,
prepend_extension,
+ process_communicate_or_kill,
register_socks_protocols,
render_table,
replace_extension,
@@ -73,17 +98,23 @@ from .utils import (
sanitize_url,
sanitized_Request,
std_headers,
+ str_or_none,
subtitles_filename,
+ traverse_obj,
UnavailableVideoError,
url_basename,
version_tuple,
write_json_file,
write_string,
+ YoutubeDLCookieJar,
YoutubeDLCookieProcessor,
YoutubeDLHandler,
+ YoutubeDLRedirectHandler,
+ ytdl_is_updateable,
)
from .cache import Cache
from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
+from .extractor.openload import PhantomJSwrapper
from .downloader import get_suitable_downloader
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
@@ -100,6 +131,20 @@ if compat_os_name == 'nt':
import ctypes
+def _catch_unsafe_file_extension(func):
+ @functools.wraps(func)
+ def wrapper(self, *args, **kwargs):
+ try:
+ return func(self, *args, **kwargs)
+ except _UnsafeExtensionError as error:
+ self.report_error(
+ '{0} found; to avoid damaging your system, this value is disallowed.'
+ ' If you believe this is an error{1}'.format(
+ error_to_compat_str(error), bug_reports_message(',')))
+
+ return wrapper
+
+
class YoutubeDL(object):
"""YoutubeDL class.
@@ -130,6 +175,9 @@ class YoutubeDL(object):
username: Username for authentication purposes.
password: Password for authentication purposes.
videopassword: Password for accessing a video.
+ ap_mso: Adobe Pass multiple-system operator identifier.
+ ap_username: Multiple-system operator account username.
+ ap_password: Multiple-system operator account password.
usenetrc: Use netrc for authentication instead.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
@@ -147,6 +195,7 @@ class YoutubeDL(object):
simulate: Do not download the video files.
format: Video format code. See options.py for more information.
outtmpl: Template for output names.
+ outtmpl_na_placeholder: Placeholder for unavailable meta fields.
restrictfilenames: Do not allow "&" and spaces in file names
ignoreerrors: Do not stop on download errors.
force_generic_extractor: Force downloader to use the generic extractor
@@ -155,6 +204,7 @@ class YoutubeDL(object):
playlistend: Playlist item to end at.
playlist_items: Specific indices of playlist to download.
playlistreverse: Download playlist items in reverse order.
+ playlistrandom: Download playlist items in random order.
matchtitle: Download only matching titles.
rejecttitle: Reject downloads for matching titles.
logger: Log messages to a logging.Logger instance.
@@ -196,8 +246,8 @@ class YoutubeDL(object):
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
At the moment, this is only supported by YouTube.
proxy: URL of the proxy server to use
- cn_verification_proxy: URL of the proxy to use for IP address verification
- on Chinese sites. (Experimental)
+ geo_verification_proxy: URL of the proxy to use for IP address verification
+ on geo-restricted sites.
socket_timeout: Time to wait for unresponsive hosts, in seconds
bidi_workaround: Work around buggy terminals without bidirectional text
support, using fribidi
@@ -245,10 +295,19 @@ class YoutubeDL(object):
- "warn": only emit a warning
- "detect_or_warn": check whether we can do anything
about it, warn otherwise (default)
- source_address: (Experimental) Client-side IP address to bind to.
+ source_address: Client-side IP address to bind to.
call_home: Boolean, true iff we are allowed to contact the
youtube-dl servers for debugging.
- sleep_interval: Number of seconds to sleep before each download.
+ sleep_interval: Number of seconds to sleep before each download when
+ used alone or a lower bound of a range for randomized
+ sleep before each download (minimum possible number
+ of seconds to sleep) when used along with
+ max_sleep_interval.
+ max_sleep_interval: Upper bound of a range for randomized sleep before each
+ download (maximum possible number of seconds to sleep).
+ Must only be used along with sleep_interval.
+ Actual sleep time will be a random float from range
+ [sleep_interval; max_sleep_interval].
listformats: Print an overview of available video formats and exit.
list_thumbnails: Print a table of all thumbnails and exit.
match_filter: A function that gets called with the info_dict of
@@ -257,6 +316,15 @@ class YoutubeDL(object):
If it returns None, the video is downloaded.
match_filter_func in utils.py is one example for this.
no_color: Do not emit color codes in output.
+ geo_bypass: Bypass geographic restriction via faking X-Forwarded-For
+ HTTP header
+ geo_bypass_country:
+ Two-letter ISO 3166-1 alpha-2 country code that will be used for
+ explicit geographic restriction bypassing via faking
+ X-Forwarded-For HTTP header
+ geo_bypass_ip_block:
+ IP range in CIDR notation that will be used similarly to
+ geo_bypass_country
The following options determine which downloader is picked:
external_downloader: Executable of the external downloader to call.
@@ -269,20 +337,42 @@ class YoutubeDL(object):
the downloader (see youtube_dl/downloader/common.py):
nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
noresizebuffer, retries, continuedl, noprogress, consoletitle,
- xattr_set_filesize, external_downloader_args, hls_use_mpegts.
+ xattr_set_filesize, external_downloader_args, hls_use_mpegts,
+ http_chunk_size.
The following options are used by the post processors:
- prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
- otherwise prefer avconv.
+ prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
+ otherwise prefer ffmpeg.
+ ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
+ to the binary or its containing directory.
postprocessor_args: A list of additional command-line arguments for the
postprocessor.
+
+ The following options are used by the Youtube extractor:
+ youtube_include_dash_manifest: If True (default), DASH manifests and related
+ data will be downloaded and processed by extractor.
+ You can reduce network I/O by disabling it if you don't
+ care about DASH.
"""
+ _NUMERIC_FIELDS = set((
+ 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+ 'timestamp', 'upload_year', 'upload_month', 'upload_day',
+ 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+ 'average_rating', 'comment_count', 'age_limit',
+ 'start_time', 'end_time',
+ 'chapter_number', 'season_number', 'episode_number',
+ 'track_number', 'disc_number', 'release_year',
+ 'playlist_index',
+ ))
+
params = None
_ies = []
_pps = []
_download_retcode = None
_num_downloads = None
+ _playlist_level = 0
+ _playlist_urls = set()
_screen_file = None
def __init__(self, params=None, auto_init=True):
@@ -304,6 +394,24 @@ class YoutubeDL(object):
self.params.update(params)
self.cache = Cache(self)
+ self._header_cookies = []
+ self._load_cookies_from_headers(self.params.get('http_headers'))
+
+ def check_deprecated(param, option, suggestion):
+ if self.params.get(param) is not None:
+ self.report_warning(
+ '%s is deprecated. Use %s instead.' % (option, suggestion))
+ return True
+ return False
+
+ if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
+ if self.params.get('geo_verification_proxy') is None:
+ self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
+
+ check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
+ check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
+ check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
+
if params.get('bidi_workaround', False):
try:
import pty
@@ -331,10 +439,10 @@ class YoutubeDL(object):
else:
raise
- if (sys.version_info >= (3,) and sys.platform != 'win32' and
- sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
- not params.get('restrictfilenames', False)):
- # On Python 3, the Unicode filesystem API will throw errors (#1474)
+ if (sys.platform != 'win32'
+ and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
+ and not params.get('restrictfilenames', False)):
+ # Unicode filesystem API will throw errors (#1474, #13027)
self.report_warning(
'Assuming --restrict-filenames since file system encoding '
'cannot encode all characters. '
@@ -371,9 +479,9 @@ class YoutubeDL(object):
if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
if idxs:
correct_argv = (
- ['youtube-dl'] +
- [a for i, a in enumerate(argv) if i not in idxs] +
- ['--'] + [argv[i] for i in idxs]
+ ['youtube-dl']
+ + [a for i, a in enumerate(argv) if i not in idxs]
+ + ['--'] + [argv[i] for i in idxs]
)
self.report_warning(
'Long argument string detected. '
@@ -432,10 +540,14 @@ class YoutubeDL(object):
"""Print message to stdout if not in quiet mode."""
return self.to_stdout(message, skip_eol, check_quiet=True)
- def _write_string(self, s, out=None):
+ def _write_string(self, s, out=None, only_once=False, _cache=set()):
+ if only_once and s in _cache:
+ return
write_string(s, out=out, encoding=self.params.get('encoding'))
+ if only_once:
+ _cache.add(s)
- def to_stdout(self, message, skip_eol=False, check_quiet=False):
+ def to_stdout(self, message, skip_eol=False, check_quiet=False, only_once=False):
"""Print message to stdout if not in quiet mode."""
if self.params.get('logger'):
self.params['logger'].debug(message)
@@ -444,9 +556,9 @@ class YoutubeDL(object):
terminator = ['\n', ''][skip_eol]
output = message + terminator
- self._write_string(output, self._screen_file)
+ self._write_string(output, self._screen_file, only_once=only_once)
- def to_stderr(self, message):
+ def to_stderr(self, message, only_once=False):
"""Print message to stderr."""
assert isinstance(message, compat_str)
if self.params.get('logger'):
@@ -454,29 +566,34 @@ class YoutubeDL(object):
else:
message = self._bidi_workaround(message)
output = message + '\n'
- self._write_string(output, self._err_file)
+ self._write_string(output, self._err_file, only_once=only_once)
def to_console_title(self, message):
if not self.params.get('consoletitle', False):
return
- if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
- # c_wchar_p() might not be necessary if `message` is
- # already of type unicode()
- ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+ if compat_os_name == 'nt':
+ if ctypes.windll.kernel32.GetConsoleWindow():
+ # c_wchar_p() might not be necessary if `message` is
+ # already of type unicode()
+ ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
elif 'TERM' in os.environ:
self._write_string('\033]0;%s\007' % message, self._screen_file)
def save_console_title(self):
if not self.params.get('consoletitle', False):
return
- if 'TERM' in os.environ:
+ if self.params.get('simulate', False):
+ return
+ if compat_os_name != 'nt' and 'TERM' in os.environ:
# Save the title on stack
self._write_string('\033[22;0t', self._screen_file)
def restore_console_title(self):
if not self.params.get('consoletitle', False):
return
- if 'TERM' in os.environ:
+ if self.params.get('simulate', False):
+ return
+ if compat_os_name != 'nt' and 'TERM' in os.environ:
# Restore the title from stack
self._write_string('\033[23;0t', self._screen_file)
@@ -488,9 +605,9 @@ class YoutubeDL(object):
self.restore_console_title()
if self.params.get('cookiefile') is not None:
- self.cookiejar.save()
+ self.cookiejar.save(ignore_discard=True, ignore_expires=True)
- def trouble(self, message=None, tb=None):
+ def trouble(self, *args, **kwargs):
"""Determine action to take when a download problem appears.
Depending on if the downloader has been configured to ignore
@@ -499,6 +616,11 @@ class YoutubeDL(object):
tb, if given, is additional traceback information.
"""
+ # message=None, tb=None, is_error=True
+ message = args[0] if len(args) > 0 else kwargs.get('message', None)
+ tb = args[1] if len(args) > 1 else kwargs.get('tb', None)
+ is_error = args[2] if len(args) > 2 else kwargs.get('is_error', True)
+
if message is not None:
self.to_stderr(message)
if self.params.get('verbose'):
@@ -511,7 +633,10 @@ class YoutubeDL(object):
else:
tb_data = traceback.format_list(traceback.extract_stack())
tb = ''.join(tb_data)
- self.to_stderr(tb)
+ if tb:
+ self.to_stderr(tb)
+ if not is_error:
+ return
if not self.params.get('ignoreerrors', False):
if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
exc_info = sys.exc_info()[1].exc_info
@@ -520,7 +645,7 @@ class YoutubeDL(object):
raise DownloadError(message, exc_info)
self._download_retcode = 1
- def report_warning(self, message):
+ def report_warning(self, message, only_once=False):
'''
Print the message to stderr, it will be prefixed with 'WARNING:'
If stderr is a tty file the 'WARNING:' will be colored
@@ -535,9 +660,9 @@ class YoutubeDL(object):
else:
_msg_header = 'WARNING:'
warning_message = '%s %s' % (_msg_header, message)
- self.to_stderr(warning_message)
+ self.to_stderr(warning_message, only_once=only_once)
- def report_error(self, message, tb=None):
+ def report_error(self, message, *args, **kwargs):
'''
Do the same as trouble, but prefixes the message with 'ERROR:', colored
in red if stderr is a tty file.
@@ -546,8 +671,28 @@ class YoutubeDL(object):
_msg_header = '\033[0;31mERROR:\033[0m'
else:
_msg_header = 'ERROR:'
- error_message = '%s %s' % (_msg_header, message)
- self.trouble(error_message, tb)
+ kwargs['message'] = '%s %s' % (_msg_header, message)
+ self.trouble(*args, **kwargs)
+
+ def write_debug(self, message, only_once=False):
+ '''Log debug message or print message to stderr'''
+ if not self.params.get('verbose', False):
+ return
+ message = '[debug] {0}'.format(message)
+ if self.params.get('logger'):
+ self.params['logger'].debug(message)
+ else:
+ self.to_stderr(message, only_once)
+
+ def report_unscoped_cookies(self, *args, **kwargs):
+ # message=None, tb=False, is_error=False
+ if len(args) <= 2:
+ kwargs.setdefault('is_error', False)
+ if len(args) <= 0:
+ kwargs.setdefault(
+ 'message',
+ 'Unscoped cookies are not allowed: please specify some sort of scoping')
+ self.report_error(*args, **kwargs)
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
@@ -565,10 +710,7 @@ class YoutubeDL(object):
autonumber_size = self.params.get('autonumber_size')
if autonumber_size is None:
autonumber_size = 5
- autonumber_templ = '%0' + str(autonumber_size) + 'd'
- template_dict['autonumber'] = autonumber_templ % self._num_downloads
- if template_dict.get('playlist_index') is not None:
- template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
+ template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
if template_dict.get('resolution') is None:
if template_dict.get('width') and template_dict.get('height'):
template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
@@ -580,15 +722,64 @@ class YoutubeDL(object):
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
restricted=self.params.get('restrictfilenames'),
- is_id=(k == 'id'))
- template_dict = dict((k, sanitize(k, v))
+ is_id=(k == 'id' or k.endswith('_id')))
+ template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
for k, v in template_dict.items()
if v is not None and not isinstance(v, (list, tuple, dict)))
- template_dict = collections.defaultdict(lambda: 'NA', template_dict)
+ template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict)
outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
- tmpl = compat_expanduser(outtmpl)
- filename = tmpl % template_dict
+
+ # For fields playlist_index and autonumber convert all occurrences
+ # of %(field)s to %(field)0Nd for backward compatibility
+ field_size_compat_map = {
+ 'playlist_index': len(str(template_dict['n_entries'])),
+ 'autonumber': autonumber_size,
+ }
+ FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
+ mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
+ if mobj:
+ outtmpl = re.sub(
+ FIELD_SIZE_COMPAT_RE,
+ r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
+ outtmpl)
+
+ # Missing numeric fields used together with integer presentation types
+ # in a format specification would break the argument substitution, since
+ # the string NA placeholder is returned for missing fields. Patch the
+ # output template so missing fields use the string presentation type.
+ for numeric_field in self._NUMERIC_FIELDS:
+ if numeric_field not in template_dict:
+ # As of [1] format syntax is:
+ # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
+ # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
+ FORMAT_RE = r'''(?x)
+ (?<!%)
+ %
+ \({0}\) # mapping key
+ (?:[#0\-+ ]+)? # conversion flags (optional)
+ (?:\d+)? # minimum field width (optional)
+ (?:\.\d+)? # precision (optional)
+ [hlL]? # length modifier (optional)
+ [diouxXeEfFgGcrs%] # conversion type
+ '''
+ outtmpl = re.sub(
+ FORMAT_RE.format(numeric_field),
+ r'%({0})s'.format(numeric_field), outtmpl)
+
+ # expand_path translates '%%' into '%' and '$$' into '$'
+ # respectively, which is not what we want since we need to keep
+ # '%%' intact for the template dict substitution step. Work around
+ # this with a boundary-like separator hack.
+ sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+ outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+
+ # outtmpl should be expand_path'ed before template dict substitution
+ # because meta fields may contain env variables we don't want to
+ # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+ # title "Hello $PATH", we don't want `$PATH` to be expanded.
+ filename = expand_path(outtmpl).replace(sep, '') % template_dict
+
# Temporary fix for #4787
# 'Treat' all problem characters by passing filename through preferredencoding
# to workaround encoding issues with subprocess on python2 @ Windows
@@ -596,7 +787,7 @@ class YoutubeDL(object):
filename = encodeFilename(filename, True).decode(preferredencoding())
return sanitize_path(filename)
except ValueError as err:
- self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
+ self.report_error('Error in output template: ' + error_to_compat_str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
return None
def _match_entry(self, info_dict, incomplete):
@@ -649,11 +840,20 @@ class YoutubeDL(object):
def extract_info(self, url, download=True, ie_key=None, extra_info={},
process=True, force_generic_extractor=False):
- '''
- Returns a list with a dictionary for each video we find.
- If 'download', also downloads the videos.
- extra_info is a dict containing the extra values to add to each result
- '''
+ """
+ Return a list with a dictionary for each video extracted.
+
+ Arguments:
+ url -- URL to extract
+
+ Keyword arguments:
+ download -- whether to download videos during extraction
+ ie_key -- extractor key hint
+ extra_info -- dictionary containing the extra values to add to each result
+ process -- whether to resolve all unresolved references (URLs, playlist items),
+ must be True for download to work.
+ force_generic_extractor -- force using the generic extractor
+ """
if not ie_key and force_generic_extractor:
ie_key = 'Generic'
@@ -672,34 +872,123 @@ class YoutubeDL(object):
self.report_warning('The program functionality for this site has been marked as broken, '
'and will probably not work.')
+ return self.__extract_info(url, ie, download, extra_info, process)
+ else:
+ self.report_error('no suitable InfoExtractor for URL %s' % url)
+
+ def __handle_extraction_exceptions(func):
+ def wrapper(self, *args, **kwargs):
try:
- ie_result = ie.extract(url)
- if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
- break
- if isinstance(ie_result, list):
- # Backwards compatibility: old IE result format
- ie_result = {
- '_type': 'compat_list',
- 'entries': ie_result,
- }
- self.add_default_extra_info(ie_result, ie, url)
- if process:
- return self.process_ie_result(ie_result, download, extra_info)
- else:
- return ie_result
+ return func(self, *args, **kwargs)
+ except GeoRestrictedError as e:
+ msg = e.msg
+ if e.countries:
+ msg += '\nThis video is available in %s.' % ', '.join(
+ map(ISO3166Utils.short2full, e.countries))
+ msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
+ self.report_error(msg)
except ExtractorError as e: # An error we somewhat expected
- self.report_error(compat_str(e), e.format_traceback())
- break
+ self.report_error(compat_str(e), tb=e.format_traceback())
except MaxDownloadsReached:
raise
except Exception as e:
if self.params.get('ignoreerrors', False):
self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
- break
else:
raise
+ return wrapper
+
+ def _remove_cookie_header(self, http_headers):
+ """Filters out `Cookie` header from an `http_headers` dict
+ The `Cookie` header is removed to prevent leaks as a result of unscoped cookies.
+ See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
+
+ @param http_headers An `http_headers` dict from which any `Cookie` header
+ should be removed, or None
+ """
+ return dict(filter(lambda pair: pair[0].lower() != 'cookie', (http_headers or {}).items()))
+
+ def _load_cookies(self, data, **kwargs):
+ """Loads cookies from a `Cookie` header
+
+ This tries to work around the security vulnerability of passing cookies to every domain.
+
+ @param data The Cookie header as a string to load the cookies from
+ @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains
+ If `True`, save cookies for later to be stored in the jar with a limited scope
+ If a URL, save cookies in the jar with the domain of the URL
+ """
+ # autoscope=True (kw-only)
+ autoscope = kwargs.get('autoscope', True)
+
+ for cookie in compat_http_cookies_SimpleCookie(data).values() if data else []:
+ if autoscope and any(cookie.values()):
+ raise ValueError('Invalid syntax in Cookie Header')
+
+ domain = cookie.get('domain') or ''
+ expiry = cookie.get('expires')
+ if expiry == '': # 0 is valid so we check for `''` explicitly
+ expiry = None
+ prepared_cookie = compat_http_cookiejar_Cookie(
+ cookie.get('version') or 0, cookie.key, cookie.value, None, False,
+ domain, True, True, cookie.get('path') or '', bool(cookie.get('path')),
+ bool(cookie.get('secure')), expiry, False, None, None, {})
+
+ if domain:
+ self.cookiejar.set_cookie(prepared_cookie)
+ elif autoscope is True:
+ self.report_warning(
+ 'Passing cookies as a header is a potential security risk; '
+ 'they will be scoped to the domain of the downloaded urls. '
+ 'Please consider loading cookies from a file or browser instead.',
+ only_once=True)
+ self._header_cookies.append(prepared_cookie)
+ elif autoscope:
+ self.report_warning(
+ 'The extractor result contains an unscoped cookie as an HTTP header. '
+ 'If you are specifying an input URL, ' + bug_reports_message(),
+ only_once=True)
+ self._apply_header_cookies(autoscope, [prepared_cookie])
+ else:
+ self.report_unscoped_cookies()
+
+ def _load_cookies_from_headers(self, headers):
+ self._load_cookies(traverse_obj(headers, 'cookie', casesense=False))
+
+ def _apply_header_cookies(self, url, cookies=None):
+ """This method applies stray header cookies to the provided url
+
+ This loads header cookies and scopes them to the domain provided in `url`.
+ While this is not ideal, it helps reduce the risk of them being sent to
+ an unintended destination.
+ """
+ parsed = compat_urllib_parse.urlparse(url)
+ if not parsed.hostname:
+ return
+
+ for cookie in map(copy.copy, cookies or self._header_cookies):
+ cookie.domain = '.' + parsed.hostname
+ self.cookiejar.set_cookie(cookie)
+
+ @__handle_extraction_exceptions
+ def __extract_info(self, url, ie, download, extra_info, process):
+ # Compat with passing cookies in http headers
+ self._apply_header_cookies(url)
+
+ ie_result = ie.extract(url)
+ if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
+ return
+ if isinstance(ie_result, list):
+ # Backwards compatibility: old IE result format
+ ie_result = {
+ '_type': 'compat_list',
+ 'entries': ie_result,
+ }
+ self.add_default_extra_info(ie_result, ie, url)
+ if process:
+ return self.process_ie_result(ie_result, download, extra_info)
else:
- self.report_error('no suitable InfoExtractor for URL %s' % url)
+ return ie_result
def add_default_extra_info(self, ie_result, ie, url):
self.add_extra_info(ie_result, {
@@ -711,7 +1000,7 @@ class YoutubeDL(object):
def process_ie_result(self, ie_result, download=True, extra_info={}):
"""
- Take the result of the ie(may be modified) and resolve all unresolved
+ Take the result of the ie (may be modified) and resolve all unresolved
references (URLs, playlist items).
It will also download the videos if 'download'.
@@ -722,10 +1011,11 @@ class YoutubeDL(object):
if result_type in ('url', 'url_transparent'):
ie_result['url'] = sanitize_url(ie_result['url'])
extract_flat = self.params.get('extract_flat', False)
- if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
- extract_flat is True):
- if self.params.get('forcejson', False):
- self.to_stdout(json.dumps(ie_result))
+ if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
+ or extract_flat is True):
+ self.__forced_printings(
+ ie_result, self.prepare_filename(ie_result),
+ incomplete=True)
return ie_result
if result_type == 'video':
@@ -744,112 +1034,53 @@ class YoutubeDL(object):
ie_result['url'], ie_key=ie_result.get('ie_key'),
extra_info=extra_info, download=False, process=False)
+ # extract_info may return None when ignoreerrors is enabled and
+ # extraction failed with an error; don't crash, just return early
+ # in this case
+ if not info:
+ return info
+
force_properties = dict(
(k, v) for k, v in ie_result.items() if v is not None)
- for f in ('_type', 'url', 'ie_key'):
+ for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
if f in force_properties:
del force_properties[f]
new_result = info.copy()
new_result.update(force_properties)
- assert new_result.get('_type') != 'url_transparent'
+ # Extracted info may not be a video result (i.e.
+ # info.get('_type', 'video') != 'video') but rather a url or
+ # url_transparent result. In such cases the outer metadata (from
+ # ie_result) should be propagated to the inner one (info). For this to
+ # happen, the _type of info must be overridden with url_transparent.
+ # This fixes the issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
+ if new_result.get('_type') == 'url':
+ new_result['_type'] = 'url_transparent'
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
- elif result_type == 'playlist' or result_type == 'multi_video':
- # We process each entry in the playlist
- playlist = ie_result.get('title') or ie_result.get('id')
- self.to_screen('[download] Downloading playlist: %s' % playlist)
-
- playlist_results = []
-
- playliststart = self.params.get('playliststart', 1) - 1
- playlistend = self.params.get('playlistend')
- # For backwards compatibility, interpret -1 as whole list
- if playlistend == -1:
- playlistend = None
-
- playlistitems_str = self.params.get('playlist_items')
- playlistitems = None
- if playlistitems_str is not None:
- def iter_playlistitems(format):
- for string_segment in format.split(','):
- if '-' in string_segment:
- start, end = string_segment.split('-')
- for item in range(int(start), int(end) + 1):
- yield int(item)
- else:
- yield int(string_segment)
- playlistitems = iter_playlistitems(playlistitems_str)
-
- ie_entries = ie_result['entries']
- if isinstance(ie_entries, list):
- n_all_entries = len(ie_entries)
- if playlistitems:
- entries = [
- ie_entries[i - 1] for i in playlistitems
- if -n_all_entries <= i - 1 < n_all_entries]
- else:
- entries = ie_entries[playliststart:playlistend]
- n_entries = len(entries)
+ elif result_type in ('playlist', 'multi_video'):
+ # Protect from infinite recursion due to recursively nested playlists
+ # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
+ webpage_url = ie_result.get('webpage_url') # not all pl/mv have this
+ if webpage_url and webpage_url in self._playlist_urls:
self.to_screen(
- '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
- (ie_result['extractor'], playlist, n_all_entries, n_entries))
- elif isinstance(ie_entries, PagedList):
- if playlistitems:
- entries = []
- for item in playlistitems:
- entries.extend(ie_entries.getslice(
- item - 1, item
- ))
- else:
- entries = ie_entries.getslice(
- playliststart, playlistend)
- n_entries = len(entries)
- self.to_screen(
- '[%s] playlist %s: Downloading %d videos' %
- (ie_result['extractor'], playlist, n_entries))
- else: # iterable
- if playlistitems:
- entry_list = list(ie_entries)
- entries = [entry_list[i - 1] for i in playlistitems]
- else:
- entries = list(itertools.islice(
- ie_entries, playliststart, playlistend))
- n_entries = len(entries)
- self.to_screen(
- '[%s] playlist %s: Downloading %d videos' %
- (ie_result['extractor'], playlist, n_entries))
-
- if self.params.get('playlistreverse', False):
- entries = entries[::-1]
-
- for i, entry in enumerate(entries, 1):
- self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
- extra = {
- 'n_entries': n_entries,
- 'playlist': playlist,
- 'playlist_id': ie_result.get('id'),
- 'playlist_title': ie_result.get('title'),
- 'playlist_index': i + playliststart,
- 'extractor': ie_result['extractor'],
- 'webpage_url': ie_result['webpage_url'],
- 'webpage_url_basename': url_basename(ie_result['webpage_url']),
- 'extractor_key': ie_result['extractor_key'],
- }
-
- reason = self._match_entry(entry, incomplete=True)
- if reason is not None:
- self.to_screen('[download] ' + reason)
- continue
+ '[download] Skipping already downloaded playlist: %s'
+ % ie_result.get('title') or ie_result.get('id'))
+ return
- entry_result = self.process_ie_result(entry,
- download=download,
- extra_info=extra)
- playlist_results.append(entry_result)
- ie_result['entries'] = playlist_results
- self.to_screen('[download] Finished downloading playlist: %s' % playlist)
- return ie_result
+ self._playlist_level += 1
+ self._playlist_urls.add(webpage_url)
+ new_result = dict((k, v) for k, v in extra_info.items() if k not in ie_result)
+ if new_result:
+ new_result.update(ie_result)
+ ie_result = new_result
+ try:
+ return self.__process_playlist(ie_result, download)
+ finally:
+ self._playlist_level -= 1
+ if not self._playlist_level:
+ self._playlist_urls.clear()
elif result_type == 'compat_list':
self.report_warning(
'Extractor %s returned a compat_list result. '
@@ -874,6 +1105,123 @@ class YoutubeDL(object):
else:
raise Exception('Invalid result type: %s' % result_type)
+ def __process_playlist(self, ie_result, download):
+ # We process each entry in the playlist
+ playlist = ie_result.get('title') or ie_result.get('id')
+
+ self.to_screen('[download] Downloading playlist: %s' % playlist)
+
+ playlist_results = []
+
+ playliststart = self.params.get('playliststart', 1) - 1
+ playlistend = self.params.get('playlistend')
+ # For backwards compatibility, interpret -1 as whole list
+ if playlistend == -1:
+ playlistend = None
+
+ playlistitems_str = self.params.get('playlist_items')
+ playlistitems = None
+ if playlistitems_str is not None:
+ def iter_playlistitems(format):
+ for string_segment in format.split(','):
+ if '-' in string_segment:
+ start, end = string_segment.split('-')
+ for item in range(int(start), int(end) + 1):
+ yield int(item)
+ else:
+ yield int(string_segment)
+ playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
+
+ ie_entries = ie_result['entries']
+
+ def make_playlistitems_entries(list_ie_entries):
+ num_entries = len(list_ie_entries)
+ return [
+ list_ie_entries[i - 1] for i in playlistitems
+ if -num_entries <= i - 1 < num_entries]
+
+ def report_download(num_entries):
+ self.to_screen(
+ '[%s] playlist %s: Downloading %d videos' %
+ (ie_result['extractor'], playlist, num_entries))
+
+ if isinstance(ie_entries, list):
+ n_all_entries = len(ie_entries)
+ if playlistitems:
+ entries = make_playlistitems_entries(ie_entries)
+ else:
+ entries = ie_entries[playliststart:playlistend]
+ n_entries = len(entries)
+ self.to_screen(
+ '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
+ (ie_result['extractor'], playlist, n_all_entries, n_entries))
+ elif isinstance(ie_entries, PagedList):
+ if playlistitems:
+ entries = []
+ for item in playlistitems:
+ entries.extend(ie_entries.getslice(
+ item - 1, item
+ ))
+ else:
+ entries = ie_entries.getslice(
+ playliststart, playlistend)
+ n_entries = len(entries)
+ report_download(n_entries)
+ else: # iterable
+ if playlistitems:
+ entries = make_playlistitems_entries(list(itertools.islice(
+ ie_entries, 0, max(playlistitems))))
+ else:
+ entries = list(itertools.islice(
+ ie_entries, playliststart, playlistend))
+ n_entries = len(entries)
+ report_download(n_entries)
+
+ if self.params.get('playlistreverse', False):
+ entries = entries[::-1]
+
+ if self.params.get('playlistrandom', False):
+ random.shuffle(entries)
+
+ x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
+
+ for i, entry in enumerate(entries, 1):
+ self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
+ # This __x_forwarded_for_ip thing is a bit ugly but requires
+ # minimal changes
+ if x_forwarded_for:
+ entry['__x_forwarded_for_ip'] = x_forwarded_for
+ extra = {
+ 'n_entries': n_entries,
+ 'playlist': playlist,
+ 'playlist_id': ie_result.get('id'),
+ 'playlist_title': ie_result.get('title'),
+ 'playlist_uploader': ie_result.get('uploader'),
+ 'playlist_uploader_id': ie_result.get('uploader_id'),
+ 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
+ 'extractor': ie_result['extractor'],
+ 'webpage_url': ie_result['webpage_url'],
+ 'webpage_url_basename': url_basename(ie_result['webpage_url']),
+ 'extractor_key': ie_result['extractor_key'],
+ }
+
+ reason = self._match_entry(entry, incomplete=True)
+ if reason is not None:
+ self.to_screen('[download] ' + reason)
+ continue
+
+ entry_result = self.__process_iterable_entry(entry, download, extra)
+ # TODO: skip failed (empty) entries?
+ playlist_results.append(entry_result)
+ ie_result['entries'] = playlist_results
+ self.to_screen('[download] Finished downloading playlist: %s' % playlist)
+ return ie_result
+
+ @__handle_extraction_exceptions
+ def __process_iterable_entry(self, entry, download, extra_info):
+ return self.process_ie_result(
+ entry, download=download, extra_info=extra_info)
+
def _build_format_filter(self, filter_spec):
" Returns a function to filter the formats according to the filter_spec "
@@ -886,7 +1234,7 @@ class YoutubeDL(object):
'!=': operator.ne,
}
operator_rex = re.compile(r'''(?x)\s*
- (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
+ (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
(?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
$
@@ -908,21 +1256,24 @@ class YoutubeDL(object):
if not m:
STR_OPERATORS = {
'=': operator.eq,
- '!=': operator.ne,
'^=': lambda attr, value: attr.startswith(value),
'$=': lambda attr, value: attr.endswith(value),
'*=': lambda attr, value: value in attr,
}
str_operator_rex = re.compile(r'''(?x)
- \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
- \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
+ \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id|language)
+ \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
\s*(?P<value>[a-zA-Z0-9._-]+)
\s*$
''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
m = str_operator_rex.search(filter_spec)
if m:
comparison_value = m.group('value')
- op = STR_OPERATORS[m.group('op')]
+ str_op = STR_OPERATORS[m.group('op')]
+ if m.group('negation'):
+ op = lambda attr, value: not str_op(attr, value)
+ else:
+ op = str_op
if not m:
raise ValueError('Invalid filter specification %r' % filter_spec)
@@ -934,6 +1285,30 @@ class YoutubeDL(object):
return op(actual_value, comparison_value)
return _filter
+ def _default_format_spec(self, info_dict, download=True):
+
+ def can_merge():
+ merger = FFmpegMergerPP(self)
+ return merger.available and merger.can_merge()
+
+ def prefer_best():
+ if self.params.get('simulate', False):
+ return False
+ if not download:
+ return False
+ if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
+ return True
+ if info_dict.get('is_live'):
+ return True
+ if not can_merge():
+ return True
+ return False
+
+ req_format_list = ['bestvideo+bestaudio', 'best']
+ if prefer_best():
+ req_format_list.reverse()
+ return '/'.join(req_format_list)
+
def build_format_selector(self, format_spec):
def syntax_error(note, start):
message = (
@@ -1029,6 +1404,8 @@ class YoutubeDL(object):
group = _parse_format_selection(tokens, inside_group=True)
current_selector = FormatSelector(GROUP, group, [])
elif string == '+':
+ if inside_merge:
+ raise syntax_error('Unexpected "+"', start)
video_selector = current_selector
audio_selector = _parse_format_selection(tokens, inside_merge=True)
if not video_selector or not audio_selector:
@@ -1046,9 +1423,9 @@ class YoutubeDL(object):
if isinstance(selector, list):
fs = [_build_selector_function(s) for s in selector]
- def selector_function(formats):
+ def selector_function(ctx):
for f in fs:
- for format in f(formats):
+ for format in f(ctx):
yield format
return selector_function
elif selector.type == GROUP:
@@ -1056,17 +1433,17 @@ class YoutubeDL(object):
elif selector.type == PICKFIRST:
fs = [_build_selector_function(s) for s in selector.selector]
- def selector_function(formats):
+ def selector_function(ctx):
for f in fs:
- picked_formats = list(f(formats))
+ picked_formats = list(f(ctx))
if picked_formats:
return picked_formats
return []
elif selector.type == SINGLE:
format_spec = selector.selector
- def selector_function(formats):
- formats = list(formats)
+ def selector_function(ctx):
+ formats = list(ctx['formats'])
if not formats:
return
if format_spec == 'all':
@@ -1079,9 +1456,10 @@ class YoutubeDL(object):
if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
if audiovideo_formats:
yield audiovideo_formats[format_idx]
- # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
- elif (all(f.get('acodec') != 'none' for f in formats) or
- all(f.get('vcodec') != 'none' for f in formats)):
+ # for extractors with incomplete formats (audio only (soundcloud)
+ # or video only (imgur)) we will fallback to best/worst
+ # {video,audio}-only format
+ elif ctx['incomplete_formats']:
yield formats[format_idx]
elif format_spec == 'bestaudio':
audio_formats = [
@@ -1153,19 +1531,19 @@ class YoutubeDL(object):
'abr': formats_info[1].get('abr'),
'ext': output_ext,
}
- video_selector, audio_selector = map(_build_selector_function, selector.selector)
- def selector_function(formats):
- formats = list(formats)
- for pair in itertools.product(video_selector(formats), audio_selector(formats)):
+ def selector_function(ctx):
+ selector_fn = lambda x: _build_selector_function(x)(ctx)
+ for pair in itertools.product(*map(selector_fn, selector.selector)):
yield _merge(pair)
filters = [self._build_format_filter(f) for f in selector.filters]
- def final_selector(formats):
+ def final_selector(ctx):
+ ctx_copy = dict(ctx)
for _filter in filters:
- formats = list(filter(_filter, formats))
- return selector_function(formats)
+ ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
+ return selector_function(ctx_copy)
return final_selector
stream = io.BytesIO(format_spec.encode('utf-8'))
@@ -1197,24 +1575,73 @@ class YoutubeDL(object):
parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
return _build_selector_function(parsed_selector)
- def _calc_headers(self, info_dict):
- res = std_headers.copy()
+ def _calc_headers(self, info_dict, load_cookies=False):
+ if load_cookies: # For --load-info-json
+ # load cookies from http_headers in legacy info.json
+ self._load_cookies(traverse_obj(info_dict, ('http_headers', 'Cookie'), casesense=False),
+ autoscope=info_dict['url'])
+ # load scoped cookies from info.json
+ self._load_cookies(info_dict.get('cookies'), autoscope=False)
- add_headers = info_dict.get('http_headers')
- if add_headers:
- res.update(add_headers)
-
- cookies = self._calc_cookies(info_dict)
+ cookies = self.cookiejar.get_cookies_for_url(info_dict['url'])
if cookies:
- res['Cookie'] = cookies
+ # Make a string like name1=val1; attr1=a_val1; ...name2=val2; ...
+ # By convention a cookie name can't be a well-known attribute name
+ # so this syntax is unambiguous and can be parsed by (eg) SimpleCookie
+ encoder = compat_http_cookies_SimpleCookie()
+ values = []
+ attributes = (('Domain', '='), ('Path', '='), ('Secure',), ('Expires', '='), ('Version', '='))
+ attributes = tuple([x[0].lower()] + list(x) for x in attributes)
+ for cookie in cookies:
+ _, value = encoder.value_encode(cookie.value)
+ # Py 2 '' --> '', Py 3 '' --> '""'
+ if value == '':
+ value = '""'
+ values.append('='.join((cookie.name, value)))
+ for attr in attributes:
+ value = getattr(cookie, attr[0], None)
+ if value:
+ values.append('%s%s' % (''.join(attr[1:]), value if len(attr) == 3 else ''))
+ info_dict['cookies'] = '; '.join(values)
- return res
+ res = std_headers.copy()
+ res.update(info_dict.get('http_headers') or {})
+ res = self._remove_cookie_header(res)
+
+ if 'X-Forwarded-For' not in res:
+ x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
+ if x_forwarded_for_ip:
+ res['X-Forwarded-For'] = x_forwarded_for_ip
+
+ return res or None
def _calc_cookies(self, info_dict):
pr = sanitized_Request(info_dict['url'])
self.cookiejar.add_cookie_header(pr)
return pr.get_header('Cookie')
+ def _fill_common_fields(self, info_dict, final=True):
+
+ for ts_key, date_key in (
+ ('timestamp', 'upload_date'),
+ ('release_timestamp', 'release_date'),
+ ):
+ if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
+ # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+ # see http://bugs.python.org/issue1646728)
+ try:
+ upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
+ info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d'))
+ except (ValueError, OverflowError, OSError):
+ pass
+
+ # Auto generate title fields corresponding to the *_number fields when missing
+ # in order to always have clean titles. This is very common for TV series.
+ if final:
+ for field in ('chapter', 'season', 'episode'):
+ if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+ info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
@@ -1223,9 +1650,28 @@ class YoutubeDL(object):
if 'title' not in info_dict:
raise ExtractorError('Missing "title" field in extractor result')
- if not isinstance(info_dict['id'], compat_str):
- self.report_warning('"id" field is not a string - forcing string conversion')
- info_dict['id'] = compat_str(info_dict['id'])
+ def report_force_conversion(field, field_not, conversion):
+ self.report_warning(
+ '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
+ % (field, field_not, conversion))
+
+ def sanitize_string_field(info, string_field):
+ field = info.get(string_field)
+ if field is None or isinstance(field, compat_str):
+ return
+ report_force_conversion(string_field, 'a string', 'string')
+ info[string_field] = compat_str(field)
+
+ def sanitize_numeric_fields(info):
+ for numeric_field in self._NUMERIC_FIELDS:
+ field = info.get(numeric_field)
+ if field is None or isinstance(field, compat_numeric_types):
+ continue
+ report_force_conversion(numeric_field, 'numeric', 'int')
+ info[numeric_field] = int_or_none(field)
+
+ sanitize_string_field(info_dict, 'id')
+ sanitize_numeric_fields(info_dict)
if 'playlist' not in info_dict:
# It isn't part of a playlist
@@ -1239,8 +1685,10 @@ class YoutubeDL(object):
info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
if thumbnails:
thumbnails.sort(key=lambda t: (
- t.get('preference'), t.get('width'), t.get('height'),
- t.get('id'), t.get('url')))
+ t.get('preference') if t.get('preference') is not None else -1,
+ t.get('width') if t.get('width') is not None else -1,
+ t.get('height') if t.get('height') is not None else -1,
+ t.get('id') if t.get('id') is not None else '', t.get('url')))
for i, t in enumerate(thumbnails):
t['url'] = sanitize_url(t['url'])
if t.get('width') and t.get('height'):
@@ -1261,38 +1709,30 @@ class YoutubeDL(object):
if 'display_id' not in info_dict and 'id' in info_dict:
info_dict['display_id'] = info_dict['id']
- if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
- # Working around out-of-range timestamp values (e.g. negative ones on Windows,
- # see http://bugs.python.org/issue1646728)
- try:
- upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
- info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
- except (ValueError, OverflowError, OSError):
- pass
+ self._fill_common_fields(info_dict)
- # Auto generate title fields corresponding to the *_number fields when missing
- # in order to always have clean titles. This is very common for TV series.
- for field in ('chapter', 'season', 'episode'):
- if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
- info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+ for cc_kind in ('subtitles', 'automatic_captions'):
+ cc = info_dict.get(cc_kind)
+ if cc:
+ for _, subtitle in cc.items():
+ for subtitle_format in subtitle:
+ if subtitle_format.get('url'):
+ subtitle_format['url'] = sanitize_url(subtitle_format['url'])
+ if subtitle_format.get('ext') is None:
+ subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
+ automatic_captions = info_dict.get('automatic_captions')
subtitles = info_dict.get('subtitles')
- if subtitles:
- for _, subtitle in subtitles.items():
- for subtitle_format in subtitle:
- if subtitle_format.get('url'):
- subtitle_format['url'] = sanitize_url(subtitle_format['url'])
- if 'ext' not in subtitle_format:
- subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
if self.params.get('listsubtitles', False):
if 'automatic_captions' in info_dict:
- self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
+ self.list_subtitles(
+ info_dict['id'], automatic_captions, 'automatic captions')
self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
return
+
info_dict['requested_subtitles'] = self.process_subtitles(
- info_dict['id'], subtitles,
- info_dict.get('automatic_captions'))
+ info_dict['id'], subtitles, automatic_captions)
# We now pick which formats have to be downloaded
if info_dict.get('formats') is None:
@@ -1301,6 +1741,20 @@ class YoutubeDL(object):
else:
formats = info_dict['formats']
+ def is_wellformed(f):
+ url = f.get('url')
+ if not url:
+ self.report_warning(
+ '"url" field is missing or empty - skipping format, '
+ 'there is an error in extractor')
+ return False
+ if isinstance(url, bytes):
+ sanitize_string_field(f, 'url')
+ return True
+
+ # Filter out malformed formats for better extraction robustness
+ formats = list(filter(is_wellformed, formats or []))
+
if not formats:
raise ExtractorError('No video formats found!')
@@ -1308,16 +1762,14 @@ class YoutubeDL(object):
# We check that all the formats have the format and format_id fields
for i, format in enumerate(formats):
- if 'url' not in format:
- raise ExtractorError('Missing "url" key in result (index %d)' % i)
-
+ sanitize_string_field(format, 'format_id')
+ sanitize_numeric_fields(format)
format['url'] = sanitize_url(format['url'])
-
- if format.get('format_id') is None:
+ if not format.get('format_id'):
format['format_id'] = compat_str(i)
else:
# Sanitize format_id from characters used in format selector expression
- format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
+ format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
format_id = format['format_id']
if format_id not in formats_dict:
formats_dict[format_id] = []
@@ -1337,17 +1789,23 @@ class YoutubeDL(object):
note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
)
# Automatically determine file extension if missing
- if 'ext' not in format:
+ if format.get('ext') is None:
format['ext'] = determine_ext(format['url']).lower()
# Automatically determine protocol if missing (useful for format
# selection purposes)
- if 'protocol' not in format:
+ if format.get('protocol') is None:
format['protocol'] = determine_protocol(format)
# Add HTTP headers, so that external programs can use them from the
# json output
- full_format_info = info_dict.copy()
- full_format_info.update(format)
- format['http_headers'] = self._calc_headers(full_format_info)
+ format['http_headers'] = self._calc_headers(ChainMap(format, info_dict), load_cookies=True)
+
+ # Safeguard against old/insecure infojson when using --load-info-json
+ info_dict['http_headers'] = self._remove_cookie_header(
+ info_dict.get('http_headers') or {}) or None
+
+ # Remove private housekeeping stuff (copied to http_headers in _calc_headers())
+ if '__x_forwarded_for_ip' in info_dict:
+ del info_dict['__x_forwarded_for_ip']
# TODO Central sorting goes here
@@ -1363,16 +1821,39 @@ class YoutubeDL(object):
req_format = self.params.get('format')
if req_format is None:
- req_format_list = []
- if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
- not info_dict.get('is_live')):
- merger = FFmpegMergerPP(self)
- if merger.available and merger.can_merge():
- req_format_list.append('bestvideo+bestaudio')
- req_format_list.append('best')
- req_format = '/'.join(req_format_list)
+ req_format = self._default_format_spec(info_dict, download=download)
+ if self.params.get('verbose'):
+ self._write_string('[debug] Default format spec: %s\n' % req_format)
+
format_selector = self.build_format_selector(req_format)
- formats_to_download = list(format_selector(formats))
+
+ # While in format selection we may need to have an access to the original
+ # format set in order to calculate some metrics or do some processing.
+ # For now we need to be able to guess whether original formats provided
+ # by extractor are incomplete or not (i.e. whether extractor provides only
+ # video-only or audio-only formats) for proper formats selection for
+ # extractors with such incomplete formats (see
+ # https://github.com/ytdl-org/youtube-dl/pull/5556).
+ # Since formats may be filtered during format selection and may not match
+ # the original formats the results may be incorrect. Thus original formats
+ # or pre-calculated metrics should be passed to format selection routines
+ # as well.
+ # We will pass a context object containing all necessary additional data
+ # instead of just formats.
+ # This fixes incorrect format selection issue (see
+ # https://github.com/ytdl-org/youtube-dl/issues/10083).
+ incomplete_formats = (
+ # All formats are video-only or
+ all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
+ # all formats are audio-only
+ or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
+
+ ctx = {
+ 'formats': formats,
+ 'incomplete_formats': incomplete_formats,
+ }
+
+ formats_to_download = list(format_selector(ctx))
if not formats_to_download:
raise ExtractorError('requested format not available',
expected=True)
@@ -1437,19 +1918,48 @@ class YoutubeDL(object):
subs[lang] = f
return subs
+ def __forced_printings(self, info_dict, filename, incomplete):
+ def print_mandatory(field):
+ if (self.params.get('force%s' % field, False)
+ and (not incomplete or info_dict.get(field) is not None)):
+ self.to_stdout(info_dict[field])
+
+ def print_optional(field):
+ if (self.params.get('force%s' % field, False)
+ and info_dict.get(field) is not None):
+ self.to_stdout(info_dict[field])
+
+ print_mandatory('title')
+ print_mandatory('id')
+ if self.params.get('forceurl', False) and not incomplete:
+ if info_dict.get('requested_formats') is not None:
+ for f in info_dict['requested_formats']:
+ self.to_stdout(f['url'] + f.get('play_path', ''))
+ else:
+ # For RTMP URLs, also include the playpath
+ self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
+ print_optional('thumbnail')
+ print_optional('description')
+ if self.params.get('forcefilename', False) and filename is not None:
+ self.to_stdout(filename)
+ if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
+ self.to_stdout(formatSeconds(info_dict['duration']))
+ print_mandatory('format')
+ if self.params.get('forcejson', False):
+ self.to_stdout(json.dumps(self.sanitize_info(info_dict)))
+
+ @_catch_unsafe_file_extension
def process_info(self, info_dict):
"""Process a single resolved IE result."""
assert info_dict.get('_type', 'video') == 'video'
- max_downloads = self.params.get('max_downloads')
- if max_downloads is not None:
- if self._num_downloads >= int(max_downloads):
- raise MaxDownloadsReached()
+ max_downloads = int_or_none(self.params.get('max_downloads')) or float('inf')
+ if self._num_downloads >= max_downloads:
+ raise MaxDownloadsReached()
+ # TODO: backward compatibility, to be removed
info_dict['fulltitle'] = info_dict['title']
- if len(info_dict['title']) > 200:
- info_dict['title'] = info_dict['title'][:197] + '...'
if 'format' not in info_dict:
info_dict['format'] = info_dict['ext']
@@ -1464,29 +1974,7 @@ class YoutubeDL(object):
info_dict['_filename'] = filename = self.prepare_filename(info_dict)
# Forced printings
- if self.params.get('forcetitle', False):
- self.to_stdout(info_dict['fulltitle'])
- if self.params.get('forceid', False):
- self.to_stdout(info_dict['id'])
- if self.params.get('forceurl', False):
- if info_dict.get('requested_formats') is not None:
- for f in info_dict['requested_formats']:
- self.to_stdout(f['url'] + f.get('play_path', ''))
- else:
- # For RTMP URLs, also include the playpath
- self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
- if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
- self.to_stdout(info_dict['thumbnail'])
- if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
- self.to_stdout(info_dict['description'])
- if self.params.get('forcefilename', False) and filename is not None:
- self.to_stdout(filename)
- if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
- self.to_stdout(formatSeconds(info_dict['duration']))
- if self.params.get('forceformat', False):
- self.to_stdout(info_dict['format'])
- if self.params.get('forcejson', False):
- self.to_stdout(json.dumps(info_dict))
+ self.__forced_printings(info_dict, filename, incomplete=False)
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
@@ -1495,12 +1983,19 @@ class YoutubeDL(object):
if filename is None:
return
- try:
- dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
- if dn and not os.path.exists(dn):
- os.makedirs(dn)
- except (OSError, IOError) as err:
- self.report_error('unable to create directory ' + error_to_compat_str(err))
+ def ensure_dir_exists(path):
+ try:
+ dn = os.path.dirname(path)
+ if dn and not os.path.exists(dn):
+ os.makedirs(dn)
+ return True
+ except (OSError, IOError) as err:
+ if isinstance(err, OSError) and err.errno == errno.EEXIST:
+ return True
+ self.report_error('unable to create directory ' + error_to_compat_str(err))
+ return False
+
+ if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
return
if self.params.get('writedescription', False):
@@ -1512,7 +2007,7 @@ class YoutubeDL(object):
else:
try:
self.to_screen('[info] Writing video description to: ' + descfn)
- with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
+ with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
descfile.write(info_dict['description'])
except (OSError, IOError):
self.report_error('Cannot write description file ' + descfn)
@@ -1522,10 +2017,12 @@ class YoutubeDL(object):
annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
self.to_screen('[info] Video annotations are already present')
+ elif not info_dict.get('annotations'):
+ self.report_warning('There are no annotations to write.')
else:
try:
self.to_screen('[info] Writing video annotations to: ' + annofn)
- with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+ with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
annofile.write(info_dict['annotations'])
except (KeyError, TypeError):
self.report_warning('There are no annotations to write.')
@@ -1543,51 +2040,59 @@ class YoutubeDL(object):
ie = self.get_info_extractor(info_dict['extractor_key'])
for sub_lang, sub_info in subtitles.items():
sub_format = sub_info['ext']
- if sub_info.get('data') is not None:
- sub_data = sub_info['data']
+ sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
+ if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
+ self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
else:
- try:
- sub_data = ie._download_webpage(
- sub_info['url'], info_dict['id'], note=False)
- except ExtractorError as err:
- self.report_warning('Unable to download subtitle for "%s": %s' %
- (sub_lang, error_to_compat_str(err.cause)))
- continue
- try:
- sub_filename = subtitles_filename(filename, sub_lang, sub_format)
- if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
- self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
+ self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
+ if sub_info.get('data') is not None:
+ try:
+ # Use newline='' to prevent conversion of newline characters
+ # See https://github.com/ytdl-org/youtube-dl/issues/10268
+ with open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
+ subfile.write(sub_info['data'])
+ except (OSError, IOError):
+ self.report_error('Cannot write subtitles file ' + sub_filename)
+ return
else:
- self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
- with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
- subfile.write(sub_data)
- except (OSError, IOError):
- self.report_error('Cannot write subtitles file ' + sub_filename)
- return
-
- if self.params.get('writeinfojson', False):
- infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
- if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
- self.to_screen('[info] Video description metadata is already present')
- else:
- self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
- try:
- write_json_file(self.filter_requested_info(info_dict), infofn)
- except (OSError, IOError):
- self.report_error('Cannot write metadata to JSON file ' + infofn)
- return
+ try:
+ sub_data = ie._request_webpage(
+ sub_info['url'], info_dict['id'], note=False).read()
+ with open(encodeFilename(sub_filename), 'wb') as subfile:
+ subfile.write(sub_data)
+ except (ExtractorError, IOError, OSError, ValueError) as err:
+ self.report_warning('Unable to download subtitle for "%s": %s' %
+ (sub_lang, error_to_compat_str(err)))
+ continue
+
+ self._write_info_json(
+ 'video description', info_dict,
+ replace_extension(filename, 'info.json', info_dict.get('ext')))
self._write_thumbnails(info_dict, filename)
if not self.params.get('skip_download', False):
try:
+ def checked_get_suitable_downloader(info_dict, params):
+ ed_args = params.get('external_downloader_args')
+ dler = get_suitable_downloader(info_dict, params)
+ if ed_args and not params.get('external_downloader_args'):
+ # external_downloader_args was cleared because external_downloader was rejected
+ self.report_warning('Requested external downloader cannot be used: '
+ 'ignoring --external-downloader-args.')
+ return dler
+
def dl(name, info):
- fd = get_suitable_downloader(info, self.params)(self, self.params)
+ fd = checked_get_suitable_downloader(info, self.params)(self, self.params)
for ph in self._progress_hooks:
fd.add_progress_hook(ph)
if self.params.get('verbose'):
- self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
- return fd.download(name, info)
+ self.to_screen('[debug] Invoking downloader on %r' % info.get('url'))
+
+ new_info = dict((k, v) for k, v in info.items() if not k.startswith('__p'))
+ new_info['http_headers'] = self._calc_headers(new_info)
+
+ return fd.download(name, new_info)
if info_dict.get('requested_formats') is not None:
downloaded = []
@@ -1604,10 +2109,10 @@ class YoutubeDL(object):
def compatible_formats(formats):
video, audio = formats
# Check extension
- video_ext, audio_ext = audio.get('ext'), video.get('ext')
+ video_ext, audio_ext = video.get('ext'), audio.get('ext')
if video_ext and audio_ext:
COMPATIBLE_EXTS = (
- ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
+ ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
('webm')
)
for exts in COMPATIBLE_EXTS:
@@ -1616,18 +2121,26 @@ class YoutubeDL(object):
# TODO: Check acodec/vcodec
return False
- filename_real_ext = os.path.splitext(filename)[1][1:]
- filename_wo_ext = (
- os.path.splitext(filename)[0]
- if filename_real_ext == info_dict['ext']
- else filename)
+ exts = [info_dict['ext']]
requested_formats = info_dict['requested_formats']
if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
info_dict['ext'] = 'mkv'
self.report_warning(
'Requested formats are incompatible for merge and will be merged into mkv.')
+ exts.append(info_dict['ext'])
+
# Ensure filename always has a correct extension for successful merge
- filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
+ def correct_ext(filename, ext=exts[1]):
+ if filename == '-':
+ return filename
+ f_name, f_real_ext = os.path.splitext(filename)
+ f_real_ext = f_real_ext[1:]
+ filename_wo_ext = f_name if f_real_ext in exts else filename
+ if ext is None:
+ ext = f_real_ext or None
+ return join_nonempty(filename_wo_ext, ext, delim='.')
+
+ filename = correct_ext(filename)
if os.path.exists(encodeFilename(filename)):
self.to_screen(
'[download] %s has already been downloaded and '
@@ -1636,8 +2149,12 @@ class YoutubeDL(object):
for f in requested_formats:
new_info = dict(info_dict)
new_info.update(f)
- fname = self.prepare_filename(new_info)
- fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
+ fname = prepend_extension(
+ correct_ext(
+ self.prepare_filename(new_info), new_info['ext']),
+ 'f%s' % (f['format_id'],), new_info['ext'])
+ if not ensure_dir_exists(fname):
+ return
downloaded.append(fname)
partial_success = dl(fname, new_info)
success = success and partial_success
@@ -1680,8 +2197,8 @@ class YoutubeDL(object):
else:
assert fixup_policy in ('ignore', 'never')
- if (info_dict.get('requested_formats') is None and
- info_dict.get('container') == 'm4a_dash'):
+ if (info_dict.get('requested_formats') is None
+ and info_dict.get('container') == 'm4a_dash'):
if fixup_policy == 'warn':
self.report_warning(
'%s: writing DASH m4a. '
@@ -1700,11 +2217,11 @@ class YoutubeDL(object):
else:
assert fixup_policy in ('ignore', 'never')
- if (info_dict.get('protocol') == 'm3u8_native' or
- info_dict.get('protocol') == 'm3u8' and
- self.params.get('hls_prefer_native')):
+ if (info_dict.get('protocol') == 'm3u8_native'
+ or info_dict.get('protocol') == 'm3u8'
+ and self.params.get('hls_prefer_native')):
if fixup_policy == 'warn':
- self.report_warning('%s: malformated aac bitstream.' % (
+ self.report_warning('%s: malformed AAC bitstream detected.' % (
info_dict['id']))
elif fixup_policy == 'detect_or_warn':
fixup_pp = FFmpegFixupM3u8PP(self)
@@ -1713,7 +2230,7 @@ class YoutubeDL(object):
info_dict['__postprocessors'].append(fixup_pp)
else:
self.report_warning(
- '%s: malformated aac bitstream. %s'
+ '%s: malformed AAC bitstream detected. %s'
% (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
else:
assert fixup_policy in ('ignore', 'never')
@@ -1721,16 +2238,20 @@ class YoutubeDL(object):
try:
self.post_process(filename, info_dict)
except (PostProcessingError) as err:
- self.report_error('postprocessing: %s' % str(err))
+ self.report_error('postprocessing: %s' % error_to_compat_str(err))
return
self.record_download_archive(info_dict)
+ # avoid possible nugatory search for further items (PR #26638)
+ if self._num_downloads >= max_downloads:
+ raise MaxDownloadsReached()
def download(self, url_list):
"""Download a given list of URLs."""
outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
- if (len(url_list) > 1 and
- '%' not in outtmpl and
- self.params.get('max_downloads') != 1):
+ if (len(url_list) > 1
+ and outtmpl != '-'
+ and '%' not in outtmpl
+ and self.params.get('max_downloads') != 1):
raise SameFileError(outtmpl)
for url in url_list:
@@ -1745,16 +2266,13 @@ class YoutubeDL(object):
raise
else:
if self.params.get('dump_single_json', False):
- self.to_stdout(json.dumps(res))
+ self.to_stdout(json.dumps(self.sanitize_info(res)))
return self._download_retcode
def download_with_info_file(self, info_filename):
- with contextlib.closing(fileinput.FileInput(
- [info_filename], mode='r',
- openhook=fileinput.hook_encoded('utf-8'))) as f:
- # FileInput doesn't have a read method, we can't call json.load
- info = self.filter_requested_info(json.loads('\n'.join(f)))
+ with open(info_filename, encoding='utf-8') as f:
+ info = self.filter_requested_info(json.load(f))
try:
self.process_ie_result(info, download=True)
except DownloadError:
@@ -1767,10 +2285,36 @@ class YoutubeDL(object):
return self._download_retcode
@staticmethod
- def filter_requested_info(info_dict):
- return dict(
- (k, v) for k, v in info_dict.items()
- if k not in ['requested_formats', 'requested_subtitles'])
+ def sanitize_info(info_dict, remove_private_keys=False):
+ ''' Sanitize the infodict for converting to json '''
+ if info_dict is None:
+ return info_dict
+
+ if remove_private_keys:
+ reject = lambda k, v: (v is None
+ or k.startswith('__')
+ or k in ('requested_formats',
+ 'requested_subtitles'))
+ else:
+ reject = lambda k, v: False
+
+ def filter_fn(obj):
+ if isinstance(obj, dict):
+ return dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v))
+ elif isinstance(obj, (list, tuple, set, LazyList)):
+ return list(map(filter_fn, obj))
+ elif obj is None or any(isinstance(obj, c)
+ for c in (compat_integer_types,
+ (compat_str, float, bool))):
+ return obj
+ else:
+ return repr(obj)
+
+ return filter_fn(info_dict)
+
+ @classmethod
+ def filter_requested_info(cls, info_dict):
+ return cls.sanitize_info(info_dict, True)
def post_process(self, filename, ie_info):
"""Run all the postprocessors on the given file."""
@@ -1795,15 +2339,24 @@ class YoutubeDL(object):
self.report_warning('Unable to remove downloaded original file')
def _make_archive_id(self, info_dict):
+ video_id = info_dict.get('id')
+ if not video_id:
+ return
# Future-proof against any change in case
# and backwards compatibility with prior versions
- extractor = info_dict.get('extractor_key')
+ extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist
if extractor is None:
- if 'id' in info_dict:
- extractor = info_dict.get('ie_key') # key in a playlist
- if extractor is None:
- return None # Incomplete video information
- return extractor.lower() + ' ' + info_dict['id']
+ url = str_or_none(info_dict.get('url'))
+ if not url:
+ return
+ # Try to find matching extractor for the URL and take its ie_key
+ for ie in self._ies:
+ if ie.suitable(url):
+ extractor = ie.ie_key()
+ break
+ else:
+ return
+ return extractor.lower() + ' ' + video_id
def in_download_archive(self, info_dict):
fn = self.params.get('download_archive')
@@ -1811,7 +2364,7 @@ class YoutubeDL(object):
return False
vid_id = self._make_archive_id(info_dict)
- if vid_id is None:
+ if not vid_id:
return False # Incomplete video information
try:
@@ -1866,8 +2419,8 @@ class YoutubeDL(object):
if res:
res += ', '
res += '%s container' % fdict['container']
- if (fdict.get('vcodec') is not None and
- fdict.get('vcodec') != 'none'):
+ if (fdict.get('vcodec') is not None
+ and fdict.get('vcodec') != 'none'):
if res:
res += ', '
res += fdict['vcodec']
@@ -1954,7 +2507,7 @@ class YoutubeDL(object):
return
if type('') is not compat_str:
- # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
+ # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
self.report_warning(
'Your Python is broken! Update to a newer and supported version')
@@ -1968,28 +2521,53 @@ class YoutubeDL(object):
self.get_encoding()))
write_string(encoding_str, encoding=None)
- self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
+ writeln_debug = lambda *s: self.write_debug(''.join(s))
+ writeln_debug('youtube-dl version ', __version__)
if _LAZY_LOADER:
- self._write_string('[debug] Lazy loading extractors enabled' + '\n')
+ writeln_debug('Lazy loading extractors enabled')
+ if ytdl_is_updateable():
+ writeln_debug('Single file build')
try:
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
cwd=os.path.dirname(os.path.abspath(__file__)))
- out, err = sp.communicate()
+ out, err = process_communicate_or_kill(sp)
out = out.decode().strip()
if re.match('[0-9a-f]+', out):
- self._write_string('[debug] Git HEAD: ' + out + '\n')
+ writeln_debug('Git HEAD: ', out)
except Exception:
try:
sys.exc_clear()
except Exception:
pass
- self._write_string('[debug] Python version %s - %s\n' % (
- platform.python_version(), platform_name()))
+
+ def python_implementation():
+ impl_name = platform.python_implementation()
+ if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
+ return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
+ return impl_name
+
+ def libc_ver():
+ try:
+ return platform.libc_ver()
+ except OSError: # We may not have access to the executable
+ return []
+
+ libc = join_nonempty(*libc_ver(), delim=' ')
+ writeln_debug('Python %s (%s %s %s) - %s - %s%s' % (
+ platform.python_version(),
+ python_implementation(),
+ platform.machine(),
+ platform.architecture()[0],
+ platform_name(),
+ OPENSSL_VERSION,
+ (' - %s' % (libc, )) if libc else ''
+ ))
exe_versions = FFmpegPostProcessor.get_versions(self)
exe_versions['rtmpdump'] = rtmpdump_version()
+ exe_versions['phantomjs'] = PhantomJSwrapper._version()
exe_str = ', '.join(
'%s %s' % (exe, v)
for exe, v in sorted(exe_versions.items())
@@ -1997,17 +2575,17 @@ class YoutubeDL(object):
)
if not exe_str:
exe_str = 'none'
- self._write_string('[debug] exe versions: %s\n' % exe_str)
+ writeln_debug('exe versions: %s' % (exe_str, ))
proxy_map = {}
for handler in self._opener.handlers:
if hasattr(handler, 'proxies'):
proxy_map.update(handler.proxies)
- self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
+ writeln_debug('Proxy map: ', compat_str(proxy_map))
if self.params.get('call_home', False):
ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
- self._write_string('[debug] Public IP address: %s\n' % ipaddr)
+ writeln_debug('Public IP address: %s' % (ipaddr, ))
latest_version = self.urlopen(
'https://yt-dl.org/latest/version').read().decode('utf-8')
if version_tuple(latest_version) > version_tuple(__version__):
@@ -2024,13 +2602,12 @@ class YoutubeDL(object):
opts_proxy = self.params.get('proxy')
if opts_cookiefile is None:
- self.cookiejar = compat_cookiejar.CookieJar()
+ self.cookiejar = YoutubeDLCookieJar()
else:
- opts_cookiefile = compat_expanduser(opts_cookiefile)
- self.cookiejar = compat_cookiejar.MozillaCookieJar(
- opts_cookiefile)
+ opts_cookiefile = expand_path(opts_cookiefile)
+ self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
if os.access(opts_cookiefile, os.R_OK):
- self.cookiejar.load()
+ self.cookiejar.load(ignore_discard=True, ignore_expires=True)
cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
if opts_proxy is not None:
@@ -2040,7 +2617,7 @@ class YoutubeDL(object):
proxies = {'http': opts_proxy, 'https': opts_proxy}
else:
proxies = compat_urllib_request.getproxies()
- # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
+ # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
proxy_handler = PerRequestProxyHandler(proxies)
@@ -2048,12 +2625,13 @@ class YoutubeDL(object):
debuglevel = 1 if self.params.get('debug_printtraffic') else 0
https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
+ redirect_handler = YoutubeDLRedirectHandler()
data_handler = compat_urllib_request_DataHandler()
# When passing our own FileHandler instance, build_opener won't add the
# default FileHandler and allows us to disable the file protocol, which
# can be used for malicious purposes (see
- # https://github.com/rg3/youtube-dl/issues/8227)
+ # https://github.com/ytdl-org/youtube-dl/issues/8227)
file_handler = compat_urllib_request.FileHandler()
def file_open(*args, **kwargs):
@@ -2061,11 +2639,11 @@ class YoutubeDL(object):
file_handler.file_open = file_open
opener = compat_urllib_request.build_opener(
- proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
+ proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
# Delete the default user-agent header, which would otherwise apply in
# cases where our custom HTTP handler doesn't come into play
- # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
+ # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
opener.addheaders = []
self._opener = opener
@@ -2085,6 +2663,28 @@ class YoutubeDL(object):
encoding = preferredencoding()
return encoding
+ def _write_info_json(self, label, info_dict, infofn, overwrite=None):
+ if not self.params.get('writeinfojson', False):
+ return False
+
+ def msg(fmt, lbl):
+ return fmt % (lbl + ' metadata',)
+
+ if overwrite is None:
+ overwrite = not self.params.get('nooverwrites', False)
+
+ if not overwrite and os.path.exists(encodeFilename(infofn)):
+ self.to_screen(msg('[info] %s is already present', label.title()))
+ return 'exists'
+ else:
+ self.to_screen(msg('[info] Writing %s as JSON to: ', label) + infofn)
+ try:
+ write_json_file(self.filter_requested_info(info_dict), infofn)
+ return True
+ except (OSError, IOError):
+ self.report_error(msg('Cannot write %s to JSON file ', label) + infofn)
+ return
+
def _write_thumbnails(self, info_dict, filename):
if self.params.get('writethumbnail', False):
thumbnails = info_dict.get('thumbnails')
@@ -2103,7 +2703,7 @@ class YoutubeDL(object):
thumb_ext = determine_ext(t['url'], 'jpg')
suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
- t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+ t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext'))
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
self.to_screen('[%s] %s: Thumbnail %sis already present' %