diff options
Diffstat (limited to 'youtube_dl/downloader')
-rw-r--r-- | youtube_dl/downloader/common.py | 8 | ||||
-rw-r--r-- | youtube_dl/downloader/dash.py | 4 | ||||
-rw-r--r-- | youtube_dl/downloader/external.py | 45 | ||||
-rw-r--r-- | youtube_dl/downloader/f4m.py | 147 | ||||
-rw-r--r-- | youtube_dl/downloader/fragment.py | 111 | ||||
-rw-r--r-- | youtube_dl/downloader/hls.py | 102 | ||||
-rw-r--r-- | youtube_dl/downloader/http.py | 29 | ||||
-rw-r--r-- | youtube_dl/downloader/rtmp.py | 4 |
8 files changed, 279 insertions, 171 deletions
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 97e755d4b..beae8c4d0 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,9 +5,9 @@ import re import sys import time -from ..compat import compat_str from ..utils import ( encodeFilename, + error_to_compat_str, decodeArgument, format_bytes, timeconvert, @@ -42,7 +42,7 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - (experimenatal) + (experimental) external_downloader_args: A list of additional command-line arguments for the external downloader. @@ -186,7 +186,7 @@ class FileDownloader(object): return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) except (IOError, OSError) as err: - self.report_error('unable to rename file: %s' % compat_str(err)) + self.report_error('unable to rename file: %s' % error_to_compat_str(err)) def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" @@ -325,7 +325,7 @@ class FileDownloader(object): ) # Check file already present - if filename != '-' and nooverwrites_and_exists or continuedl_and_exists: + if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): self.report_file_already_downloaded(filename) self._hook_progress({ 'filename': filename, diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8b6fa2753..535f2a7fc 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import FileDownloader -from ..compat import compat_urllib_request +from ..utils import sanitized_Request class DashSegmentsFD(FileDownloader): @@ -22,7 +22,7 @@ class DashSegmentsFD(FileDownloader): def append_url_to_file(outf, target_url, target_name, remaining_bytes=None): self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) - req = compat_urllib_request.Request(target_url) + req = sanitized_Request(target_url) if remaining_bytes is not None: req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 1d5cc9904..2bc011266 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -5,6 +5,10 @@ import subprocess from .common import FileDownloader from ..utils import ( + cli_option, + cli_valueless_option, + cli_bool_option, + cli_configuration_args, encodeFilename, encodeArgument, ) @@ -45,18 +49,17 @@ class ExternalFD(FileDownloader): def supports(cls, info_dict): return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') - def _source_address(self, command_option): - source_address = self.params.get('source_address') - if source_address is None: - return [] - return [command_option, source_address] + def _option(self, command_option, param): + return cli_option(self.params, command_option, param) + + def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): + return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) + + def _valueless_option(self, command_option, param, expected_value=True): + return cli_valueless_option(self.params, command_option, param, expected_value) def _configuration_args(self, default=[]): - ex_args = self.params.get('external_downloader_args') - if ex_args is None: - return default - assert isinstance(ex_args, list) - return ex_args + return cli_configuration_args(self.params, 'external_downloader_args', default) def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ @@ -77,7 +80,19 @@ class CurlFD(ExternalFD): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--interface') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--insecure', 'nocheckcertificate') + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + +class AxelFD(ExternalFD): + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-o', tmpfilename] + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -88,7 +103,9 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--bind-address') + cmd += self._option('--bind-address', 'source_address') + cmd += self._option('--proxy', 'proxy') + cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -105,7 +122,9 @@ class Aria2cFD(ExternalFD): cmd += ['--out', os.path.basename(tmpfilename)] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--interface') + cmd += self._option('--interface', 'source_address') + cmd += self._option('--all-proxy', 'proxy') + cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += ['--', info_dict['url']] return cmd diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b1a858c45..aaf0c49c8 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -5,19 +5,20 @@ import io import itertools import os import time -import xml.etree.ElementTree as etree -from .common import FileDownloader -from .http import HttpFD +from .fragment import FragmentFD from ..compat import ( + compat_etree_fromstring, compat_urlparse, compat_urllib_error, + compat_urllib_parse_urlparse, ) from ..utils import ( - struct_pack, - struct_unpack, encodeFilename, + fix_xml_ampersands, sanitize_open, + struct_pack, + struct_unpack, xpath_text, ) @@ -226,16 +227,13 @@ def _add_ns(prop): return '{http://ns.adobe.com/f4m/1.0}%s' % prop -class HttpQuietDownloader(HttpFD): - def to_screen(self, *args, **kargs): - pass - - -class F4mFD(FileDownloader): +class F4mFD(FragmentFD): """ A downloader for f4m manifests or AdobeHDS. """ + FD_NAME = 'f4m' + def _get_unencrypted_media(self, doc): media = doc.findall(_add_ns('media')) if not media: @@ -288,10 +286,15 @@ class F4mFD(FileDownloader): def real_download(self, filename, info_dict): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') - self.to_screen('[download] Downloading f4m manifest') - manifest = self.ydl.urlopen(man_url).read() - - doc = etree.fromstring(manifest) + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) + urlh = self.ydl.urlopen(man_url) + man_url = urlh.geturl() + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244 + # and https://github.com/rg3/youtube-dl/issues/7823) + manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() + + doc = compat_etree_fromstring(manifest) formats = [(int(f.attrib.get('bitrate', -1)), f) for f in self._get_unencrypted_media(doc)] if requested_bitrate is None: @@ -320,94 +323,53 @@ class F4mFD(FileDownloader): # For some akamai manifests we'll need to add a query to the fragment url akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) - self.report_destination(filename) - http_dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': True, - 'quiet': True, - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit', None), - 'test': self.params.get('test', False), - } - ) - tmpfilename = self.temp_name(filename) - (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') + ctx = { + 'filename': filename, + 'total_frags': total_frags, + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] write_flv_header(dest_stream) if not live: write_metadata_tag(dest_stream, metadata) - # This dict stores the download progress, it's updated by the progress - # hook - state = { - 'status': 'downloading', - 'downloaded_bytes': 0, - 'frag_index': 0, - 'frag_count': total_frags, - 'filename': filename, - 'tmpfilename': tmpfilename, - } - start = time.time() - - def frag_progress_hook(s): - if s['status'] not in ('downloading', 'finished'): - return - - frag_total_bytes = s.get('total_bytes', 0) - if s['status'] == 'finished': - state['downloaded_bytes'] += frag_total_bytes - state['frag_index'] += 1 - - estimated_size = ( - (state['downloaded_bytes'] + frag_total_bytes) / - (state['frag_index'] + 1) * total_frags) - time_now = time.time() - state['total_bytes_estimate'] = estimated_size - state['elapsed'] = time_now - start - - if s['status'] == 'finished': - progress = self.calc_percent(state['frag_index'], total_frags) - else: - frag_downloaded_bytes = s['downloaded_bytes'] - frag_progress = self.calc_percent(frag_downloaded_bytes, - frag_total_bytes) - progress = self.calc_percent(state['frag_index'], total_frags) - progress += frag_progress / float(total_frags) - - state['eta'] = self.calc_eta( - start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes) - state['speed'] = s.get('speed') - self._hook_progress(state) + base_url_parsed = compat_urllib_parse_urlparse(base_url) - http_dl.add_progress_hook(frag_progress_hook) + self._start_frag_download(ctx) frags_filenames = [] while fragments_list: seg_i, frag_i = fragments_list.pop(0) name = 'Seg%d-Frag%d' % (seg_i, frag_i) - url = base_url + name + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) if akamai_pv: - url += '?' + akamai_pv.strip(';') + query.append(akamai_pv.strip(';')) if info_dict.get('extra_param_to_segment_url'): - url += info_dict.get('extra_param_to_segment_url') - frag_filename = '%s-%s' % (tmpfilename, name) + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) + frag_filename = '%s-%s' % (ctx['tmpfilename'], name) try: - success = http_dl.download(frag_filename, {'url': url}) + success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()}) if not success: return False - with open(frag_filename, 'rb') as down: - down_data = down.read() - reader = FlvReader(down_data) - while True: - _, box_type, box_data = reader.read_box_info() - if box_type == b'mdat': - dest_stream.write(box_data) - break + (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') + down_data = down.read() + down.close() + reader = FlvReader(down_data) + while True: + _, box_type, box_data = reader.read_box_info() + if box_type == b'mdat': + dest_stream.write(box_data) + break if live: - os.remove(frag_filename) + os.remove(encodeFilename(frag_sanitized)) else: - frags_filenames.append(frag_filename) + frags_filenames.append(frag_sanitized) except (compat_urllib_error.HTTPError, ) as err: if live and (err.code == 404 or err.code == 410): # We didn't keep up with the live window. Continue @@ -425,20 +387,9 @@ class F4mFD(FileDownloader): msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) self.report_warning(msg) - dest_stream.close() + self._finish_frag_download(ctx) - elapsed = time.time() - start - self.try_rename(tmpfilename, filename) for frag_file in frags_filenames: - os.remove(frag_file) - - fsize = os.path.getsize(encodeFilename(filename)) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - 'elapsed': elapsed, - }) + os.remove(encodeFilename(frag_file)) return True diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py new file mode 100644 index 000000000..5a64b29ee --- /dev/null +++ b/youtube_dl/downloader/fragment.py @@ -0,0 +1,111 @@ +from __future__ import division, unicode_literals + +import os +import time + +from .common import FileDownloader +from .http import HttpFD +from ..utils import ( + encodeFilename, + sanitize_open, +) + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + +class FragmentFD(FileDownloader): + """ + A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + """ + + def _prepare_and_start_frag_download(self, ctx): + self._prepare_frag_download(ctx) + self._start_frag_download(ctx) + + def _prepare_frag_download(self, ctx): + self.to_screen('[%s] Total fragments: %d' % (self.FD_NAME, ctx['total_frags'])) + self.report_destination(ctx['filename']) + dl = HttpQuietDownloader( + self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'ratelimit': self.params.get('ratelimit', None), + 'retries': self.params.get('retries', 0), + 'test': self.params.get('test', False), + } + ) + tmpfilename = self.temp_name(ctx['filename']) + dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb') + ctx.update({ + 'dl': dl, + 'dest_stream': dest_stream, + 'tmpfilename': tmpfilename, + }) + + def _start_frag_download(self, ctx): + total_frags = ctx['total_frags'] + # This dict stores the download progress, it's updated by the progress + # hook + state = { + 'status': 'downloading', + 'downloaded_bytes': 0, + 'frag_index': 0, + 'frag_count': total_frags, + 'filename': ctx['filename'], + 'tmpfilename': ctx['tmpfilename'], + } + start = time.time() + ctx['started'] = start + + def frag_progress_hook(s): + if s['status'] not in ('downloading', 'finished'): + return + + frag_total_bytes = s.get('total_bytes', 0) + if s['status'] == 'finished': + state['downloaded_bytes'] += frag_total_bytes + state['frag_index'] += 1 + + estimated_size = ( + (state['downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) + time_now = time.time() + state['total_bytes_estimate'] = estimated_size + state['elapsed'] = time_now - start + + if s['status'] == 'finished': + progress = self.calc_percent(state['frag_index'], total_frags) + else: + frag_downloaded_bytes = s['downloaded_bytes'] + frag_progress = self.calc_percent(frag_downloaded_bytes, + frag_total_bytes) + progress = self.calc_percent(state['frag_index'], total_frags) + progress += frag_progress / float(total_frags) + + state['eta'] = self.calc_eta( + start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes) + state['speed'] = s.get('speed') + self._hook_progress(state) + + ctx['dl'].add_progress_hook(frag_progress_hook) + + return start + + def _finish_frag_download(self, ctx): + ctx['dest_stream'].close() + elapsed = time.time() - ctx['started'] + self.try_rename(ctx['tmpfilename'], ctx['filename']) + fsize = os.path.getsize(encodeFilename(ctx['filename'])) + + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': ctx['filename'], + 'status': 'finished', + 'elapsed': elapsed, + }) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 8be4f4249..b5a3e1167 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,15 +4,16 @@ import os import re import subprocess -from ..postprocessor.ffmpeg import FFmpegPostProcessor from .common import FileDownloader -from ..compat import ( - compat_urlparse, - compat_urllib_request, -) +from .fragment import FragmentFD + +from ..compat import compat_urlparse +from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( encodeArgument, encodeFilename, + sanitize_open, + handle_youtubedl_headers, ) @@ -28,10 +29,22 @@ class HlsFD(FileDownloader): return False ffpp.check_version() - args = [ - encodeArgument(opt) - for opt in (ffpp.executable, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')] - args.append(encodeFilename(tmpfilename, True)) + args = [ffpp.executable, '-y'] + + if info_dict['http_headers'] and re.match(r'^https?://', url): + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + + args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + + self._debug_cmd(args) retval = subprocess.call(args) if retval == 0: @@ -51,54 +64,51 @@ class HlsFD(FileDownloader): return False -class NativeHlsFD(FileDownloader): +class NativeHlsFD(FragmentFD): """ A more limited implementation that does not require ffmpeg """ + FD_NAME = 'hlsnative' + def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) + man_url = info_dict['url'] + self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + manifest = self.ydl.urlopen(man_url).read() - self.to_screen( - '[hlsnative] %s: Downloading m3u8 manifest' % info_dict['id']) - data = self.ydl.urlopen(url).read() - s = data.decode('utf-8', 'ignore') - segment_urls = [] + s = manifest.decode('utf-8', 'ignore') + fragment_urls = [] for line in s.splitlines(): line = line.strip() if line and not line.startswith('#'): segment_url = ( line if re.match(r'^https?://', line) - else compat_urlparse.urljoin(url, line)) - segment_urls.append(segment_url) - - is_test = self.params.get('test', False) - remaining_bytes = self._TEST_FILE_SIZE if is_test else None - byte_counter = 0 - with open(tmpfilename, 'wb') as outf: - for i, segurl in enumerate(segment_urls): - self.to_screen( - '[hlsnative] %s: Downloading segment %d / %d' % - (info_dict['id'], i + 1, len(segment_urls))) - seg_req = compat_urllib_request.Request(segurl) - if remaining_bytes is not None: - seg_req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) - - segment = self.ydl.urlopen(seg_req).read() - if remaining_bytes is not None: - segment = segment[:remaining_bytes] - remaining_bytes -= len(segment) - outf.write(segment) - byte_counter += len(segment) - if remaining_bytes is not None and remaining_bytes <= 0: + else compat_urlparse.urljoin(man_url, line)) + fragment_urls.append(segment_url) + # We only download the first fragment during the test + if self.params.get('test', False): break - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, + ctx = { 'filename': filename, - 'status': 'finished', - }) - self.try_rename(tmpfilename, filename) + 'total_frags': len(fragment_urls), + } + + self._prepare_and_start_frag_download(ctx) + + frags_filenames = [] + for i, frag_url in enumerate(fragment_urls): + frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + frags_filenames.append(frag_sanitized) + + self._finish_frag_download(ctx) + + for frag_file in frags_filenames: + os.remove(encodeFilename(frag_file)) + return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b7f144af9..56840e026 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -4,16 +4,15 @@ import errno import os import socket import time +import re from .common import FileDownloader -from ..compat import ( - compat_urllib_request, - compat_urllib_error, -) +from ..compat import compat_urllib_error from ..utils import ( ContentTooShortError, encodeFilename, sanitize_open, + sanitized_Request, ) @@ -28,8 +27,8 @@ class HttpFD(FileDownloader): add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) - basic_request = compat_urllib_request.Request(url, None, headers) - request = compat_urllib_request.Request(url, None, headers) + basic_request = sanitized_Request(url, None, headers) + request = sanitized_Request(url, None, headers) is_test = self.params.get('test', False) @@ -57,6 +56,24 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. This is due to a webservers + # that don't support resuming and serve a whole file with no Content-Range + # set in response despite of requested Range (see + # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) + if resume_len > 0: + content_range = data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-', content_range) + # Content-Range is present and matches requested Range, resume is possible + if content_range_m and resume_len == int(content_range_m.group(1)): + break + # Content-Range is either not present or invalid. Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload + self.report_unable_to_resume() + resume_len = 0 + open_mode = 'wb' break except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 7d19bb808..14d56db47 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -105,7 +105,7 @@ class RtmpFD(FileDownloader): protocol = info_dict.get('rtmp_protocol', None) real_time = info_dict.get('rtmp_real_time', False) no_resume = info_dict.get('no_resume', False) - continue_dl = info_dict.get('continuedl', True) + continue_dl = self.params.get('continuedl', True) self.report_destination(filename) tmpfilename = self.temp_name(filename) @@ -117,7 +117,7 @@ class RtmpFD(FileDownloader): return False # Download using rtmpdump. rtmpdump returns exit code 2 when - # the connection was interrumpted and resuming appears to be + # the connection was interrupted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. basic_args = [ 'rtmpdump', '--verbose', '-r', url, |