diff options
Diffstat (limited to 'youtube_dl')
38 files changed, 2095 insertions, 936 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 702a6ad50..cad6b026e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -21,24 +21,24 @@ import subprocess import socket import sys import time +import tokenize import traceback if os.name == 'nt': import ctypes from .compat import ( - compat_basestring, compat_cookiejar, compat_expanduser, compat_get_terminal_size, compat_http_client, compat_kwargs, compat_str, + compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, ) from .utils import ( - escape_url, ContentTooShortError, date_from_str, DateRange, @@ -49,7 +49,6 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, - HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -853,8 +852,8 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) - def _apply_format_filter(self, format_spec, available_formats): - " Returns a tuple of the remaining format_spec and filtered formats " + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " OPERATORS = { '<': operator.lt, @@ -864,13 +863,13 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - operator_rex = re.compile(r'''(?x)\s*\[ + operator_rex = re.compile(r'''(?x)\s* (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) - \]$ + $ ''' % '|'.join(map(re.escape, OPERATORS.keys()))) - m = operator_rex.search(format_spec) + m = operator_rex.search(filter_spec) if m: try: comparison_value = int(m.group('value')) @@ -881,7 +880,7 @@ class YoutubeDL(object): if comparison_value is None: raise ValueError( 'Invalid value %r in format specification %r' % ( - m.group('value'), format_spec)) + m.group('value'), filter_spec)) op = OPERATORS[m.group('op')] if not m: @@ -889,85 +888,283 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - str_operator_rex = re.compile(r'''(?x)\s*\[ + str_operator_rex = re.compile(r'''(?x) \s*(?P<key>ext|acodec|vcodec|container|protocol) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9_-]+) - \s*\]$ + \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) - m = str_operator_rex.search(format_spec) + m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') op = STR_OPERATORS[m.group('op')] if not m: - raise ValueError('Invalid format specification %r' % format_spec) + raise ValueError('Invalid filter specification %r' % filter_spec) def _filter(f): actual_value = f.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) - new_formats = [f for f in available_formats if _filter(f)] + return _filter + + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) + + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + GROUP = 'GROUP' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) + + def _parse_filter(tokens): + filter_parts = [] + for type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) + + def _remove_unused_ops(tokens): + # Remove operators that we don't use and join them with the sourrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break + elif inside_merge and string in ['/', ',']: + tokens.restore_last_token() + break + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) + selectors.append(current_selector) + current_selector = None + elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) + first_choice = current_selector + second_choice = _parse_format_selection(tokens, inside_choice=True) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) + elif string == '+': + video_selector = current_selector + audio_selector = _parse_format_selection(tokens, inside_merge=True) + if not video_selector or not audio_selector: + raise syntax_error('"+" must be between two format selectors', start) + current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _build_selector_function(selector): + if isinstance(selector, list): + fs = [_build_selector_function(s) for s in selector] + + def selector_function(formats): + for f in fs: + for format in f(formats): + yield format + return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) + elif selector.type == PICKFIRST: + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(formats): + for f in fs: + picked_formats = list(f(formats)) + if picked_formats: + return picked_formats + return [] + elif selector.type == SINGLE: + format_spec = selector.selector + + def selector_function(formats): + formats = list(formats) + if not formats: + return + if format_spec == 'all': + for f in formats: + yield f + elif format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 + audiovideo_formats = [ + f for f in formats + if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] + if audiovideo_formats: + yield audiovideo_formats[format_idx] + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in formats) or + all(f.get('vcodec') != 'none' for f in formats)): + yield formats[format_idx] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[0] + else: + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, formats)) + if matches: + yield matches[-1] + elif selector.type == MERGE: + def _merge(formats_info): + format_1, format_2 = [f['format_id'] for f in formats_info] + # The first format must contain the video and the + # second the audio + if formats_info[0].get('vcodec') == 'none': + self.report_error('The first format must ' + 'contain the video, try using ' + '"-f %s+%s"' % (format_2, format_1)) + return + output_ext = ( + formats_info[0]['ext'] + if self.params.get('merge_output_format') is None + else self.params['merge_output_format']) + return { + 'requested_formats': formats_info, + 'format': '%s+%s' % (formats_info[0].get('format'), + formats_info[1].get('format')), + 'format_id': '%s+%s' % (formats_info[0].get('format_id'), + formats_info[1].get('format_id')), + 'width': formats_info[0].get('width'), + 'height': formats_info[0].get('height'), + 'resolution': formats_info[0].get('resolution'), + 'fps': formats_info[0].get('fps'), + 'vcodec': formats_info[0].get('vcodec'), + 'vbr': formats_info[0].get('vbr'), + 'stretched_ratio': formats_info[0].get('stretched_ratio'), + 'acodec': formats_info[1].get('acodec'), + 'abr': formats_info[1].get('abr'), + 'ext': output_ext, + } + video_selector, audio_selector = map(_build_selector_function, selector.selector) - new_format_spec = format_spec[:-len(m.group(0))] - if not new_format_spec: - new_format_spec = 'best' + def selector_function(formats): + formats = list(formats) + for pair in itertools.product(video_selector(formats), audio_selector(formats)): + yield _merge(pair) - return (new_format_spec, new_formats) + filters = [self._build_format_filter(f) for f in selector.filters] - def select_format(self, format_spec, available_formats): - while format_spec.endswith(']'): - format_spec, available_formats = self._apply_format_filter( - format_spec, available_formats) - if not available_formats: - return None + def final_selector(formats): + for _filter in filters: + formats = list(filter(_filter, formats)) + return selector_function(formats) + return final_selector - if format_spec in ['best', 'worst', None]: - format_idx = 0 if format_spec == 'worst' else -1 - audiovideo_formats = [ - f for f in available_formats - if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] - if audiovideo_formats: - return audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in available_formats) or - all(f.get('vcodec') != 'none' for f in available_formats)): - return available_formats[format_idx] - elif format_spec == 'bestaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[-1] - elif format_spec == 'worstaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[0] - elif format_spec == 'bestvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[-1] - elif format_spec == 'worstvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[0] - else: - extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] - if format_spec in extensions: - filter_f = lambda f: f['ext'] == format_spec - else: - filter_f = lambda f: f['format_id'] == format_spec - matches = list(filter(filter_f, available_formats)) - if matches: - return matches[-1] - return None + stream = io.BytesIO(format_spec.encode('utf-8')) + try: + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) + return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): res = std_headers.copy() @@ -1111,56 +1308,8 @@ class YoutubeDL(object): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) - formats_to_download = [] - if req_format == 'all': - formats_to_download = formats - else: - for rfstr in req_format.split(','): - # We can accept formats requested in the format: 34/5/best, we pick - # the first that is available, starting from left - req_formats = rfstr.split('/') - for rf in req_formats: - if re.match(r'.+?\+.+?', rf) is not None: - # Two formats have been requested like '137+139' - format_1, format_2 = rf.split('+') - formats_info = (self.select_format(format_1, formats), - self.select_format(format_2, formats)) - if all(formats_info): - # The first format must contain the video and the - # second the audio - if formats_info[0].get('vcodec') == 'none': - self.report_error('The first format must ' - 'contain the video, try using ' - '"-f %s+%s"' % (format_2, format_1)) - return - output_ext = ( - formats_info[0]['ext'] - if self.params.get('merge_output_format') is None - else self.params['merge_output_format']) - selected_format = { - 'requested_formats': formats_info, - 'format': '%s+%s' % (formats_info[0].get('format'), - formats_info[1].get('format')), - 'format_id': '%s+%s' % (formats_info[0].get('format_id'), - formats_info[1].get('format_id')), - 'width': formats_info[0].get('width'), - 'height': formats_info[0].get('height'), - 'resolution': formats_info[0].get('resolution'), - 'fps': formats_info[0].get('fps'), - 'vcodec': formats_info[0].get('vcodec'), - 'vbr': formats_info[0].get('vbr'), - 'stretched_ratio': formats_info[0].get('stretched_ratio'), - 'acodec': formats_info[1].get('acodec'), - 'abr': formats_info[1].get('abr'), - 'ext': output_ext, - } - else: - selected_format = None - else: - selected_format = self.select_format(rf, formats) - if selected_format is not None: - formats_to_download.append(selected_format) - break + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) @@ -1708,27 +1857,6 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ - - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - req_is_string = isinstance(req, compat_basestring) - url = req if req_is_string else req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - if req_is_string: - req = url_escaped - else: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - req = req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0c57c7aeb..ace5bd716 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -43,6 +43,11 @@ except ImportError: # Python 2 import cookielib as compat_cookiejar try: + import http.cookies as compat_cookies +except ImportError: # Python 2 + import Cookie as compat_cookies + +try: import html.entities as compat_html_entities except ImportError: # Python 2 import htmlentitydefs as compat_html_entities @@ -431,11 +436,17 @@ except TypeError: # Python 2.6 yield n n += step +if sys.version_info >= (3, 0): + from tokenize import tokenize as compat_tokenize_tokenize +else: + from tokenize import generate_tokens as compat_tokenize_tokenize + __all__ = [ 'compat_HTTPError', 'compat_basestring', 'compat_chr', 'compat_cookiejar', + 'compat_cookies', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', @@ -451,6 +462,7 @@ __all__ = [ 'compat_socket_create_connection', 'compat_str', 'compat_subprocess_get_DEVNULL', + 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', 'compat_urllib_parse_unquote', diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b1a858c45..275564b59 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -7,8 +7,7 @@ import os import time import xml.etree.ElementTree as etree -from .common import FileDownloader -from .http import HttpFD +from .fragment import FragmentFD from ..compat import ( compat_urlparse, compat_urllib_error, @@ -16,8 +15,6 @@ from ..compat import ( from ..utils import ( struct_pack, struct_unpack, - encodeFilename, - sanitize_open, xpath_text, ) @@ -226,16 +223,13 @@ def _add_ns(prop): return '{http://ns.adobe.com/f4m/1.0}%s' % prop -class HttpQuietDownloader(HttpFD): - def to_screen(self, *args, **kargs): - pass - - -class F4mFD(FileDownloader): +class F4mFD(FragmentFD): """ A downloader for f4m manifests or AdobeHDS. """ + FD_NAME = 'f4m' + def _get_unencrypted_media(self, doc): media = doc.findall(_add_ns('media')) if not media: @@ -288,7 +282,7 @@ class F4mFD(FileDownloader): def real_download(self, filename, info_dict): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') - self.to_screen('[download] Downloading f4m manifest') + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) manifest = self.ydl.urlopen(man_url).read() doc = etree.fromstring(manifest) @@ -320,67 +314,20 @@ class F4mFD(FileDownloader): # For some akamai manifests we'll need to add a query to the fragment url akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) - self.report_destination(filename) - http_dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': True, - 'quiet': True, - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit', None), - 'test': self.params.get('test', False), - } - ) - tmpfilename = self.temp_name(filename) - (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') + ctx = { + 'filename': filename, + 'total_frags': total_frags, + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] write_flv_header(dest_stream) if not live: write_metadata_tag(dest_stream, metadata) - # This dict stores the download progress, it's updated by the progress - # hook - state = { - 'status': 'downloading', - 'downloaded_bytes': 0, - 'frag_index': 0, - 'frag_count': total_frags, - 'filename': filename, - 'tmpfilename': tmpfilename, - } - start = time.time() - - def frag_progress_hook(s): - if s['status'] not in ('downloading', 'finished'): - return - - frag_total_bytes = s.get('total_bytes', 0) - if s['status'] == 'finished': - state['downloaded_bytes'] += frag_total_bytes - state['frag_index'] += 1 - - estimated_size = ( - (state['downloaded_bytes'] + frag_total_bytes) / - (state['frag_index'] + 1) * total_frags) - time_now = time.time() - state['total_bytes_estimate'] = estimated_size - state['elapsed'] = time_now - start - - if s['status'] == 'finished': - progress = self.calc_percent(state['frag_index'], total_frags) - else: - frag_downloaded_bytes = s['downloaded_bytes'] - frag_progress = self.calc_percent(frag_downloaded_bytes, - frag_total_bytes) - progress = self.calc_percent(state['frag_index'], total_frags) - progress += frag_progress / float(total_frags) - - state['eta'] = self.calc_eta( - start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes) - state['speed'] = s.get('speed') - self._hook_progress(state) - - http_dl.add_progress_hook(frag_progress_hook) + self._start_frag_download(ctx) frags_filenames = [] while fragments_list: @@ -391,9 +338,9 @@ class F4mFD(FileDownloader): url += '?' + akamai_pv.strip(';') if info_dict.get('extra_param_to_segment_url'): url += info_dict.get('extra_param_to_segment_url') - frag_filename = '%s-%s' % (tmpfilename, name) + frag_filename = '%s-%s' % (ctx['tmpfilename'], name) try: - success = http_dl.download(frag_filename, {'url': url}) + success = ctx['dl'].download(frag_filename, {'url': url}) if not success: return False with open(frag_filename, 'rb') as down: @@ -425,20 +372,9 @@ class F4mFD(FileDownloader): msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) self.report_warning(msg) - dest_stream.close() + self._finish_frag_download(ctx) - elapsed = time.time() - start - self.try_rename(tmpfilename, filename) for frag_file in frags_filenames: os.remove(frag_file) - fsize = os.path.getsize(encodeFilename(filename)) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - 'elapsed': elapsed, - }) - return True diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py new file mode 100644 index 000000000..5f9d6796d --- /dev/null +++ b/youtube_dl/downloader/fragment.py @@ -0,0 +1,110 @@ +from __future__ import division, unicode_literals + +import os +import time + +from .common import FileDownloader +from .http import HttpFD +from ..utils import ( + encodeFilename, + sanitize_open, +) + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + +class FragmentFD(FileDownloader): + """ + A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + """ + + def _prepare_and_start_frag_download(self, ctx): + self._prepare_frag_download(ctx) + self._start_frag_download(ctx) + + def _prepare_frag_download(self, ctx): + self.to_screen('[%s] Total fragments: %d' % (self.FD_NAME, ctx['total_frags'])) + self.report_destination(ctx['filename']) + dl = HttpQuietDownloader( + self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'ratelimit': self.params.get('ratelimit', None), + 'test': self.params.get('test', False), + } + ) + tmpfilename = self.temp_name(ctx['filename']) + dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb') + ctx.update({ + 'dl': dl, + 'dest_stream': dest_stream, + 'tmpfilename': tmpfilename, + }) + + def _start_frag_download(self, ctx): + total_frags = ctx['total_frags'] + # This dict stores the download progress, it's updated by the progress + # hook + state = { + 'status': 'downloading', + 'downloaded_bytes': 0, + 'frag_index': 0, + 'frag_count': total_frags, + 'filename': ctx['filename'], + 'tmpfilename': ctx['tmpfilename'], + } + start = time.time() + ctx['started'] = start + + def frag_progress_hook(s): + if s['status'] not in ('downloading', 'finished'): + return + + frag_total_bytes = s.get('total_bytes', 0) + if s['status'] == 'finished': + state['downloaded_bytes'] += frag_total_bytes + state['frag_index'] += 1 + + estimated_size = ( + (state['downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) + time_now = time.time() + state['total_bytes_estimate'] = estimated_size + state['elapsed'] = time_now - start + + if s['status'] == 'finished': + progress = self.calc_percent(state['frag_index'], total_frags) + else: + frag_downloaded_bytes = s['downloaded_bytes'] + frag_progress = self.calc_percent(frag_downloaded_bytes, + frag_total_bytes) + progress = self.calc_percent(state['frag_index'], total_frags) + progress += frag_progress / float(total_frags) + + state['eta'] = self.calc_eta( + start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes) + state['speed'] = s.get('speed') + self._hook_progress(state) + + ctx['dl'].add_progress_hook(frag_progress_hook) + + return start + + def _finish_frag_download(self, ctx): + ctx['dest_stream'].close() + elapsed = time.time() - ctx['started'] + self.try_rename(ctx['tmpfilename'], ctx['filename']) + fsize = os.path.getsize(encodeFilename(ctx['filename'])) + + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': ctx['filename'], + 'status': 'finished', + 'elapsed': elapsed, + }) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 8be4f4249..60dca0ab1 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,12 +4,11 @@ import os import re import subprocess -from ..postprocessor.ffmpeg import FFmpegPostProcessor from .common import FileDownloader -from ..compat import ( - compat_urlparse, - compat_urllib_request, -) +from .fragment import FragmentFD + +from ..compat import compat_urlparse +from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( encodeArgument, encodeFilename, @@ -51,54 +50,50 @@ class HlsFD(FileDownloader): return False -class NativeHlsFD(FileDownloader): +class NativeHlsFD(FragmentFD): """ A more limited implementation that does not require ffmpeg """ + FD_NAME = 'hlsnative' + def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) + man_url = info_dict['url'] + self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + manifest = self.ydl.urlopen(man_url).read() - self.to_screen( - '[hlsnative] %s: Downloading m3u8 manifest' % info_dict['id']) - data = self.ydl.urlopen(url).read() - s = data.decode('utf-8', 'ignore') - segment_urls = [] + s = manifest.decode('utf-8', 'ignore') + fragment_urls = [] for line in s.splitlines(): line = line.strip() if line and not line.startswith('#'): segment_url = ( line if re.match(r'^https?://', line) - else compat_urlparse.urljoin(url, line)) - segment_urls.append(segment_url) - - is_test = self.params.get('test', False) - remaining_bytes = self._TEST_FILE_SIZE if is_test else None - byte_counter = 0 - with open(tmpfilename, 'wb') as outf: - for i, segurl in enumerate(segment_urls): - self.to_screen( - '[hlsnative] %s: Downloading segment %d / %d' % - (info_dict['id'], i + 1, len(segment_urls))) - seg_req = compat_urllib_request.Request(segurl) - if remaining_bytes is not None: - seg_req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) - - segment = self.ydl.urlopen(seg_req).read() - if remaining_bytes is not None: - segment = segment[:remaining_bytes] - remaining_bytes -= len(segment) - outf.write(segment) - byte_counter += len(segment) - if remaining_bytes is not None and remaining_bytes <= 0: + else compat_urlparse.urljoin(man_url, line)) + fragment_urls.append(segment_url) + # We only download the first fragment during the test + if self.params.get('test', False): break - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, + ctx = { 'filename': filename, - 'status': 'finished', - }) - self.try_rename(tmpfilename, filename) + 'total_frags': len(fragment_urls), + } + + self._prepare_and_start_frag_download(ctx) + + frags_filenames = [] + for i, frag_url in enumerate(fragment_urls): + frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + with open(frag_filename, 'rb') as down: + ctx['dest_stream'].write(down.read()) + frags_filenames.append(frag_filename) + + self._finish_frag_download(ctx) + + for frag_file in frags_filenames: + os.remove(frag_file) + return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b7f144af9..a29f5cf31 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -4,6 +4,7 @@ import errno import os import socket import time +import re from .common import FileDownloader from ..compat import ( @@ -57,6 +58,24 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. This is due to a webservers + # that don't support resuming and serve a whole file with no Content-Range + # set in response despite of requested Range (see + # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) + if resume_len > 0: + content_range = data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-', content_range) + # Content-Range is present and matches requested Range, resume is possible + if content_range_m and resume_len == int(content_range_m.group(1)): + break + # Content-Range is either not present or invalid. Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload + self.report_unable_to_resume() + resume_len = 0 + open_mode = 'wb' break except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cfa804ec..922d9b3d8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -43,7 +43,10 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE +from .bbc import ( + BBCCoUkIE, + BBCIE, +) from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE @@ -115,6 +118,7 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE +from .dcn import DCNIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE @@ -243,6 +247,7 @@ from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE +from .ir90tv import Ir90TvIE from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py new file mode 100644 index 000000000..abc5a44a1 --- /dev/null +++ b/youtube_dl/extractor/bbc.py @@ -0,0 +1,780 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + parse_iso8601, +) +from ..compat import compat_HTTPError + + +class BBCCoUkIE(InfoExtractor): + IE_NAME = 'bbc.co.uk' + IE_DESC = 'BBC iPlayer' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})' + + _MEDIASELECTOR_URLS = [ + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + ] + + _TESTS = [ + { + 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', + 'info_dict': { + 'id': 'b039d07m', + 'ext': 'flv', + 'title': 'Kaleidoscope, Leonard Cohen', + 'description': 'The Canadian poet and songwriter reflects on his musical career.', + 'duration': 1740, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Man in Black: Series 3: The Printed Name', + 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.", + 'duration': 1800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Voice UK: Series 3: Blind Auditions 5', + 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", + 'duration': 5100, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', + 'info_dict': { + 'id': 'b03k3pb7', + 'ext': 'flv', + 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", + 'description': '2. Invasion', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, { + 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', + 'info_dict': { + 'id': 'b04v209v', + 'ext': 'flv', + 'title': 'Pete Tong, The Essential New Tune Special', + 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", + 'duration': 10800, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'note': 'Audio', + 'info_dict': { + 'id': 'p02frcch', + 'ext': 'flv', + 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', + 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', + 'duration': 3507, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', + 'note': 'Video', + 'info_dict': { + 'id': 'p025c103', + 'ext': 'flv', + 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', + 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', + 'duration': 226, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', + 'info_dict': { + 'id': 'p02n76xf', + 'ext': 'flv', + 'title': 'Natural World, 2015-2016: 2. Super Powered Owls', + 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', + 'info_dict': { + 'id': 'b05zmgw1', + 'ext': 'flv', + 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', + 'title': 'Royal Academy Summer Exhibition', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', + 'only_matching': True, + } + ] + + class MediaSelectionError(Exception): + def __init__(self, id): + self.id = id + + def _extract_asx_playlist(self, connection, programme_id): + asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') + return [ref.get('href') for ref in asx.findall('./Entry/ref')] + + def _extract_connection(self, connection, programme_id): + formats = [] + protocol = connection.get('protocol') + supplier = connection.get('supplier') + if protocol == 'http': + href = connection.get('href') + transfer_format = connection.get('transferFormat') + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, supplier), + }) + # Skip DASH until supported + elif transfer_format == 'dash': + pass + # Direct link + else: + formats.append({ + 'url': href, + 'format_id': supplier, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + formats.append({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + 'format_id': supplier, + }) + return formats + + def _extract_items(self, playlist): + return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') + + def _extract_medias(self, media_selection): + error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') + if error is not None: + raise BBCCoUkIE.MediaSelectionError(error.get('id')) + return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') + + def _extract_connections(self, media): + return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') + + def _extract_video(self, media, programme_id): + formats = [] + vbr = int_or_none(media.get('bitrate')) + vcodec = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + conn_formats = self._extract_connection(connection, programme_id) + for format in conn_formats: + format.update({ + 'format_id': '%s_%s' % (service, format['format_id']), + 'width': width, + 'height': height, + 'vbr': vbr, + 'vcodec': vcodec, + 'filesize': file_size, + }) + formats.extend(conn_formats) + return formats + + def _extract_audio(self, media, programme_id): + formats = [] + abr = int_or_none(media.get('bitrate')) + acodec = media.get('encoding') + service = media.get('service') + for connection in self._extract_connections(media): + conn_formats = self._extract_connection(connection, programme_id) + for format in conn_formats: + format.update({ + 'format_id': '%s_%s' % (service, format['format_id']), + 'abr': abr, + 'acodec': acodec, + }) + formats.extend(conn_formats) + return formats + + def _get_subtitles(self, media, programme_id): + subtitles = {} + for connection in self._extract_connections(media): + captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') + lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') + subtitles[lang] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + ] + return subtitles + + def _raise_extractor_error(self, media_selection_error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + expected=True) + + def _download_media_selector(self, programme_id): + last_exception = None + for mediaselector_url in self._MEDIASELECTOR_URLS: + try: + return self._download_media_selector_url( + mediaselector_url % programme_id, programme_id) + except BBCCoUkIE.MediaSelectionError as e: + if e.id == 'notukerror': + last_exception = e + continue + self._raise_extractor_error(e) + self._raise_extractor_error(last_exception) + + def _download_media_selector_url(self, url, programme_id=None): + try: + media_selection = self._download_xml( + url, programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + else: + raise + return self._process_media_selector(media_selection, programme_id) + + def _process_media_selector(self, media_selection, programme_id): + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + return formats, subtitles + + def _download_playlist(self, playlist_id): + try: + playlist = self._download_json( + 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, + playlist_id, 'Downloading playlist JSON') + + version = playlist.get('defaultAvailableVersion') + if version: + smp_config = version['smpConfig'] + title = smp_config['title'] + description = smp_config['summary'] + for item in smp_config['items']: + kind = item['kind'] + if kind != 'programme' and kind != 'radioProgramme': + continue + programme_id = item.get('vpid') + duration = int_or_none(item.get('duration')) + formats, subtitles = self._download_media_selector(programme_id) + return programme_id, title, description, duration, formats, subtitles + except ExtractorError as ee: + if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + raise + + # fallback to legacy playlist + return self._process_legacy_playlist(playlist_id) + + def _process_legacy_playlist_url(self, url, display_id): + playlist = self._download_legacy_playlist_url(url, display_id) + return self._extract_from_legacy_playlist(playlist, display_id) + + def _process_legacy_playlist(self, playlist_id): + return self._process_legacy_playlist_url( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + + def _download_legacy_playlist_url(self, url, playlist_id=None): + return self._download_xml( + url, playlist_id, 'Downloading legacy playlist XML') + + def _extract_from_legacy_playlist(self, playlist, playlist_id): + no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') + if no_items is not None: + reason = no_items.get('reason') + if reason == 'preAvailability': + msg = 'Episode %s is not yet available' % playlist_id + elif reason == 'postAvailability': + msg = 'Episode %s is no longer available' % playlist_id + elif reason == 'noMedia': + msg = 'Episode %s is not currently available' % playlist_id + else: + msg = 'Episode %s is not available: %s' % (playlist_id, reason) + raise ExtractorError(msg, expected=True) + + for item in self._extract_items(playlist): + kind = item.get('kind') + if kind != 'programme' and kind != 'radioProgramme': + continue + title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text + description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text + + def get_programme_id(item): + def get_from_attributes(item): + for p in('identifier', 'group'): + value = item.get(p) + if value and re.match(r'^[pb][\da-z]{7}$', value): + return value + get_from_attributes(item) + mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator') + if mediator is not None: + return get_from_attributes(mediator) + + programme_id = get_programme_id(item) + duration = int_or_none(item.get('duration')) + # TODO: programme_id can be None and media items can be incorporated right inside + # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # as f4m and m3u8 + formats, subtitles = self._download_media_selector(programme_id) + + return programme_id, title, description, duration, formats, subtitles + + def _real_extract(self, url): + group_id = self._match_id(url) + + webpage = self._download_webpage(url, group_id, 'Downloading video page') + + programme_id = None + + tviplayer = self._search_regex( + r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', + webpage, 'player', default=None) + + if tviplayer: + player = self._parse_json(tviplayer, group_id).get('player', {}) + duration = int_or_none(player.get('duration')) + programme_id = player.get('vpid') + + if not programme_id: + programme_id = self._search_regex( + r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + title = self._og_search_title(webpage) + description = self._search_regex( + r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', + webpage, 'description', fatal=False) + else: + programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) + + self._sort_formats(formats) + + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + +class BBCIE(BBCCoUkIE): + IE_NAME = 'bbc' + IE_DESC = 'BBC' + _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' + + _MEDIASELECTOR_URLS = [ + # Provides more formats, namely direct mp4 links, but fails on some videos with + # notukerror for non UK (?) users (e.g. + # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', + # Provides fewer formats, but works everywhere for everybody (hopefully) + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + ] + + _TESTS = [{ + # article with multiple videos embedded with data-media-meta containing + # playlist.sxml, externalId and no direct video links + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', + }, + 'playlist_count': 2, + }, { + # article with multiple videos embedded with data-media-meta (more videos) + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + 'description': 'BBC reports and video highlights at the Farnborough Airshow.', + }, + 'playlist_count': 9, + 'skip': 'Save time', + }, { + # article with multiple videos embedded with `new SMP()` + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': { + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BBC Blogs - Adam Curtis - BUGGER', + }, + 'playlist_count': 18, + }, { + # single video embedded with mediaAssetPage.init() + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + 'timestamp': 1427219242, + 'upload_date': '20150324', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # article with single video embedded with data-media-meta containing + # direct video links (for now these are extracted) and playlist.xml (with + # media items as f4m and m3u8 - currently unsupported) + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'info_dict': { + 'id': '150615_telabyad_kentin_cogu', + 'ext': 'mp4', + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", + 'duration': 47, + 'timestamp': 1434397334, + 'upload_date': '20150615', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video embedded with mediaAssetPage.init() (regional section) + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'info_dict': { + 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', + 'duration': 87, + 'timestamp': 1434713142, + 'upload_date': '20150619', + }, + 'params': { + 'skip_download': True, + } + }, { + # single video from video playlist embedded with vxp-playlist-data JSON + 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', + 'info_dict': { + 'id': 'p02w6qjc', + 'ext': 'mp4', + 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + 'duration': 56, + }, + 'params': { + 'skip_download': True, + } + }, { + # single video story with digitalData + 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', + 'info_dict': { + 'id': 'p02q6gc4', + 'ext': 'flv', + 'title': 'Sri Lanka’s spicy secret', + 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', + 'timestamp': 1437674293, + 'upload_date': '20150723', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video story without digitalData + 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', + 'info_dict': { + 'id': 'p018zqqg', + 'ext': 'mp4', + 'title': 'Hyundai Santa Fe Sport: Rock star', + 'description': 'md5:b042a26142c4154a6e472933cf20793d', + 'timestamp': 1368473503, + 'upload_date': '20130513', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist.sxml URL + 'url': 'http://www.bbc.com/sport/0/football/33653409', + 'info_dict': { + 'id': 'p02xycnp', + 'ext': 'mp4', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', + 'duration': 140, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist URL from weather section + 'url': 'http://www.bbc.com/weather/features/33601775', + 'only_matching': True, + }, { + # custom redirection to www.bbc.com + 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + + def _extract_from_media_meta(self, media_meta, video_id): + # Direct links to media in media metadata (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml + source_files = media_meta.get('sourceFiles') + if source_files: + return [{ + 'url': f['url'], + 'format_id': format_id, + 'ext': f.get('encoding'), + 'tbr': float_or_none(f.get('bitrate'), 1000), + 'filesize': int_or_none(f.get('filesize')), + } for format_id, f in source_files.items() if f.get('url')], [] + + programme_id = media_meta.get('externalId') + if programme_id: + return self._download_media_selector(programme_id) + + # Process playlist.sxml as legacy playlist + href = media_meta.get('href') + if href: + playlist = self._download_legacy_playlist_url(href) + _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) + return formats, subtitles + + return [], [] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + timestamp = parse_iso8601(self._search_regex( + [r'"datePublished":\s*"([^"]+)', + r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], + webpage, 'date', default=None)) + + # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/3365340ng) + playlist = self._search_regex( + r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', + webpage, 'playlist', default=None) + if playlist: + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(playlist, playlist_id) + self._sort_formats(formats) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + programme_id = self._search_regex( + [r'data-video-player-vpid="([\da-z]{8})"', + r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + webpage, 'vpid', default=None) + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star) + digital_data = self._parse_json( + self._search_regex( + r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), + programme_id, fatal=False) + page_info = digital_data.get('page', {}).get('pageInfo', {}) + title = page_info.get('pageName') or self._og_search_title(webpage) + description = page_info.get('description') or self._og_search_description(webpage) + timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + playlist_title = self._html_search_regex( + r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title') + playlist_description = self._og_search_description(webpage, default=None) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry, 'BBCCoUk') for entry in entries], + playlist_id, playlist_title, playlist_description) + + # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) + medias = extract_all(r"data-media-meta='({[^']+})'") + + if not medias: + # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) + media_asset = self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', + webpage, 'media asset', default=None) + if media_asset: + media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + if not medias: + # Multiple video playlist with single `now playing` entry (e.g. + # http://www.bbc.com/news/video_and_audio/must_see/33767813) + vxp_playlist = self._parse_json( + self._search_regex( + r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>', + webpage, 'playlist data'), + playlist_id) + playlist_medias = [] + for item in vxp_playlist: + media = item.get('media') + if not media: + continue + playlist_medias.append(media) + # Download single video if found media with asset id matching the video id from URL + if item.get('advert', {}).get('assetId') == playlist_id: + medias = [media] + break + # Fallback to the whole playlist + if not medias: + medias = playlist_medias + + entries = [] + for num, media_meta in enumerate(medias, start=1): + formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) + if not formats: + continue + self._sort_formats(formats) + + video_id = media_meta.get('externalId') + if not video_id: + video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + + title = media_meta.get('caption') + if not title: + title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + + duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) + + images = [] + for image in media_meta.get('images', {}).values(): + images.extend(image.values()) + if 'image' in media_meta: + images.append(media_meta['image']) + + thumbnails = [{ + 'url': image.get('href'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in images] + + entries.append({ + 'id': video_id, + 'title': title, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + }) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py deleted file mode 100644 index 5825d2867..000000000 --- a/youtube_dl/extractor/bbccouk.py +++ /dev/null @@ -1,379 +0,0 @@ -from __future__ import unicode_literals - -import xml.etree.ElementTree - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, -) -from ..compat import compat_HTTPError - - -class BBCCoUkIE(InfoExtractor): - IE_NAME = 'bbc.co.uk' - IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})' - - _TESTS = [ - { - 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', - 'info_dict': { - 'id': 'b039d07m', - 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', - 'description': 'The Canadian poet and songwriter reflects on his musical career.', - 'duration': 1740, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Man in Black: Series 3: The Printed Name', - 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.", - 'duration': 1800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Episode is no longer available on BBC iPlayer Radio', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Voice UK: Series 3: Blind Auditions 5', - 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", - 'duration': 5100, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', - 'info_dict': { - 'id': 'b03k3pb7', - 'ext': 'flv', - 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", - 'description': '2. Invasion', - 'duration': 3600, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, { - 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', - 'info_dict': { - 'id': 'b04v209v', - 'ext': 'flv', - 'title': 'Pete Tong, The Essential New Tune Special', - 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", - 'duration': 10800, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', - 'note': 'Audio', - 'info_dict': { - 'id': 'p02frcch', - 'ext': 'flv', - 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', - 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', - 'duration': 3507, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', - 'note': 'Video', - 'info_dict': { - 'id': 'p025c103', - 'ext': 'flv', - 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', - 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', - 'duration': 226, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', - 'info_dict': { - 'id': 'p02n76xf', - 'ext': 'flv', - 'title': 'Natural World, 2015-2016: 2. Super Powered Owls', - 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', - 'duration': 3540, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'geolocation', - }, { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', - 'info_dict': { - 'id': 'b05zmgw1', - 'ext': 'flv', - 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', - 'title': 'Royal Academy Summer Exhibition', - 'duration': 3540, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'geolocation', - }, { - 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', - 'only_matching': True, - } - ] - - def _extract_asx_playlist(self, connection, programme_id): - asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') - return [ref.get('href') for ref in asx.findall('./Entry/ref')] - - def _extract_connection(self, connection, programme_id): - formats = [] - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if protocol == 'http': - href = connection.get('href') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': supplier, - }) - return formats - - def _extract_items(self, playlist): - return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') - - def _extract_medias(self, media_selection): - error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') - if error is not None: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True) - return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') - - def _extract_connections(self, media): - return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') - - def _extract_video(self, media, programme_id): - formats = [] - vbr = int(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int(media.get('width')) - height = int(media.get('height')) - file_size = int(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, - 'filesize': file_size, - }) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - }) - formats.extend(conn_formats) - return formats - - def _get_subtitles(self, media, programme_id): - subtitles = {} - for connection in self._extract_connections(media): - captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - subtitles[lang] = [ - { - 'url': connection.get('href'), - 'ext': 'ttml', - }, - ] - return subtitles - - def _download_media_selector(self, programme_id): - try: - media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) - else: - raise - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - return formats, subtitles - - def _download_playlist(self, playlist_id): - try: - playlist = self._download_json( - 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, - playlist_id, 'Downloading playlist JSON') - - version = playlist.get('defaultAvailableVersion') - if version: - smp_config = version['smpConfig'] - title = smp_config['title'] - description = smp_config['summary'] - for item in smp_config['items']: - kind = item['kind'] - if kind != 'programme' and kind != 'radioProgramme': - continue - programme_id = item.get('vpid') - duration = int(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles - except ExtractorError as ee: - if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): - raise - - # fallback to legacy playlist - playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, - playlist_id, 'Downloading legacy playlist XML') - - no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') - if no_items is not None: - reason = no_items.get('reason') - if reason == 'preAvailability': - msg = 'Episode %s is not yet available' % playlist_id - elif reason == 'postAvailability': - msg = 'Episode %s is no longer available' % playlist_id - elif reason == 'noMedia': - msg = 'Episode %s is not currently available' % playlist_id - else: - msg = 'Episode %s is not available: %s' % (playlist_id, reason) - raise ExtractorError(msg, expected=True) - - for item in self._extract_items(playlist): - kind = item.get('kind') - if kind != 'programme' and kind != 'radioProgramme': - continue - title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text - programme_id = item.get('identifier') - duration = int(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - - return programme_id, title, description, duration, formats, subtitles - - def _real_extract(self, url): - group_id = self._match_id(url) - - webpage = self._download_webpage(url, group_id, 'Downloading video page') - - programme_id = None - - tviplayer = self._search_regex( - r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', - webpage, 'player', default=None) - - if tviplayer: - player = self._parse_json(tviplayer, group_id).get('player', {}) - duration = int_or_none(player.get('duration')) - programme_id = player.get('vpid') - - if not programme_id: - programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - title = self._og_search_title(webpage) - description = self._search_regex( - r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', - webpage, 'description', fatal=False) - else: - programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - - self._sort_formats(formats) - - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 699b4f7d0..57e0cda2c 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -106,15 +106,11 @@ class CanalplusIE(InfoExtractor): continue format_id = fmt.tag if format_id == 'HLS': - hls_formats = self._extract_m3u8_formats(format_url, video_id, 'flv') - for fmt in hls_formats: - fmt['preference'] = preference(format_id) - formats.extend(hls_formats) + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', preference=preference(format_id))) elif format_id == 'HDS': - hds_formats = self._extract_f4m_formats(format_url + '?hdcore=2.11.3', video_id) - for fmt in hds_formats: - fmt['preference'] = preference(format_id) - formats.extend(hds_formats) + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=2.11.3', video_id, preference=preference(format_id))) else: formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 9c25b2223..81f3d7697 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -36,7 +36,7 @@ class ComCarCoffIE(InfoExtractor): webpage, 'full data json')) video_id = full_data['activeVideo']['video'] - video_data = full_data['videos'][video_id] + video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] thumbnails = [{ 'url': video_data['images']['thumb'], }, { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b9014fc23..dc5080504 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -14,10 +14,12 @@ import xml.etree.ElementTree from ..compat import ( compat_cookiejar, + compat_cookies, compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, + compat_urllib_request, compat_urlparse, compat_str, ) @@ -65,7 +67,7 @@ class InfoExtractor(object): Potential fields: * url Mandatory. The URL of the video file - * ext Will be calculated from url if missing + * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). Calculated from the format_id, width, height. @@ -155,7 +157,7 @@ class InfoExtractor(object): lower to higher preference, each element is a dictionary with the "ext" entry and one of: * "data": The subtitles file contents - * "url": A url pointing to the subtitles file + * "url": A URL pointing to the subtitles file automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. @@ -176,13 +178,18 @@ class InfoExtractor(object): Set to "root" to indicate that this is a comment to the original video. age_limit: Age restriction for the video, as an integer (years) - webpage_url: The url to the video webpage, if given to youtube-dl it + webpage_url: The URL to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. + start_time: Time in seconds where the reproduction should start, as + specified in the URL. + end_time: Time in seconds where the reproduction should end, as + specified in the URL. Unless mentioned otherwise, the fields should be Unicode strings. @@ -501,7 +508,7 @@ class InfoExtractor(object): # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): - """Returns a url that points to a page that should be processed""" + """Returns a URL that points to a page that should be processed""" # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, @@ -626,6 +633,12 @@ class InfoExtractor(object): template % (content_re, property_re), ] + @staticmethod + def _meta_regex(prop): + return r'''(?isx)<meta + (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) + def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop @@ -635,7 +648,7 @@ class InfoExtractor(object): return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs) + return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) @@ -656,9 +669,7 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name), + self._meta_regex(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -1065,6 +1076,12 @@ class InfoExtractor(object): None, '/', True, False, expire_time, '', None, None, None) self._downloader.cookiejar.set_cookie(cookie) + def _get_cookies(self, url): + """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + req = compat_urllib_request.Request(url) + self._downloader.cookiejar.add_cookie_header(req) + return compat_cookies.SimpleCookie(req.get_header('Cookie')) + def get_testcases(self, include_onlymatching=False): t = getattr(self, '_TEST', None) if t: @@ -1116,7 +1133,7 @@ class InfoExtractor(object): class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. - They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} + They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} Instances should define _SEARCH_KEY and _MAX_RESULTS. """ diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 1a41c0db1..2d90b2224 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -13,8 +13,9 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + determine_ext, int_or_none, - orderedSet, + parse_iso8601, str_to_int, unescapeHTML, ) @@ -28,10 +29,16 @@ class DailymotionBaseInfoExtractor(InfoExtractor): request.add_header('Cookie', 'family_filter=off; ff=off') return request + def _download_webpage_handle_no_ff(self, url, *args, **kwargs): + request = self._build_request(url) + return self._download_webpage_handle(request, *args, **kwargs) + + def _download_webpage_no_ff(self, url, *args, **kwargs): + request = self._build_request(url) + return self._download_webpage(request, *args, **kwargs) -class DailymotionIE(DailymotionBaseInfoExtractor): - """Information Extractor for Dailymotion""" +class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' IE_NAME = 'dailymotion' @@ -50,10 +57,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'info_dict': { 'id': 'x2iuewm', 'ext': 'mp4', - 'uploader': 'IGN', 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', - 'upload_date': '20150306', + 'description': 'Several come bundled with the Steam Controller.', + 'thumbnail': 're:^https?:.*\.(?:jpg|png)$', 'duration': 74, + 'timestamp': 1425657362, + 'upload_date': '20150306', + 'uploader': 'IGN', + 'uploader_id': 'xijv66', + 'age_limit': 0, + 'view_count': int, + 'comment_count': int, } }, # Vevo video @@ -87,38 +101,106 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://www.dailymotion.com/video/%s' % video_id - # Retrieve video webpage to extract further information - request = self._build_request(url) - webpage = self._download_webpage(request, video_id) + webpage = self._download_webpage_no_ff( + 'https://www.dailymotion.com/video/%s' % video_id, video_id) + + age_limit = self._rta_search(webpage) - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') - # It may just embed a vevo video: - m_vevo = re.search( + view_count = str_to_int(self._search_regex( + [r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"', + r'video_views_count[^>]+>\s+([\d\.,]+)'], + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"', + webpage, 'comment count', fatal=False)) + + player_v5 = self._search_regex( + r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', + webpage, 'player v5', default=None) + if player_v5: + player = self._parse_json(player_v5, video_id) + metadata = player['metadata'] + formats = [] + for quality, media_list in metadata['qualities'].items(): + for media in media_list: + media_url = media.get('url') + if not media_url: + continue + type_ = media.get('type') + if type_ == 'application/vnd.lumberjack.manifest': + continue + if type_ == 'application/x-mpegURL' or determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls')) + else: + f = { + 'url': media_url, + 'format_id': quality, + } + m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + formats.append(f) + self._sort_formats(formats) + + title = metadata['title'] + duration = int_or_none(metadata.get('duration')) + timestamp = int_or_none(metadata.get('created_time')) + thumbnail = metadata.get('poster_url') + uploader = metadata.get('owner', {}).get('screenname') + uploader_id = metadata.get('owner', {}).get('id') + + subtitles = {} + for subtitle_lang, subtitle in metadata.get('subtitles', {}).get('data', {}).items(): + subtitles[subtitle_lang] = [{ + 'ext': determine_ext(subtitle_url), + 'url': subtitle_url, + } for subtitle_url in subtitle.get('urls', [])] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'age_limit': age_limit, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats, + 'subtitles': subtitles, + } + + # vevo embed + vevo_id = self._search_regex( r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)', - webpage) - if m_vevo is not None: - vevo_id = m_vevo.group('id') - self.to_screen('Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + webpage, 'vevo embed', default=None) + if vevo_id: + return self.url_result('vevo:%s' % vevo_id, 'Vevo') - age_limit = self._rta_search(webpage) + # fallback old player + embed_page = self._download_webpage_no_ff( + 'https://www.dailymotion.com/embed/video/%s' % video_id, + video_id, 'Downloading embed page') + + timestamp = parse_iso8601(self._html_search_meta( + 'video:release_date', webpage, 'upload date')) + + info = self._parse_json( + self._search_regex( + r'var info = ({.*?}),$', embed_page, + 'video info', flags=re.MULTILINE), + video_id) - video_upload_date = None - mobj = re.search(r'<meta property="video:release_date" content="([0-9]{4})-([0-9]{2})-([0-9]{2}).+?"/>', webpage) - if mobj is not None: - video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) - - embed_url = 'https://www.dailymotion.com/embed/video/%s' % video_id - embed_request = self._build_request(embed_url) - embed_page = self._download_webpage( - embed_request, video_id, 'Downloading embed page') - info = self._search_regex(r'var info = ({.*?}),$', embed_page, - 'video info', flags=re.MULTILINE) - info = json.loads(info) if info.get('error') is not None: msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] raise ExtractorError(msg, expected=True) @@ -139,16 +221,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'width': width, 'height': height, }) - if not formats: - raise ExtractorError('Unable to extract video URL') + self._sort_formats(formats) # subtitles video_subtitles = self.extract_subtitles(video_id, webpage) - view_count = str_to_int(self._search_regex( - r'video_views_count[^>]+>\s+([\d\.,]+)', - webpage, 'view count', fatal=False)) - title = self._og_search_title(webpage, default=None) if title is None: title = self._html_search_regex( @@ -159,8 +236,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'id': video_id, 'formats': formats, 'uploader': info['owner.screenname'], - 'upload_date': video_upload_date, + 'timestamp': timestamp, 'title': title, + 'description': description, 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, @@ -199,18 +277,26 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): }] def _extract_entries(self, id): - video_ids = [] + video_ids = set() + processed_urls = set() for pagenum in itertools.count(1): - request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum)) - webpage = self._download_webpage(request, - id, 'Downloading page %s' % pagenum) + page_url = self._PAGE_TEMPLATE % (id, pagenum) + webpage, urlh = self._download_webpage_handle_no_ff( + page_url, id, 'Downloading page %s' % pagenum) + if urlh.geturl() in processed_urls: + self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( + page_url, urlh.geturl()), id) + break - video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage)) + processed_urls.add(urlh.geturl()) + + for video_id in re.findall(r'data-xid="(.+?)"', webpage): + if video_id not in video_ids: + yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + video_ids.add(video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break - return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') - for video_id in orderedSet(video_ids)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -227,7 +313,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P<user>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', @@ -236,6 +322,17 @@ class DailymotionUserIE(DailymotionPlaylistIE): 'title': 'Rémi Gaillard', }, 'playlist_mincount': 100, + }, { + 'url': 'http://www.dailymotion.com/user/UnderProject', + 'info_dict': { + 'id': 'UnderProject', + 'title': 'UnderProject', + }, + 'playlist_mincount': 1800, + 'expected_warnings': [ + 'Stopped at duplicated page', + ], + 'skip': 'Takes too long time', }] def _real_extract(self, url): @@ -286,8 +383,7 @@ class DailymotionCloudIE(DailymotionBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - request = self._build_request(url) - webpage = self._download_webpage(request, video_id) + webpage = self._download_webpage_no_ff(url, video_id) title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title') diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py new file mode 100644 index 000000000..82261e25c --- /dev/null +++ b/youtube_dl/extractor/dcn.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, +) +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class DCNIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', + 'info_dict': + { + 'id': '17375', + 'ext': 'mp4', + 'title': 'رحلة العمر : الحلقة 1', + 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 2041, + 'timestamp': 1227504126, + 'upload_date': '20081124', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + + video = self._download_json(request, video_id) + title = video.get('title_en') or video['title_ar'] + + webpage = self._download_webpage( + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + + compat_urllib_parse.urlencode({ + 'id': video['id'], + 'user_id': video['user_id'], + 'signature': video['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }), video_id) + + m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + rtsp_url = self._search_regex( + r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + + img = video.get('img') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + duration = int_or_none(video.get('duration')) + description = video.get('description_en') or video.get('description_ar') + timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e17bb9aea..178a7ca4c 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,6 +17,8 @@ from ..utils import ( int_or_none, limit_length, urlencode_postdata, + get_element_by_id, + clean_html, ) @@ -42,6 +44,7 @@ class FacebookIE(InfoExtractor): 'id': '637842556329505', 'ext': 'mp4', 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', + 'uploader': 'Tennis on Facebook', } }, { 'note': 'Video without discernible title', @@ -50,6 +53,7 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', + 'uploader': 'Asif Nawab Butt', }, 'expected_warnings': [ 'title' @@ -161,6 +165,7 @@ class FacebookIE(InfoExtractor): video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id + uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) return { 'id': video_id, @@ -168,4 +173,5 @@ class FacebookIE(InfoExtractor): 'formats': formats, 'duration': int_or_none(video_data.get('video_duration')), 'thumbnail': video_data.get('thumbnail_src'), + 'uploader': uploader, } diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 43f916412..a6834db43 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -7,7 +7,10 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, ) -from ..utils import remove_end +from ..utils import ( + remove_end, + HEADRequest, +) class GDCVaultIE(InfoExtractor): @@ -73,10 +76,20 @@ class GDCVaultIE(InfoExtractor): return video_formats def _parse_flv(self, xml_description): - video_formats = [] + formats = [] akamai_url = xml_description.find('./metadata/akamaiHost').text + audios = xml_description.find('./metadata/audios') + if audios is not None: + for audio in audios: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(audio.get('url'), '.flv'), + 'ext': 'flv', + 'vcodec': 'none', + 'format_id': audio.get('code'), + }) slide_video_path = xml_description.find('./metadata/slideVideo').text - video_formats.append({ + formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(slide_video_path, '.flv'), 'ext': 'flv', @@ -86,7 +99,7 @@ class GDCVaultIE(InfoExtractor): 'format_id': 'slides', }) speaker_video_path = xml_description.find('./metadata/speakerVideo').text - video_formats.append({ + formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(speaker_video_path, '.flv'), 'ext': 'flv', @@ -95,7 +108,7 @@ class GDCVaultIE(InfoExtractor): 'preference': -1, 'format_id': 'speaker', }) - return video_formats + return formats def _login(self, webpage_url, display_id): (username, password) = self._get_login_info() @@ -133,16 +146,18 @@ class GDCVaultIE(InfoExtractor): r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', start_page, 'url', default=None) if direct_url: - video_url = 'http://www.gdcvault.com/' + direct_url title = self._html_search_regex( r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>', start_page, 'title') + video_url = 'http://www.gdcvault.com' + direct_url + # resolve the url so that we can detect the correct extension + head = self._request_webpage(HEADRequest(video_url), video_id) + video_url = head.geturl() return { 'id': video_id, 'display_id': display_id, 'url': video_url, - 'ext': 'flv', 'title': title, } @@ -168,8 +183,8 @@ class GDCVaultIE(InfoExtractor): # Fallback to the older format xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename') - xml_decription_url = xml_root + 'xml/' + xml_name - xml_description = self._download_xml(xml_decription_url, display_id) + xml_description_url = xml_root + 'xml/' + xml_name + xml_description = self._download_xml(xml_description_url, display_id) video_title = xml_description.find('./metadata/title').text video_formats = self._parse_mp4(xml_description) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d2efb22e..6df89f814 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -276,14 +276,6 @@ class GenericIE(InfoExtractor): 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, }, - # BBC iPlayer embeds - { - 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', - 'info_dict': { - 'title': 'BBC - Blogs - Adam Curtis - BUGGER', - }, - 'playlist_mincount': 18, - }, # RUTV embed { 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', @@ -1663,7 +1655,7 @@ class GenericIE(InfoExtractor): if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( - r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) if not found: # Flow player found = filter_video(re.findall(r'''(?xs) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py new file mode 100644 index 000000000..214bcd5b5 --- /dev/null +++ b/youtube_dl/extractor/ir90tv.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_start + + +class Ir90TvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*' + _TESTS = [{ + 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', + 'md5': '411dbd94891381960cb9e13daa47a869', + 'info_dict': { + 'id': '95719', + 'ext': 'mp4', + 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, { + 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = remove_start(self._html_search_regex( + r'<title>([^<]+)</title>', webpage, 'title'), '90tv.ir :: ') + + video_url = self._search_regex( + r'<source[^>]+src="([^"]+)"', webpage, 'video url') + + thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False) + + return { + 'url': video_url, + 'id': video_id, + 'title': title, + 'video_url': video_url, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index ba2ae8085..a28abb0f0 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -15,6 +15,7 @@ from ..utils import ( determine_ext, ExtractorError, parse_iso8601, + int_or_none, ) @@ -134,7 +135,7 @@ class LetvIE(InfoExtractor): } if format_id[-1:] == 'p': - url_info_dict['height'] = format_id[:-1] + url_info_dict['height'] = int_or_none(format_id[:-1]) urls.append(url_info_dict) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index a00f6e5e5..deead220a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -17,7 +17,6 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' - _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' _NETRC_MACHINE = 'lynda' @@ -41,7 +40,7 @@ class LyndaBaseIE(InfoExtractor): request, None, 'Logging in as %s' % username) # Not (yet) logged in - m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) + m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page) if m is not None: response = m.group('json') response_json = json.loads(response) @@ -70,7 +69,7 @@ class LyndaBaseIE(InfoExtractor): request, None, 'Confirming log in and log out from another device') - if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): raise ExtractorError('Unable to log in') diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 5fdd19027..fc7499958 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -29,7 +29,7 @@ class MDRIE(InfoExtractor): doc = self._download_xml(domain + xmlurl, video_id) formats = [] for a in doc.findall('./assets/asset'): - url_el = a.find('.//progressiveDownloadUrl') + url_el = a.find('./progressiveDownloadUrl') if url_el is None: continue abr = int(a.find('bitrateAudio').text) // 1000 diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index dc2091be0..ccdbfb6c9 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -124,7 +124,7 @@ class NBCSportsIE(InfoExtractor): class NBCNewsIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| - (?:feature|nightly-news)/[^/]+/(?P<title>.+)) + (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+)) ''' _TESTS = [ @@ -169,6 +169,10 @@ class NBCNewsIE(InfoExtractor): 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', + 'only_matching': True, + }, ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 0b5ff4760..66c627bec 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -1,12 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + determine_ext, int_or_none, parse_iso8601, parse_duration, @@ -15,7 +14,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' + _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' _TESTS = [{ # rtl @@ -23,7 +22,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203519', 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Die neuen Bauern und eine Hochzeit', 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', 'thumbnail': 're:^https?://.*\.jpg$', @@ -32,7 +31,7 @@ class NowTVIE(InfoExtractor): 'duration': 2786, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -41,7 +40,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203481', 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Berlin - Tag & Nacht (Folge 934)', 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', 'thumbnail': 're:^https?://.*\.jpg$', @@ -50,7 +49,7 @@ class NowTVIE(InfoExtractor): 'duration': 2641, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -59,7 +58,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '165780', 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Hals- und Beinbruch', 'description': 'md5:b50d248efffe244e6f56737f0911ca57', 'thumbnail': 're:^https?://.*\.jpg$', @@ -68,7 +67,7 @@ class NowTVIE(InfoExtractor): 'duration': 2742, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -77,7 +76,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '99205', 'display_id': 'medicopter-117/angst', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Angst!', 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', 'thumbnail': 're:^https?://.*\.jpg$', @@ -86,7 +85,7 @@ class NowTVIE(InfoExtractor): 'duration': 3025, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -95,7 +94,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203521', 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', 'thumbnail': 're:^https?://.*\.jpg$', @@ -104,7 +103,7 @@ class NowTVIE(InfoExtractor): 'duration': 1083, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -113,7 +112,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '128953', 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', - 'ext': 'mp4', + 'ext': 'flv', 'title': "Büro-Fall / Chihuahua 'Joel'", 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', 'thumbnail': 're:^https?://.*\.jpg$', @@ -122,15 +121,19 @@ class NowTVIE(InfoExtractor): 'duration': 3092, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, + }, { + 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', + 'only_matching': True, + }, { + 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - station = mobj.group('station') + display_id = self._match_id(url) info = self._download_json( 'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, @@ -148,29 +151,19 @@ class NowTVIE(InfoExtractor): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - f = info.get('format', {}) - station = f.get('station') or station - - STATIONS = { - 'rtl': 'rtlnow', - 'rtl2': 'rtl2now', - 'vox': 'voxnow', - 'nitro': 'rtlnitronow', - 'ntv': 'n-tvnow', - 'superrtl': 'superrtlnow' - } - formats = [] for item in files['items']: - item_path = remove_start(item['path'], '/') - tbr = int_or_none(item['bitrate']) - m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) - m3u8_url = m3u8_url.replace('now/', 'now/videos/') + if determine_ext(item['path']) != 'f4v': + continue + app, play_path = remove_start(item['path'], '/').split('/', 1) formats.append({ - 'url': m3u8_url, - 'format_id': '%s-%sk' % (item['id'], tbr), - 'ext': 'mp4', - 'tbr': tbr, + 'url': 'rtmpe://fms.rtl.de', + 'app': app, + 'play_path': 'mp4:%s' % play_path, + 'ext': 'flv', + 'page_url': url, + 'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf', + 'tbr': int_or_none(item.get('bitrate')), }) self._sort_formats(formats) @@ -178,6 +171,8 @@ class NowTVIE(InfoExtractor): description = info.get('articleLong') or info.get('articleShort') timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') duration = parse_duration(info.get('duration')) + + f = info.get('format', {}) thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') return { diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 0b7886840..7b0cdc41a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) + video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage))) if webpage.find('"encrypted":true') != -1: password = compat_urllib_parse_unquote_plus( self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) @@ -94,7 +94,7 @@ class PornHubIE(InfoExtractor): format = path.split('/')[5].split('_')[:2] format = "-".join(format) - m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format) + m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format) if m is None: height = None tbr = None diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e0c530d64..543d94417 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -82,16 +82,21 @@ class RtlNlIE(InfoExtractor): meta = info.get('meta', {}) - # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) - # NB: nowadays, recent ffmpeg and avconv can handle these encrypted streams, so - # this adaptive -> flash workaround is not required in general, but it also - # allows bypassing georestriction therefore is retained for now. - videopath = material['videopath'].replace('/adaptive/', '/flash/') + # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. + # To workaround this previously adaptive -> flash trick was used to obtain + # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) + # and bypass georestrictions as well. + # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore + # unusable albeit can be fixed by simple string replacement (see + # https://github.com/rg3/youtube-dl/pull/6337) + # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted + # streams are used now. + videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') - video_urlpart = videopath.split('/flash/')[1][:-5] + video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' formats.extend([ diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 9fbe239d8..12639f08b 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -19,7 +19,16 @@ from ..utils import ( class RTSIE(InfoExtractor): IE_DESC = 'RTS.ch' - _VALID_URL = r'https?://(?:www\.)?rts\.ch/(?:(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+))' + _VALID_URL = r'''(?x) + (?: + rts:(?P<rts_id>\d+)| + https?:// + (?:www\.)?rts\.ch/ + (?: + (?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html| + play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+) + ) + )''' _TESTS = [ { @@ -123,6 +132,15 @@ class RTSIE(InfoExtractor): }, }, { + # article with videos on rhs + 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', + 'info_dict': { + 'id': '6693917', + 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', + }, + 'playlist_mincount': 5, + }, + { 'url': 'http://www.rts.ch/play/tv/le-19h30/video/le-chantier-du-nouveau-parlement-vaudois-a-permis-une-trouvaille-historique?id=6348280', 'only_matching': True, } @@ -130,7 +148,7 @@ class RTSIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) - video_id = m.group('id') or m.group('id_new') + video_id = m.group('rts_id') or m.group('id') or m.group('id_new') display_id = m.group('display_id') or m.group('display_id_new') def download_json(internal_id): @@ -143,6 +161,15 @@ class RTSIE(InfoExtractor): # video_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: page = self._download_webpage(url, display_id) + + # article with videos on rhs + videos = re.findall( + r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:rts:video:(\d+)"', + page) + if videos: + entries = [self.url_result('rts:%s' % video_urn, 'RTS') for video_urn in videos] + return self.playlist_result(entries, video_id, self._og_search_title(page)) + internal_id = self._html_search_regex( r'<(?:video|audio) data-id="([0-9]+)"', page, 'internal video id') diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index d1ab66b32..3bc84989e 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -1,12 +1,11 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + js_to_json, ) @@ -22,59 +21,48 @@ class ScreenwaveMediaIE(InfoExtractor): video_id = self._match_id(url) playerdata = self._download_webpage( - 'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id, + 'http://player.screenwavemedia.com/player.php?id=%s' % video_id, video_id, 'Downloading player webpage') vidtitle = self._search_regex( r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') - vidurl = self._search_regex( - r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/') - - videolist_url = None - - mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata) - if mobj: - videoserver = mobj.group('videoserver') - mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata) - vidid = mobj.group('vidid') if mobj else video_id - videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) - else: - mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata) - if mobj: - videolist_url = mobj.group('smil') - - if videolist_url: - videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') - formats = [] - baseurl = vidurl[:vidurl.rfind('/') + 1] - for video in videolist.findall('.//video'): - src = video.get('src') - if not src: - continue - file_ = src.partition(':')[-1] - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - bitrate = int_or_none(video.get('system-bitrate'), scale=1000) - format = { - 'url': baseurl + file_, - 'format_id': src.rpartition('.')[0].rpartition('_')[-1], - } - if width or height: - format.update({ - 'tbr': bitrate, - 'width': width, - 'height': height, - }) - else: - format.update({ - 'abr': bitrate, - 'vcodec': 'none', - }) - formats.append(format) - else: - formats = [{ - 'url': vidurl, - }] + + playerconfig = self._download_webpage( + 'http://player.screenwavemedia.com/player.js', + video_id, 'Downloading playerconfig webpage') + + videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver') + + sources = self._parse_json( + js_to_json( + self._search_regex( + r"sources\s*:\s*(\[[^\]]+?\])", playerconfig, + 'sources', + ).replace( + "' + thisObj.options.videoserver + '", + videoserver + ).replace( + "' + playerVidId + '", + video_id + ) + ), + video_id + ) + + formats = [] + for source in sources: + if source['type'] == 'hls': + formats.extend(self._extract_m3u8_formats(source['file'], video_id)) + else: + format_label = source.get('label') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_label, 'height', default=None)) + formats.append({ + 'url': source['file'], + 'format': format_label, + 'ext': source.get('type'), + 'height': height, + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 118ca4832..6ce86cbcd 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ - (?!sets/|(?:likes|tracks)/?(?:$|[?#])) + (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -282,69 +282,150 @@ class SoundcloudSetIE(SoundcloudIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']] + return { '_type': 'playlist', - 'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']], + 'entries': entries, 'id': '%s' % info['id'], 'title': info['title'], } class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?soundcloud\.com/ + (?P<user>[^/]+) + (?:/ + (?P<rsrc>tracks|sets|reposts|likes|spotlight) + )? + /?(?:[?#].*)?$ + ''' IE_NAME = 'soundcloud:user' _TESTS = [{ - 'url': 'https://soundcloud.com/the-concept-band', + 'url': 'https://soundcloud.com/the-akashic-chronicler', 'info_dict': { - 'id': '9615865', - 'title': 'The Royal Concept', + 'id': '114582580', + 'title': 'The Akashic Chronicler (All)', }, - 'playlist_mincount': 12 + 'playlist_mincount': 112, }, { - 'url': 'https://soundcloud.com/the-concept-band/likes', + 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', 'info_dict': { - 'id': '9615865', - 'title': 'The Royal Concept', + 'id': '114582580', + 'title': 'The Akashic Chronicler (Tracks)', }, - 'playlist_mincount': 1, + 'playlist_mincount': 50, }, { - 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', - 'only_matching': True, + 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Playlists)', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Reposts)', + }, + 'playlist_mincount': 9, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Likes)', + }, + 'playlist_mincount': 333, + }, { + 'url': 'https://soundcloud.com/grynpyret/spotlight', + 'info_dict': { + 'id': '7098329', + 'title': 'Grynpyret (Spotlight)', + }, + 'playlist_mincount': 1, }] + _API_BASE = 'https://api.soundcloud.com' + _API_V2_BASE = 'https://api-v2.soundcloud.com' + + _BASE_URL_MAP = { + 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % _API_BASE, + 'sets': '%s/users/%%s/playlists' % _API_V2_BASE, + 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE, + 'likes': '%s/users/%%s/likes' % _API_V2_BASE, + 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE, + } + + _TITLE_MAP = { + 'all': 'All', + 'tracks': 'Tracks', + 'sets': 'Playlists', + 'reposts': 'Reposts', + 'likes': 'Likes', + 'spotlight': 'Spotlight', + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group('user') - resource = mobj.group('rsrc') - if resource is None: - resource = 'tracks' - elif resource == 'likes': - resource = 'favorites' url = 'http://soundcloud.com/%s/' % uploader resolv_url = self._resolv_url(url) user = self._download_json( resolv_url, uploader, 'Downloading user info') - base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource) + + resource = mobj.group('rsrc') or 'all' + base_url = self._BASE_URL_MAP[resource] % user['id'] + + next_href = None entries = [] for i in itertools.count(): - data = compat_urllib_parse.urlencode({ - 'offset': i * 50, - 'limit': 50, - 'client_id': self._CLIENT_ID, - }) - new_entries = self._download_json( - base_url + data, uploader, 'Downloading track page %s' % (i + 1)) - if len(new_entries) == 0: + if not next_href: + data = compat_urllib_parse.urlencode({ + 'offset': i * 50, + 'limit': 50, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': '1', + 'representation': 'speedy', + }) + next_href = base_url + '?' + data + + response = self._download_json( + next_href, uploader, 'Downloading track page %s' % (i + 1)) + + collection = response['collection'] + + if not collection: self.to_screen('%s: End page received' % uploader) break - entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries) + + def resolve_permalink_url(candidates): + for cand in candidates: + if isinstance(cand, dict): + permalink_url = cand.get('permalink_url') + if permalink_url and permalink_url.startswith('http'): + return permalink_url + + for e in collection: + permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) + if permalink_url: + entries.append(self.url_result(permalink_url)) + + if 'next_href' in response: + next_href = response['next_href'] + if not next_href: + break + else: + next_href = None return { '_type': 'playlist', 'id': compat_str(user['id']), - 'title': user['username'], + 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]), 'entries': entries, } @@ -379,9 +460,7 @@ class SoundcloudPlaylistIE(SoundcloudIE): data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') - entries = [ - self._extract_info_dict(t, quiet=True, secret_token=token) - for t in data['tracks']] + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']] return { '_type': 'playlist', diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 7fb165a87..87b650468 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -45,6 +45,14 @@ class SouthParkDeIE(SouthParkIE): 'title': 'The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', }, + }, { + # non-ASCII characters in initial URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', + 'playlist_count': 4, + }, { + # non-ASCII characters in redirect URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09', + 'playlist_count': 4, }] diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index b868241d5..5bd3c0087 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -9,7 +9,7 @@ from .spiegeltv import SpiegeltvIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '2c2754212136f35fb4b19767d242f66e', @@ -39,6 +39,9 @@ class SpiegelIE(InfoExtractor): 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', } + }, { + 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index c89de5ba4..84fe71aef 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -29,6 +29,8 @@ class TudouIE(InfoExtractor): } }] + _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' + def _url_for_id(self, id, quality=None): info_url = "http://v2.tudou.com/f?id=" + str(id) if quality: @@ -54,6 +56,10 @@ class TudouIE(InfoExtractor): thumbnail_url = self._search_regex( r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) + player_url = self._search_regex( + r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", + webpage, 'player URL', default=self._PLAYER_URL) + segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') segments = json.loads(segs_json) # It looks like the keys are the arguments that have to be passed as @@ -76,6 +82,9 @@ class TudouIE(InfoExtractor): 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, + 'http_headers': { + 'Referer': player_url, + }, } result.append(part_info) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 73ce335b7..a2b6a35aa 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -7,12 +7,15 @@ import random from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_str, compat_urllib_parse, + compat_urllib_parse_urlparse, compat_urllib_request, ) from ..utils import ( ExtractorError, + parse_duration, parse_iso8601, ) @@ -185,7 +188,7 @@ class TwitchVodIE(TwitchItemBaseIE): _ITEM_SHORTCUT = 'v' _TEST = { - 'url': 'http://www.twitch.tv/riotgames/v/6528877', + 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', 'info_dict': { 'id': 'v6528877', 'ext': 'mp4', @@ -197,6 +200,7 @@ class TwitchVodIE(TwitchItemBaseIE): 'uploader': 'Riot Games', 'uploader_id': 'riotgames', 'view_count': int, + 'start_time': 310, }, 'params': { # m3u8 download @@ -216,6 +220,12 @@ class TwitchVodIE(TwitchItemBaseIE): item_id, 'mp4') self._prefer_source(formats) info['formats'] = formats + + parsed_url = compat_urllib_parse_urlparse(url) + query = compat_parse_qs(parsed_url.query) + if 't' in query: + info['start_time'] = parse_duration(query['t'][0]) + return info diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index e0b55078b..157bb74fe 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -22,6 +22,27 @@ class VidmeIE(InfoExtractor): 'timestamp': 1406313244, 'upload_date': '20140725', 'thumbnail': 're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + }, + }, { + # tests uploader field + 'url': 'https://vid.me/4Iib', + 'info_dict': { + 'id': '4Iib', + 'ext': 'mp4', + 'title': 'The Carver', + 'description': 'md5:e9c24870018ae8113be936645b93ba3c', + 'duration': 97.859999999999999, + 'timestamp': 1433203629, + 'upload_date': '20150602', + 'uploader': 'Thomas', + 'thumbnail': 're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, }, }, { # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching @@ -40,16 +61,23 @@ class VidmeIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage, default='') thumbnail = self._og_search_thumbnail(webpage) - timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False)) - width = int_or_none(self._og_search_property('video:width', webpage, fatal=False)) - height = int_or_none(self._og_search_property('video:height', webpage, fatal=False)) + timestamp = int_or_none(self._og_search_property( + 'updated_time', webpage, fatal=False)) + width = int_or_none(self._og_search_property( + 'video:width', webpage, fatal=False)) + height = int_or_none(self._og_search_property( + 'video:height', webpage, fatal=False)) duration = float_or_none(self._html_search_regex( r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) view_count = str_to_int(self._html_search_regex( - r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) + r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', + webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', webpage, 'like count', fatal=False)) + uploader = self._html_search_regex( + 'class="video_author_username"[^>]*>([^<]+)', + webpage, 'uploader', default=None) return { 'id': video_id, @@ -63,4 +91,5 @@ class VidmeIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'like_count': like_count, + 'uploader': uploader, } diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 6ef36290b..cda02ba24 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -5,11 +5,13 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, + compat_urllib_parse_unquote, ) from ..utils import ( determine_ext, int_or_none, parse_iso8601, + HEADRequest, ) @@ -62,7 +64,6 @@ class ViewsterIE(InfoExtractor): }] _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA==' def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): request = compat_urllib_request.Request(url) @@ -72,6 +73,10 @@ class ViewsterIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + # Get 'api_token' cookie + self._request_webpage(HEADRequest(url), video_id) + cookies = self._get_cookies(url) + self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value) info = self._download_json( 'https://public-api.viewster.com/search/%s' % video_id, diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index b4ad513a0..97315750f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..utils import ( - ExtractorError, unified_strdate, str_to_int, int_or_none, @@ -22,7 +21,7 @@ class XHamsterIE(InfoExtractor): 'ext': 'mp4', 'title': 'FemaleAgent Shy beauty takes the bait', 'upload_date': '20121014', - 'uploader_id': 'Ruseful2011', + 'uploader': 'Ruseful2011', 'duration': 893, 'age_limit': 18, } @@ -34,7 +33,7 @@ class XHamsterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Britney Spears Sexy Booty', 'upload_date': '20130914', - 'uploader_id': 'jojo747400', + 'uploader': 'jojo747400', 'duration': 200, 'age_limit': 18, } @@ -46,12 +45,12 @@ class XHamsterIE(InfoExtractor): ] def _real_extract(self, url): - def extract_video_url(webpage): - mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage) - if mp4 is None: - raise ExtractorError('Unable to extract media URL') - else: - return mp4.group(1) + def extract_video_url(webpage, name): + return self._search_regex( + [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', + r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', + r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], + webpage, name, group='mp4') def is_hd(webpage): return '<div class=\'icon iconHD\'' in webpage @@ -75,10 +74,14 @@ class XHamsterIE(InfoExtractor): if upload_date: upload_date = unified_strdate(upload_date) - uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', - webpage, 'uploader id', default='anonymous') + uploader = self._html_search_regex( + r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)", + webpage, 'uploader', default='anonymous') - thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False) + thumbnail = self._search_regex( + [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', + r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], + webpage, 'thumbnail', fatal=False, group='thumbnail') duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>', webpage, 'duration', fatal=False)) @@ -97,7 +100,9 @@ class XHamsterIE(InfoExtractor): hd = is_hd(webpage) - video_url = extract_video_url(webpage) + format_id = 'hd' if hd else 'sd' + + video_url = extract_video_url(webpage, format_id) formats = [{ 'url': video_url, 'format_id': 'hd' if hd else 'sd', @@ -108,7 +113,7 @@ class XHamsterIE(InfoExtractor): mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url') webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') if is_hd(webpage): - video_url = extract_video_url(webpage) + video_url = extract_video_url(webpage, 'hd') formats.append({ 'url': video_url, 'format_id': 'hd', @@ -122,7 +127,7 @@ class XHamsterIE(InfoExtractor): 'title': title, 'description': description, 'upload_date': upload_date, - 'uploader_id': uploader_id, + 'uploader': uploader, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 323681960..67a1df9a0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -19,6 +19,7 @@ from ..compat import ( compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, compat_str, @@ -31,9 +32,12 @@ from ..utils import ( get_element_by_id, int_or_none, orderedSet, + parse_duration, + smuggle_url, str_to_int, unescapeHTML, unified_strdate, + unsmuggle_url, uppercase_escape, ISO3166Utils, ) @@ -279,13 +283,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -295,11 +299,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -317,7 +321,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc', + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', @@ -327,8 +331,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, + 'start_time': 1, + 'end_time': 9, } }, { @@ -339,7 +346,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', + 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', + 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', } @@ -554,6 +564,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'format': '135', # bestvideo } }, + { + # Multifeed videos (multiple cameras), URL is for Main Camera + 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', + 'info_dict': { + 'id': 'jqWvoWXjCVs', + 'title': 'teamPGP: Rocket League Noob Stream', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'jqWvoWXjCVs', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }, { + 'info_dict': { + 'id': '6h8e8xoXJzg', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }, { + 'info_dict': { + 'id': 'PUOgX5z9xZw', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }, { + 'info_dict': { + 'id': 'teuwxikvS5k', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (zim)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }], + 'params': { + 'skip_download': True, + }, + } ] def __init__(self, *args, **kwargs): @@ -885,10 +948,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return formats def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + proto = ( 'http' if self._downloader.params.get('prefer_insecure', False) else 'https') + start_time = None + end_time = None + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + if start_time is None and 't' in query: + start_time = parse_duration(query['t'][0]) + if start_time is None and 'start' in query: + start_time = parse_duration(query['start'][0]) + if end_time is None and 'end' in query: + end_time = parse_duration(query['end'][0]) + # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: @@ -977,7 +1054,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'reason' in video_info: if 'The uploader has not made this video available in your country.' in video_info['reason']: regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) - if regions_allowed is not None: + if regions_allowed: raise ExtractorError('YouTube said: This video is available in %s only' % ( ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), expected=True) @@ -989,6 +1066,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + # title + if 'title' in video_info: + video_title = video_info['title'][0] + else: + self._downloader.report_warning('Unable to extract video title') + video_title = '_' + + # description + video_description = get_element_by_id("eow-description", video_webpage) + if video_description: + video_description = re.sub(r'''(?x) + <a\s+ + (?:[a-zA-Z-]+="[^"]+"\s+)*? + title="([^"]+)"\s+ + (?:[a-zA-Z-]+="[^"]+"\s+)*? + class="yt-uix-redirect-link"\s*> + [^<]+ + </a> + ''', r'\1', video_description) + video_description = clean_html(video_description) + else: + fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) + if fd_mobj: + video_description = unescapeHTML(fd_mobj.group(1)) + else: + video_description = '' + + if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): + if not self._downloader.params.get('noplaylist'): + entries = [] + feed_ids = [] + multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0]) + for feed in multifeed_metadata_list.split(','): + feed_data = compat_parse_qs(feed) + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + }) + feed_ids.append(feed_data['id'][0]) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result(entries, video_id, video_title, video_description) + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + if 'view_count' in video_info: view_count = int(video_info['view_count'][0]) else: @@ -1014,13 +1140,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self._downloader.report_warning('unable to extract uploader nickname') - # title - if 'title' in video_info: - video_title = video_info['title'][0] - else: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - # thumbnail image # We try first to get a high quality image: m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', @@ -1056,25 +1175,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None - # description - video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - video_description = re.sub(r'''(?x) - <a\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? - title="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? - class="yt-uix-redirect-link"\s*> - [^<]+ - </a> - ''', r'\1', video_description) - video_description = clean_html(video_description) - else: - fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) - if fd_mobj: - video_description = unescapeHTML(fd_mobj.group(1)) - else: - video_description = '' + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] def _extract_count(count_name): return str_to_int(self._search_regex( @@ -1244,6 +1347,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, + 'tags': video_tags, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, 'duration': video_duration, @@ -1256,6 +1360,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), 'formats': formats, 'is_live': is_live, + 'start_time': start_time, + 'end_time': end_time, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ae813099d..e265c7574 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -139,21 +139,24 @@ def write_json_file(obj, fn): if sys.version_info >= (2, 7): - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z-]+$', key) - assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) - expr = xpath + "[@%s='%s']" % (key, val) + if val: + assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) + expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): # Here comes the crazy part: In 2.6, if the xpath is a unicode, # .//node does not match if a node is a direct child of . ! if isinstance(xpath, compat_str): xpath = xpath.encode('ascii') for f in node.findall(xpath): - if f.attrib.get(key) == val: + if key not in f.attrib: + continue + if val is None or f.attrib.get(key) == val: return f return None @@ -576,11 +579,9 @@ class ContentTooShortError(Exception): download is too small for what the server announced first, indicating the connection was probably interrupted. """ - # Both in bytes - downloaded = None - expected = None def __init__(self, downloaded, expected): + # Both in bytes self.downloaded = downloaded self.expected = expected @@ -650,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + url_escaped, data=req.data, headers=req.headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + new_req.timeout = req.timeout + req = new_req + for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 # The dict keys are capitalized because of this bug by urllib @@ -694,6 +715,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 + if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + if sys.version_info >= (3, 0): + location = location.encode('iso-8859-1').decode('utf-8') + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + resp.headers['Location'] = location_escaped return resp https_request = http_request diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 280afdd7f..9f209499c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.21' +__version__ = '2015.08.06.1' |