diff options
Diffstat (limited to 'youtube_dl')
211 files changed, 8270 insertions, 3801 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 033b50702..bfb4ff225 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -26,6 +26,8 @@ import tokenize import traceback import random +from string import ascii_letters + from .compat import ( compat_basestring, compat_cookiejar, @@ -58,6 +60,7 @@ from .utils import ( format_bytes, formatSeconds, GeoRestrictedError, + int_or_none, ISO3166Utils, locked_file, make_HTTPS_handler, @@ -303,6 +306,17 @@ class YoutubeDL(object): postprocessor. """ + _NUMERIC_FIELDS = set(( + 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', + 'timestamp', 'upload_year', 'upload_month', 'upload_day', + 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', + 'average_rating', 'comment_count', 'age_limit', + 'start_time', 'end_time', + 'chapter_number', 'season_number', 'episode_number', + 'track_number', 'disc_number', 'release_year', + 'playlist_index', + )) + params = None _ies = [] _pps = [] @@ -371,10 +385,10 @@ class YoutubeDL(object): else: raise - if (sys.version_info >= (3,) and sys.platform != 'win32' and + if (sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and not params.get('restrictfilenames', False)): - # On Python 3, the Unicode filesystem API will throw errors (#1474) + # Unicode filesystem API will throw errors (#1474, #13027) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' 'cannot encode all characters. 
' @@ -499,24 +513,25 @@ class YoutubeDL(object): def to_console_title(self, message): if not self.params.get('consoletitle', False): return - if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): - # c_wchar_p() might not be necessary if `message` is - # already of type unicode() - ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) + if compat_os_name == 'nt': + if ctypes.windll.kernel32.GetConsoleWindow(): + # c_wchar_p() might not be necessary if `message` is + # already of type unicode() + ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) elif 'TERM' in os.environ: self._write_string('\033]0;%s\007' % message, self._screen_file) def save_console_title(self): if not self.params.get('consoletitle', False): return - if 'TERM' in os.environ: + if compat_os_name != 'nt' and 'TERM' in os.environ: # Save the title on stack self._write_string('\033[22;0t', self._screen_file) def restore_console_title(self): if not self.params.get('consoletitle', False): return - if 'TERM' in os.environ: + if compat_os_name != 'nt' and 'TERM' in os.environ: # Restore the title from stack self._write_string('\033[23;0t', self._screen_file) @@ -639,22 +654,11 @@ class YoutubeDL(object): r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')], outtmpl) - NUMERIC_FIELDS = set(( - 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', - 'timestamp', 'upload_year', 'upload_month', 'upload_day', - 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', - 'average_rating', 'comment_count', 'age_limit', - 'start_time', 'end_time', - 'chapter_number', 'season_number', 'episode_number', - 'track_number', 'disc_number', 'release_year', - 'playlist_index', - )) - # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since # string 'NA' is returned for missing fields. 
We will patch output # template for missing fields to meet string presentation type. - for numeric_field in NUMERIC_FIELDS: + for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type @@ -673,7 +677,19 @@ class YoutubeDL(object): FORMAT_RE.format(numeric_field), r'%({0})s'.format(numeric_field), outtmpl) - filename = expand_path(outtmpl % template_dict) + # expand_path translates '%%' into '%' and '$$' into '$' + # correspondingly that is not what we want since we need to keep + # '%%' intact for template dict substitution step. Working around + # with boundary-alike separator hack. + sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) + outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep)) + + # outtmpl should be expand_path'ed before template dict substitution + # because meta fields may contain env variables we don't want to + # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and + # title "Hello $PATH", we don't want `$PATH` to be expanded. 
+ filename = expand_path(outtmpl).replace(sep, '') % template_dict + # Temporary fix for #4787 # 'Treat' all problem characters by passing filename through preferredencoding # to workaround encoding issues with subprocess on python2 @ Windows @@ -845,7 +861,7 @@ class YoutubeDL(object): force_properties = dict( (k, v) for k, v in ie_result.items() if v is not None) - for f in ('_type', 'url', 'ie_key'): + for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'): if f in force_properties: del force_properties[f] new_result = info.copy() @@ -1049,6 +1065,25 @@ class YoutubeDL(object): return op(actual_value, comparison_value) return _filter + def _default_format_spec(self, info_dict, download=True): + req_format_list = [] + + def can_have_partial_formats(): + if self.params.get('simulate', False): + return True + if not download: + return True + if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-': + return False + if info_dict.get('is_live'): + return False + merger = FFmpegMergerPP(self) + return merger.available and merger.can_merge() + if can_have_partial_formats(): + req_format_list.append('bestvideo+bestaudio') + req_format_list.append('best') + return '/'.join(req_format_list) + def build_format_selector(self, format_spec): def syntax_error(note, start): message = ( @@ -1345,9 +1380,28 @@ class YoutubeDL(object): if 'title' not in info_dict: raise ExtractorError('Missing "title" field in extractor result') - if not isinstance(info_dict['id'], compat_str): - self.report_warning('"id" field is not a string - forcing string conversion') - info_dict['id'] = compat_str(info_dict['id']) + def report_force_conversion(field, field_not, conversion): + self.report_warning( + '"%s" field is not %s - forcing %s conversion, there is an error in extractor' + % (field, field_not, conversion)) + + def sanitize_string_field(info, string_field): + field = info.get(string_field) + if field is None or isinstance(field, compat_str): + return + 
report_force_conversion(string_field, 'a string', 'string') + info[string_field] = compat_str(field) + + def sanitize_numeric_fields(info): + for numeric_field in self._NUMERIC_FIELDS: + field = info.get(numeric_field) + if field is None or isinstance(field, compat_numeric_types): + continue + report_force_conversion(numeric_field, 'numeric', 'int') + info[numeric_field] = int_or_none(field) + + sanitize_string_field(info_dict, 'id') + sanitize_numeric_fields(info_dict) if 'playlist' not in info_dict: # It isn't part of a playlist @@ -1428,16 +1482,28 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + def is_wellformed(f): + url = f.get('url') + if not url: + self.report_warning( + '"url" field is missing or empty - skipping format, ' + 'there is an error in extractor') + return False + if isinstance(url, bytes): + sanitize_string_field(f, 'url') + return True + + # Filter out malformed formats for better extraction robustness + formats = list(filter(is_wellformed, formats)) + formats_dict = {} # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): - if 'url' not in format: - raise ExtractorError('Missing "url" key in result (index %d)' % i) - + sanitize_string_field(format, 'format_id') + sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) - - if format.get('format_id') is None: + if not format.get('format_id'): format['format_id'] = compat_str(i) else: # Sanitize format_id from characters used in format selector expression @@ -1490,14 +1556,10 @@ class YoutubeDL(object): req_format = self.params.get('format') if req_format is None: - req_format_list = [] - if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and - not info_dict.get('is_live')): - merger = FFmpegMergerPP(self) - if merger.available and merger.can_merge(): - req_format_list.append('bestvideo+bestaudio') - req_format_list.append('best') - req_format = '/'.join(req_format_list) + 
req_format = self._default_format_spec(info_dict, download=download) + if self.params.get('verbose'): + self.to_stdout('[debug] Default format spec: %s' % req_format) + format_selector = self.build_format_selector(req_format) # While in format selection we may need to have an access to the original @@ -1649,12 +1711,17 @@ class YoutubeDL(object): if filename is None: return - try: - dn = os.path.dirname(sanitize_path(encodeFilename(filename))) - if dn and not os.path.exists(dn): - os.makedirs(dn) - except (OSError, IOError) as err: - self.report_error('unable to create directory ' + error_to_compat_str(err)) + def ensure_dir_exists(path): + try: + dn = os.path.dirname(path) + if dn and not os.path.exists(dn): + os.makedirs(dn) + return True + except (OSError, IOError) as err: + self.report_error('unable to create directory ' + error_to_compat_str(err)) + return False + + if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): return if self.params.get('writedescription', False): @@ -1697,29 +1764,30 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - if sub_info.get('data') is not None: - sub_data = sub_info['data'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: - try: - sub_data = ie._download_webpage( - sub_info['url'], info_dict['id'], note=False) - except ExtractorError as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err.cause))) - continue - try: - sub_filename = subtitles_filename(filename, sub_lang, sub_format) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already_present' % 
(sub_lang, sub_format)) + self.to_screen('[info] Writing video subtitles to: ' + sub_filename) + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + except (OSError, IOError): + self.report_error('Cannot write subtitles file ' + sub_filename) + return else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - # Use newline='' to prevent conversion of newline characters - # See https://github.com/rg3/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_data) - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return + try: + sub_data = ie._request_webpage( + sub_info['url'], info_dict['id'], note=False).read() + with io.open(encodeFilename(sub_filename), 'wb') as subfile: + subfile.write(sub_data) + except (ExtractorError, IOError, OSError, ValueError) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, error_to_compat_str(err))) + continue if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) @@ -1792,8 +1860,11 @@ class YoutubeDL(object): for f in requested_formats: new_info = dict(info_dict) new_info.update(f) - fname = self.prepare_filename(new_info) - fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) + fname = prepend_extension( + self.prepare_filename(new_info), + 'f%s' % f['format_id'], new_info['ext']) + if not ensure_dir_exists(fname): + return downloaded.append(fname) partial_success = dl(fname, new_info) success = success and partial_success @@ -1860,7 +1931,7 @@ class YoutubeDL(object): info_dict.get('protocol') == 'm3u8' and 
self.params.get('hls_prefer_native')): if fixup_policy == 'warn': - self.report_warning('%s: malformated aac bitstream.' % ( + self.report_warning('%s: malformed AAC bitstream detected.' % ( info_dict['id'])) elif fixup_policy == 'detect_or_warn': fixup_pp = FFmpegFixupM3u8PP(self) @@ -1869,7 +1940,7 @@ class YoutubeDL(object): info_dict['__postprocessors'].append(fixup_pp) else: self.report_warning( - '%s: malformated aac bitstream. %s' + '%s: malformed AAC bitstream detected. %s' % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) else: assert fixup_policy in ('ignore', 'never') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 39527117f..9e4e13bcf 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2322,6 +2322,19 @@ try: except ImportError: # Python 2 from HTMLParser import HTMLParser as compat_HTMLParser +try: # Python 2 + from HTMLParser import HTMLParseError as compat_HTMLParseError +except ImportError: # Python <3.4 + try: + from html.parser import HTMLParseError as compat_HTMLParseError + except ImportError: # Python >3.4 + + # HTMLParseError has been deprecated in Python 3.3 and removed in + # Python 3.5. 
Introducing dummy exception for Python >3.5 for compatible + # and uniform cross-version exceptiong handling + class compat_HTMLParseError(Exception): + pass + try: from subprocess import DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL @@ -2604,14 +2617,22 @@ except ImportError: # Python 2 parsed_result[name] = [value] return parsed_result -try: - from shlex import quote as compat_shlex_quote -except ImportError: # Python < 3.3 + +compat_os_name = os._name if os.name == 'java' else os.name + + +if compat_os_name == 'nt': def compat_shlex_quote(s): - if re.match(r'^[-_\w./]+$', s): - return s - else: - return "'" + s.replace("'", "'\"'\"'") + "'" + return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') +else: + try: + from shlex import quote as compat_shlex_quote + except ImportError: # Python < 3.3 + def compat_shlex_quote(s): + if re.match(r'^[-_\w./]+$', s): + return s + else: + return "'" + s.replace("'", "'\"'\"'") + "'" try: @@ -2636,9 +2657,6 @@ def compat_ord(c): return ord(c) -compat_os_name = os._name if os.name == 'java' else os.name - - if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser @@ -2882,6 +2900,7 @@ else: __all__ = [ + 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 5d6621147..75b8166c5 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -8,10 +8,11 @@ import random from ..compat import compat_os_name from ..utils import ( + decodeArgument, encodeFilename, error_to_compat_str, - decodeArgument, format_bytes, + shell_quote, timeconvert, ) @@ -303,11 +304,11 @@ class FileDownloader(object): """Report attempt to resume at given byte.""" self.to_screen('[download] Resuming download at byte %s' % resume_len) - def report_retry(self, count, retries): + def report_retry(self, err, count, retries): """Report retry in case of 
HTTP error 5xx""" self.to_screen( - '[download] Got server HTTP error. Retrying (attempt %d of %s)...' - % (count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' + % (error_to_compat_str(err), count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" @@ -381,10 +382,5 @@ class FileDownloader(object): if exe is None: exe = os.path.basename(str_args[0]) - try: - import pipes - shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) - except ImportError: - shell_quote = repr self.to_screen('[debug] %s command line: %s' % ( exe, shell_quote(str_args))) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 7491fdad8..576ece6db 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .fragment import FragmentFD from ..compat import compat_urllib_error +from ..utils import urljoin class DashSegmentsFD(FragmentFD): @@ -12,12 +13,13 @@ class DashSegmentsFD(FragmentFD): FD_NAME = 'dashsegments' def real_download(self, filename, info_dict): - segments = info_dict['fragments'][:1] if self.params.get( + fragment_base_url = info_dict.get('fragment_base_url') + fragments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] ctx = { 'filename': filename, - 'total_frags': len(segments), + 'total_frags': len(fragments), } self._prepare_and_start_frag_download(ctx) @@ -26,7 +28,7 @@ class DashSegmentsFD(FragmentFD): skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) frag_index = 0 - for i, segment in enumerate(segments): + for i, fragment in enumerate(fragments): frag_index += 1 if frag_index <= ctx['fragment_index']: continue @@ -36,7 +38,11 @@ class DashSegmentsFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success, frag_content = 
self._download_fragment(ctx, segment['url'], info_dict) + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) if not success: return False self._append_fragment(ctx, frag_content) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index e78169a0d..db018fa89 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -212,6 +212,11 @@ class FFmpegFD(ExternalFD): args = [ffpp.executable, '-y'] + for log_level in ('quiet', 'verbose'): + if self.params.get(log_level, False): + args += ['-loglevel', log_level] + break + seekable = info_dict.get('_seekable') if seekable is not None: # setting -seekable prevents ffmpeg from guessing if the server diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index bccc8ecc1..6f6fb4a77 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -151,10 +151,15 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) + if ctx['fragment_index'] > 0 and resume_len == 0: + self.report_error( + 'Inconsistent state of incomplete fragment download. 
' + 'Restarting from the beginning...') + ctx['fragment_index'] = resume_len = 0 + self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) - if ctx['fragment_index'] > 0: - assert resume_len > 0 + assert ctx['fragment_index'] == 0 dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 0e29c8a2a..46308cf07 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -59,9 +59,9 @@ class HlsFD(FragmentFD): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) - manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read() - - s = manifest.decode('utf-8', 'ignore') + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + s = urlh.read().decode('utf-8', 'ignore') if not self.can_download(s, info_dict): if info_dict.get('extra_param_to_segment_url'): diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index af405b950..8a6638cc2 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -22,8 +22,16 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): url = info_dict['url'] - tmpfilename = self.temp_name(filename) - stream = None + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None # Do not include the Accept-Encoding header headers = {'Youtubedl-no-compression': 'True'} @@ -38,46 +46,51 @@ class HttpFD(FileDownloader): if is_test: request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) - # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): - resume_len = os.path.getsize(encodeFilename(tmpfilename)) - else: - resume_len = 0 - - open_mode = 'wb' - if 
resume_len != 0: - if self.params.get('continuedl', True): - self.report_resuming_byte(resume_len) - request.add_header('Range', 'bytes=%d-' % resume_len) - open_mode = 'ab' - else: - resume_len = 0 + ctx.open_mode = 'wb' + ctx.resume_len = 0 + + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) count = 0 retries = self.params.get('retries', 0) - while count <= retries: + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + def establish_connection(): + if ctx.resume_len != 0: + self.report_resuming_byte(ctx.resume_len) + request.add_header('Range', 'bytes=%d-' % ctx.resume_len) + ctx.open_mode = 'ab' # Establish connection try: - data = self.ydl.urlopen(request) + ctx.data = self.ydl.urlopen(request) # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if resume_len > 0: - content_range = data.headers.get('Content-Range') + if ctx.resume_len > 0: + content_range = ctx.data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) # Content-Range is present and matches requested Range, resume is possible - if content_range_m and resume_len == int(content_range_m.group(1)): - break + if content_range_m and ctx.resume_len == int(content_range_m.group(1)): + return # Content-Range is either not present or invalid. 
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: # Unexpected HTTP error @@ -86,15 +99,15 @@ class HttpFD(FileDownloader): # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header - data = self.ydl.urlopen(basic_request) - content_length = data.info()['Content-Length'] + ctx.data = self.ydl.urlopen(basic_request) + content_length = ctx.data.info()['Content-Length'] except (compat_urllib_error.HTTPError, ) as err: if err.code < 500 or err.code >= 600: raise else: # Examine the reported length if (content_length is not None and - (resume_len - 100 < int(content_length) < resume_len + 100)): + (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, @@ -102,152 +115,184 @@ class HttpFD(FileDownloader): # I decided to implement a suggested change and consider the file # completely downloaded if the file size differs less than 100 bytes from # the one in the hard drive. 
- self.report_file_already_downloaded(filename) - self.try_rename(tmpfilename, filename) + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) self._hook_progress({ - 'filename': filename, + 'filename': ctx.filename, 'status': 'finished', - 'downloaded_bytes': resume_len, - 'total_bytes': resume_len, + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, }) - return True + raise SucceedDownload() else: # The length does not match, we start the download over self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break - except socket.error as e: - if e.errno != errno.ECONNRESET: + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: # Connection reset is no problem, just retry raise + raise RetryDownload(err) + + def download(): + data_len = ctx.data.info().get('Content-length', None) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) + return False - # Retry - count += 1 - if count <= retries: - self.report_retry(count, retries) - - if count > retries: - self.report_error('giving up after %s retries' % retries) - return False - - data_len = data.info().get('Content-length', None) - - # Range HTTP header may be ignored/unsupported by a webserver - # (e.g. extractor/scivee.py, extractor/bambuser.py). - # However, for a test we still would like to download just a piece of a file. - # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control - # block size when downloading a file. - if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): - data_len = self._TEST_FILE_SIZE - - if data_len is not None: - data_len = int(data_len) + resume_len - min_data_len = self.params.get('min_filesize') - max_data_len = self.params.get('max_filesize') - if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) - return False - if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) - return False - - byte_counter = 0 + resume_len - block_size = self.params.get('buffersize', 1024) - start = time.time() + byte_counter = 0 + ctx.resume_len + block_size = self.params.get('buffersize', 1024) + start = time.time() - # measure time over whole while-loop, so slow_down() and best_block_size() work together properly - now = None # needed for slow_down() in the first loop run - before = start # start measuring - while True: + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring - # Download and write - data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - byte_counter += len(data_block) + def retry(e): + if ctx.tmpfilename != '-': + ctx.stream.close() + ctx.stream = None + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) - # exit loop when download is finished - if len(data_block) == 0: - break + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): + raise + retry(e) + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: + try: + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False + + if self.params.get('xattr_set_filesize', 
False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) - # Open destination file just in time - if stream is None: try: - (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) - assert stream is not None - filename = self.undo_temp_name(tmpfilename) - self.report_destination(filename) - except (OSError, IOError) as err: - self.report_error('unable to open for writing: %s' % str(err)) + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) return False - if self.params.get('xattr_set_filesize', False) and data_len is not None: - try: - write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) - except (XAttrUnavailableError, XAttrMetadataError) as err: - self.report_error('unable to set filesize xattr: %s' % str(err)) - - try: - stream.write(data_block) - except (IOError, OSError) as err: + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - start, + }) + + if is_test and byte_counter == data_len: + break + + if ctx.stream is None: 
self.to_stderr('\n') - self.report_error('unable to write data: %s' % str(err)) + self.report_error('Did not get any data blocks') return False + if ctx.tmpfilename != '-': + ctx.stream.close() - # Apply rate limit - self.slow_down(start, now, byte_counter - resume_len) + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err - # end measuring of one loop run - now = time.time() - after = now + self.try_rename(ctx.tmpfilename, ctx.filename) - # Adjust block size - if not self.params.get('noresizebuffer', False): - block_size = self.best_block_size(after - before, len(data_block)) - - before = after - - # Progress message - speed = self.calc_speed(start, now, byte_counter - resume_len) - if data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) self._hook_progress({ - 'status': 'downloading', 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'speed': speed, - 'elapsed': now - start, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - start, }) - if is_test and byte_counter == data_len: - break - - if stream is None: - self.to_stderr('\n') - self.report_error('Did not get any data blocks') - return False - if tmpfilename != '-': - stream.close() - - if data_len is not None and byte_counter != data_len: - raise ContentTooShortError(byte_counter, int(data_len)) - self.try_rename(tmpfilename, filename) - - # Update file modification time - if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) - - 
self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - start, - }) - - return True + return True + + while count <= retries: + try: + establish_connection() + download() + return True + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + continue + except SucceedDownload: + return True + + self.report_error('giving up after %s retries' % retries) + return False diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py index 5f6f9faef..9b001ecff 100644 --- a/youtube_dl/downloader/ism.py +++ b/youtube_dl/downloader/ism.py @@ -98,7 +98,7 @@ def write_piff_header(stream, params): if is_audio: smhd_payload = s88.pack(0) # balance - smhd_payload = u16.pack(0) # reserved + smhd_payload += u16.pack(0) # reserved media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header else: vmhd_payload = u16.pack(0) # graphics mode @@ -126,7 +126,6 @@ def write_piff_header(stream, params): if fourcc == 'AACL': sample_entry_box = box(b'mp4a', sample_entry_payload) else: - sample_entry_payload = sample_entry_payload sample_entry_payload += u16.pack(0) # pre defined sample_entry_payload += u16.pack(0) # reserved sample_entry_payload += u32.pack(0) * 3 # pre defined diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 0247cabf9..60f753b95 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, js_to_json, int_or_none, parse_iso8601, + try_get, ) @@ -124,7 +126,20 @@ class ABCIViewIE(InfoExtractor): title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - formats = 
self._extract_akamai_formats(stream['hds-unmetered'], video_id) + format_urls = [ + try_get(stream, lambda x: x['hds-unmetered'], compat_str)] + + # May have higher quality video + sd_url = try_get( + stream, lambda x: x['streams']['hds']['sd'], compat_str) + if sd_url: + format_urls.append(sd_url.replace('metered', 'um')) + + formats = [] + for format_url in format_urls: + if format_url: + formats.extend( + self._extract_akamai_formats(format_url, video_id)) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 4f56c4c11..f770fe901 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -7,12 +7,21 @@ import time from .amp import AMPIE from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_urlparse class AbcNewsVideoIE(AMPIE): IE_NAME = 'abcnews:video' - _VALID_URL = r'https?://abcnews\.go\.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + abcnews\.go\.com/ + (?: + [^/]+/video/(?P<display_id>[0-9a-z-]+)-| + video/embed\?.*?\bid= + ) + (?P<id>\d+) + ''' _TESTS = [{ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', @@ -30,6 +39,9 @@ class AbcNewsVideoIE(AMPIE): 'skip_download': True, }, }, { + 'url': 'http://abcnews.go.com/video/embed?id=46979033', + 'only_matching': True, + }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, }] @@ -97,9 +109,7 @@ class AbcNewsIE(InfoExtractor): r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') full_video_url = compat_urlparse.urljoin(url, video_url) - youtube_url = self._html_search_regex( - r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"', - webpage, 'YouTube URL', default=None) + youtube_url = YoutubeIE._extract_url(webpage) timestamp = None date_str = self._html_search_regex( @@ -129,7 +139,7 
@@ class AbcNewsIE(InfoExtractor): } if youtube_url: - entries = [entry, self.url_result(youtube_url, 'Youtube')] + entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] return self.playlist_result(entries) return entry diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py index 76e98132b..03b92a39c 100644 --- a/youtube_dl/extractor/abcotvs.py +++ b/youtube_dl/extractor/abcotvs.py @@ -22,7 +22,7 @@ class ABCOTVSIE(InfoExtractor): 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', 'title': 'East Bay museum celebrates vintage synthesizers', - 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10', + 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1421123075, 'upload_date': '20150113', diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 66caf6a81..cffdab6ca 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -15,6 +15,7 @@ from ..utils import ( intlist_to_bytes, srt_subtitles_timecode, strip_or_none, + urljoin, ) @@ -31,25 +32,28 @@ class ADNIE(InfoExtractor): 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', } } + _BASE_URL = 'http://animedigitalnetwork.fr' def _get_subtitles(self, sub_path, video_id): if not sub_path: return None enc_subtitles = self._download_webpage( - 'http://animedigitalnetwork.fr/' + sub_path, - video_id, fatal=False) + urljoin(self._BASE_URL, sub_path), + video_id, fatal=False, headers={ + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', + }) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(base64.b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'), + bytes_to_intlist(b'\x1b\xe0\x29\x61\x38\x94\x24\x00\x12\xbd\xc5\x80\xac\xce\xbe\xb0'), 
bytes_to_intlist(base64.b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( - dec_subtitles[:-compat_ord(dec_subtitles[-1])], + dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(), None, fatal=False) if not subtitles_json: return None @@ -103,9 +107,18 @@ class ADNIE(InfoExtractor): metas = options.get('metas') or {} title = metas.get('title') or video_info['title'] links = player_config.get('links') or {} + error = None + if not links: + links_url = player_config['linksurl'] + links_data = self._download_json(urljoin( + self._BASE_URL, links_url), video_id) + links = links_data.get('links') or {} + error = links_data.get('error') formats = [] for format_id, qualities in links.items(): + if not isinstance(qualities, dict): + continue for load_balancer_url in qualities.values(): load_balancer_data = self._download_json( load_balancer_url, video_id, fatal=False) or {} @@ -119,7 +132,8 @@ class ADNIE(InfoExtractor): for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - error = options.get('error') + if not error: + error = options.get('error') if not formats and error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 7da96c65c..b83b51efb 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -6,12 +6,16 @@ import time import xml.etree.ElementTree as etree from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_kwargs, + compat_urlparse, +) from ..utils import ( unescapeHTML, urlencode_postdata, unified_timestamp, ExtractorError, + NO_DEFAULT, ) @@ -21,6 +25,11 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'ATTOTT': { + 'name': 'DIRECTV NOW', + 'username_field': 'email', + 'password_field': 'loginpassword', + }, 'Rogers': { 'name': 'Rogers', 'username_field': 'UserName', 
@@ -36,6 +45,11 @@ MSO_INFO = { 'username_field': 'Ecom_User_ID', 'password_field': 'Ecom_Password', }, + 'Brighthouse': { + 'name': 'Bright House Networks | Spectrum', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, 'Charter_Direct': { 'name': 'Charter Spectrum', 'username_field': 'IDToken1', @@ -1308,11 +1322,14 @@ class AdobePassIE(InfoExtractor): _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' + _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' + def _download_webpage_handle(self, *args, **kwargs): headers = kwargs.get('headers', {}) headers.update(self.geo_verification_headers()) kwargs['headers'] = headers - return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs) + return super(AdobePassIE, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): @@ -1356,6 +1373,21 @@ class AdobePassIE(InfoExtractor): 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) + def extract_redirect_url(html, url=None, fatal=False): + # TODO: eliminate code duplication with generic extractor and move + # redirection code into _download_webpage_handle + REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' + redirect_url = self._search_regex( + r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' + r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, + html, 'meta refresh redirect', + default=NO_DEFAULT if fatal else None, fatal=fatal) + if not redirect_url: + return None + if url: + redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url)) + return redirect_url + mvpd_headers = { 'ap_42': 'anonymous', 'ap_11': 'Linux i686', @@ -1405,16 +1437,15 @@ class AdobePassIE(InfoExtractor): if '<form name="signin"' in provider_redirect_page: 
provider_login_page_res = provider_redirect_page_res elif 'http-equiv="refresh"' in provider_redirect_page: - oauth_redirect_url = self._html_search_regex( - r'content="0;\s*url=([^\'"]+)', - provider_redirect_page, 'meta refresh redirect') + oauth_redirect_url = extract_redirect_url( + provider_redirect_page, fatal=True) provider_login_page_res = self._download_webpage_handle( oauth_redirect_url, video_id, - 'Downloading Provider Login Page') + self._DOWNLOADING_LOGIN_PAGE) else: provider_login_page_res = post_form( provider_redirect_page_res, - 'Downloading Provider Login Page') + self._DOWNLOADING_LOGIN_PAGE) mvpd_confirm_page_res = post_form( provider_login_page_res, 'Logging in', { @@ -1461,8 +1492,17 @@ class AdobePassIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded' }) else: + # Some providers (e.g. DIRECTV NOW) have another meta refresh + # based redirect that should be followed. + provider_redirect_page, urlh = provider_redirect_page_res + provider_refresh_redirect_url = extract_redirect_url( + provider_redirect_page, url=urlh.geturl()) + if provider_refresh_redirect_url: + provider_redirect_page_res = self._download_webpage_handle( + provider_refresh_redirect_url, video_id, + 'Downloading Provider Redirect Page (meta refresh)') provider_login_page_res = post_form( - provider_redirect_page_res, 'Downloading Provider Login Page') + provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { mso_info.get('username_field', 'username'): username, mso_info.get('password_field', 'password'): password, diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 989505c82..acc4ce38d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -5,91 +5,52 @@ import re from .turner import TurnerBaseIE from ..utils import ( - ExtractorError, int_or_none, + strip_or_none, ) class AdultSwimIE(TurnerBaseIE): - _VALID_URL 
= r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?' + _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?' _TESTS = [{ 'url': 'http://adultswim.com/videos/rick-and-morty/pilot', - 'playlist': [ - { - 'md5': '247572debc75c7652f253c8daa51a14d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 1', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - { - 'md5': '77b0e037a4b20ec6b98671c4c379f48d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 4', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - ], 'info_dict': { 'id': 'rQxZvXQ4ROaSOqq-or2Mow', + 'ext': 'mp4', 'title': 'Rick and Morty - Pilot', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " + 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', + 'timestamp': 1493267400, + 'upload_date': '20170427', }, - 'skip': 'This video is only available for registered users', - }, { - 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', - 'playlist': [ - { - 'md5': '2eb5c06d0f9a1539da3718d897f13ec5', - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog-0', - 'ext': 'flv', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' 
- }, - } - ], - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' + 'params': { + # m3u8 download + 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', - 'playlist': [ - { - 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', - 'info_dict': { - 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', - 'ext': 'mp4', - 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', - }, - } - ], 'info_dict': { 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'ext': 'mp4', 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', + 'upload_date': '20080124', + 'timestamp': 1201150800, }, 'params': { # m3u8 download 'skip_download': True, - } + }, }, { - # heroMetadata.trailer 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', 'info_dict': { 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', 'ext': 'mp4', 'title': 'Decker - Inside Decker: A New Hero', - 'description': 'md5:c916df071d425d62d70c86d4399d3ee0', - 'duration': 249.008, + 'description': 'The guys recap the conclusion of the season. 
They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', + 'timestamp': 1469480460, + 'upload_date': '20160725', }, 'params': { # m3u8 download @@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE): }, 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/', + 'url': 'http://www.adultswim.com/videos/attack-on-titan', + 'info_dict': { + 'id': 'b7A69dzfRzuaXIECdxW8XQ', + 'title': 'Attack on Titan', + 'description': 'md5:6c8e003ea0777b47013e894767f5e114', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://www.adultswim.com/videos/streams/williams-stream', 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', + 'id': 'd8DEBj7QRfetLsRgFnGEyg', + 'ext': 'mp4', + 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': 'original programming', }, - 'playlist': [{ - 'md5': '', - 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'ext': 'mp4', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', - }, - }], 'params': { # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'], }] - @staticmethod - def find_video_info(collection, slug): - for video in collection.get('videos'): - if video.get('slug') == slug: - return video - - @staticmethod - def find_collection_by_linkURL(collections, linkURL): - for collection in collections: - if collection.get('linkURL') == linkURL: - return collection - - @staticmethod - def find_collection_containing_video(collections, slug): - for collection in collections: - for video in collection.get('videos'): - if video.get('slug') == slug: - return collection, video - return None, None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_path = 
mobj.group('show_path') - episode_path = mobj.group('episode_path') - is_playlist = True if mobj.group('is_playlist') else False - - webpage = self._download_webpage(url, episode_path) - - # Extract the value of `bootstrappedData` from the Javascript in the page. - bootstrapped_data = self._parse_json(self._search_regex( - r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path) - - # Downloading videos from a /videos/playlist/ URL needs to be handled differently. - # NOTE: We are only downloading one video (the current one) not the playlist - if is_playlist: - collections = bootstrapped_data['playlists']['collections'] - collection = self.find_collection_by_linkURL(collections, show_path) - video_info = self.find_video_info(collection, episode_path) - - show_title = video_info['showTitle'] - segment_ids = [video_info['videoPlaybackID']] + show_path, episode_path = re.match(self._VALID_URL, url).groups() + display_id = episode_path or show_path + webpage = self._download_webpage(url, display_id) + initial_data = self._parse_json(self._search_regex( + r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', + webpage, 'initial data'), display_id) + + is_stream = show_path == 'streams' + if is_stream: + if not episode_path: + episode_path = 'live-stream' + + video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path) + video_id = video_data.get('stream') + + if not video_id: + entries = [] + for episode in video_data.get('archiveEpisodes', []): + episode_url = episode.get('url') + if not episode_url: + continue + entries.append(self.url_result( + episode_url, 'AdultSwim', episode.get('id'))) + return self.playlist_result( + entries, video_data.get('id'), video_data.get('title'), + strip_or_none(video_data.get('description'))) else: - collections = bootstrapped_data['show']['collections'] - collection, video_info = self.find_collection_containing_video(collections, episode_path) - # Video wasn't found in 
the collections, let's try `slugged_video`. - if video_info is None: - if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: - video_info = bootstrapped_data['slugged_video'] - if not video_info: - video_info = bootstrapped_data.get( - 'heroMetadata', {}).get('trailer', {}).get('video') - if not video_info: - video_info = bootstrapped_data.get('onlineOriginals', [None])[0] - if not video_info: - raise ExtractorError('Unable to find video info') - - show = bootstrapped_data['show'] - show_title = show['title'] - stream = video_info.get('stream') - if stream and stream.get('videoPlaybackID'): - segment_ids = [stream['videoPlaybackID']] - elif video_info.get('clips'): - segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] - elif video_info.get('videoPlaybackID'): - segment_ids = [video_info['videoPlaybackID']] - elif video_info.get('id'): - segment_ids = [video_info['id']] - else: - if video_info.get('auth') is True: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. 
You may want to use --cookies.', expected=True) - else: - raise ExtractorError('Unable to find stream or clips') - - episode_id = video_info['id'] - episode_title = video_info['title'] - episode_description = video_info.get('description') - episode_duration = int_or_none(video_info.get('duration')) - view_count = int_or_none(video_info.get('views')) + show_data = initial_data['show'] + + if not episode_path: + entries = [] + for video in show_data.get('videos', []): + slug = video.get('slug') + if not slug: + continue + entries.append(self.url_result( + 'http://adultswim.com/videos/%s/%s' % (show_path, slug), + 'AdultSwim', video.get('id'))) + return self.playlist_result( + entries, show_data.get('id'), show_data.get('title'), + strip_or_none(show_data.get('metadata', {}).get('description'))) + + video_data = show_data['sluggedVideo'] + video_id = video_data['id'] + + info = self._extract_cvp_info( + 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id, + video_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', + 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', + }, + }, { + 'url': url, + 'site_name': 'AdultSwim', + 'auth_required': video_data.get('auth'), + }) - entries = [] - for part_num, segment_id in enumerate(segment_ids): - segement_info = self._extract_cvp_info( - 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, - segment_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', - 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', - }, - }) - segment_title = '%s - %s' % (show_title, episode_title) - if len(segment_ids) > 1: - segment_title += ' Part %d' % (part_num + 1) - segement_info.update({ - 'id': segment_id, - 'title': segment_title, - 'description': episode_description, + info.update({ + 'id': video_id, + 'display_id': 
display_id, + 'description': info.get('description') or strip_or_none(video_data.get('description')), + }) + if not is_stream: + info.update({ + 'duration': info.get('duration') or int_or_none(video_data.get('duration')), + 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')), + 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')), + 'episode': info['title'], + 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')), }) - entries.append(segement_info) - return { - '_type': 'playlist', - 'id': episode_id, - 'display_id': episode_path, - 'entries': entries, - 'title': '%s - %s' % (show_title, episode_title), - 'description': episode_description, - 'duration': episode_duration, - 'view_count': view_count, - } + info['series'] = video_data.get('collection_title') or info.get('series') + if info['series'] and info['series'] != info['title']: + info['title'] = '%s - %s' % (info['series'], info['title']) + + return info diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py new file mode 100644 index 000000000..6f241e683 --- /dev/null +++ b/youtube_dl/extractor/aliexpress.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, +) + + +class AliExpressLiveIE(InfoExtractor): + _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)' + _TEST = { + 'url': 'https://live.aliexpress.com/live/2800002704436634', + 'md5': 'e729e25d47c5e557f2630eaf99b740a5', + 'info_dict': { + 'id': '2800002704436634', + 'ext': 'mp4', + 'title': 'CASIMA7.22', + 'thumbnail': r're:http://.*\.jpg', + 'uploader': 'CASIMA Official Store', + 'timestamp': 1500717600, + 'upload_date': '20170722', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + 
data = self._parse_json( + self._search_regex( + r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var', + webpage, 'runParams'), + video_id) + + title = data['title'] + + formats = self._extract_m3u8_formats( + data['replyStreamUrl'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': data.get('coverUrl'), + 'uploader': try_get( + data, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(data.get('startTimeLong'), scale=1000), + 'formats': formats, + } diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 388e578d5..c68be3134 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,9 +4,9 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', 'info_dict': { 'id': '3792260579001', @@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor): }, 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', - } + }, { + 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 3a0ec6776..dd3b18d72 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .theplatform import ThePlatformIE from ..utils import ( - update_url_query, - parse_age_limit, int_or_none, + parse_age_limit, 
+ try_get, + update_url_query, ) @@ -68,7 +69,8 @@ class AMCNetworksIE(ThePlatformIE): info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] - rating = theplatform_metadata['ratings'][0]['rating'] + rating = try_get( + theplatform_metadata, lambda x: x['ratings'][0]['rating']) auth_required = self._search_regex( r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required') diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9e28f2579..69d363311 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,16 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, ExtractorError, - sanitized_Request, urlencode_postdata, + urljoin, ) @@ -21,6 +18,8 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' + # German-speaking countries of Europe + _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] _TESTS = [{ # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', @@ -46,6 +45,10 @@ class AnimeOnDemandIE(InfoExtractor): # Full length film, non-series, ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/185', 'only_matching': True, + }, { + # Flash videos + 'url': 'https://www.anime-on-demand.de/anime/12', + 'only_matching': True, }] def _login(self): @@ -72,14 +75,13 @@ class AnimeOnDemandIE(InfoExtractor): 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - 
request.add_header('Referer', self._LOGIN_URL) + post_url = urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), headers={ + 'Referer': self._LOGIN_URL, + }) if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( @@ -120,10 +122,11 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): + r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html): attributes = extract_attributes(input_) + title = attributes.get('data-dialog-header') playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist'): + for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): playlist_url = attributes.get(playlist_key) if isinstance(playlist_url, compat_str) and re.match( r'/?[\da-zA-Z]+', playlist_url): @@ -147,19 +150,38 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), + item_id_list = [] + if format_id: + item_id_list.append(format_id) + item_id_list.append('videomaterial') + playlist = self._download_json( + urljoin(url, playlist_url), video_id, + 'Downloading %s JSON' % ' '.join(item_id_list), headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-CSRF-Token': csrf_token, 'Referer': url, 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) - playlist = self._download_json( - request, video_id, 'Downloading %s playlist JSON' % format_id, - fatal=False) + }, fatal=False) if not playlist: continue + stream_url = playlist.get('streamurl') + if stream_url: + rtmp = re.search( + r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', + stream_url) 
+ if rtmp: + formats.append({ + 'url': rtmp.group('url'), + 'app': rtmp.group('app'), + 'play_path': rtmp.group('playpath'), + 'page_url': url, + 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', + 'rtmp_real_time': True, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + continue start_video = playlist.get('startvideo', 0) playlist = playlist.get('playlist') if not playlist or not isinstance(playlist, list): @@ -222,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor): f.update({ 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), + 'url': urljoin(url, m.group('href')), }) entries.append(f) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 025e29aa4..e394cb661 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, - HEADRequest, + int_or_none, + mimetype2ext, ) class AparatIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' _TEST = { 'url': 'http://www.aparat.com/v/wP8On', @@ -29,30 +29,41 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id - webpage = self._download_webpage(embed_url, video_id) - - file_list = self._parse_json(self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) - for i, item in enumerate(file_list[0]): - video_url = item['file'] - req = HEADRequest(video_url) - res = 
self._request_webpage( - req, video_id, note='Testing video URL %d' % i, errnote=False) - if res: - break - else: - raise ExtractorError('No working video URLs found') + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') + + file_list = self._parse_json( + self._search_regex( + r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, + 'file list'), + video_id) + + formats = [] + for item in file_list[0]: + file_url = item.get('file') + if not file_url: + continue + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': label or ext, + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', default=None)), + }) + self._sort_formats(formats) + thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) return { 'id': video_id, 'title': title, - 'url': video_url, - 'ext': 'mp4', 'thumbnail': thumbnail, 'age_limit': self._family_friendly_search(webpage), + 'formats': formats, } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2d5599456..3f248b147 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -93,6 +93,7 @@ class ARDMediathekIE(InfoExtractor): duration = int_or_none(media_info.get('_duration')) thumbnail = media_info.get('_previewImage') + is_live = media_info.get('_isLive') is True subtitles = {} subtitle_url = media_info.get('_subtitleUrl') @@ -106,6 +107,7 @@ class ARDMediathekIE(InfoExtractor): 'id': video_id, 'duration': duration, 'thumbnail': thumbnail, + 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, } @@ -166,9 +168,11 @@ class ARDMediathekIE(InfoExtractor): # determine video id from url m = re.match(self._VALID_URL, url) + document_id = None + numid = re.search(r'documentId=([0-9]+)', url) if numid: - 
video_id = numid.group(1) + document_id = video_id = numid.group(1) else: video_id = m.group('video_id') @@ -228,12 +232,16 @@ class ARDMediathekIE(InfoExtractor): 'formats': formats, } else: # request JSON file + if not document_id: + video_id = self._search_regex( + r'/play/(?:config|media)/(\d+)', webpage, 'media id') info = self._extract_media_info( - 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) + 'http://www.ardmediathek.de/play/media/%s' % video_id, + webpage, video_id) info.update({ 'id': video_id, - 'title': title, + 'title': self._live_title(title) if info.get('is_live') else title, 'description': description, 'thumbnail': thumbnail, }) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 56baef29d..5cde90c5b 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -9,12 +9,13 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + ExtractorError, find_xpath_attr, - unified_strdate, get_element_by_attribute, int_or_none, NO_DEFAULT, qualities, + unified_strdate, ) # There are different sources of video in arte.tv, the extraction process @@ -79,6 +80,13 @@ class ArteTVBaseIE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] + vsr = player_info['VSR'] + + if not vsr: + raise ExtractorError( + 'Video %s is not available' % player_info.get('VID') or video_id, + expected=True) + upload_date_str = player_info.get('shootingDate') if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] @@ -107,7 +115,7 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] - for format_id, format_dict in player_info['VSR'].items(): + for format_id, format_dict in vsr.items(): f = dict(format_dict) versionCode = f.get('versionCode') l = re.escape(langcode) diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py new 
file mode 100644 index 000000000..594c88c9c --- /dev/null +++ b/youtube_dl/extractor/asiancrush.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( + extract_attributes, + remove_end, + urlencode_postdata, +) + + +class AsianCrushIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' + _TESTS = [{ + 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', + 'md5': 'c3b740e48d0ba002a42c0b72857beae6', + 'info_dict': { + 'id': '1_y4tmjm5r', + 'ext': 'mp4', + 'title': 'Women Who Flirt', + 'description': 'md5:3db14e9186197857e7063522cb89a805', + 'timestamp': 1496936429, + 'upload_date': '20170608', + 'uploader_id': 'craig@crifkin.com', + }, + }, { + 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'https://www.asiancrush.com/wp-admin/admin-ajax.php', video_id, + data=urlencode_postdata({ + 'postid': video_id, + 'action': 'get_channel_kaltura_vars', + })) + + entry_id = data['entry_id'] + + return self.url_result( + 'kaltura:%s:%s' % (data['partner_id'], entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id, + video_title=data.get('vid_label')) + + +class AsianCrushPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/series/0+(?P<id>\d+)s\b' + _TEST = { + 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', + 'info_dict': { + 'id': '12481', + 'title': 'Scholar Who Walks the Night', + 'description': 'md5:7addd7c5132a09fd4741152d96cce886', + }, + 'playlist_count': 20, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [] + + for mobj in re.finditer( + 
r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) + + title = remove_end( + self._html_search_regex( + r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'<title>([^<]+)</title>', webpage, 'title', fatal=False), + ' | AsianCrush') + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index e48bb8972..393f381c6 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -43,7 +43,7 @@ class AudioBoomIE(InfoExtractor): def from_clip(field): if clip: - clip.get(field) + return clip.get(field) audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( 'audio', webpage, 'audio url') diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index df2972f26..be41bd5a2 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -14,14 +14,16 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + KNOWN_EXTENSIONS, parse_filesize, unescapeHTML, update_url_query, + unified_strdate, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' + _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', @@ -47,6 +49,7 @@ class 
BandcampIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) + thumbnail = self._html_search_meta('og:image', webpage, default=None) m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if not m_download: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) @@ -75,6 +78,7 @@ class BandcampIE(InfoExtractor): return { 'id': track_id, 'title': data['title'], + 'thumbnail': thumbnail, 'formats': formats, 'duration': float_or_none(data.get('duration')), } @@ -143,7 +147,7 @@ class BandcampIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'thumbnail': info.get('thumb_url'), + 'thumbnail': info.get('thumb_url') or thumbnail, 'uploader': info.get('artist'), 'artist': artist, 'track': track, @@ -153,7 +157,7 @@ class BandcampIE(InfoExtractor): class BandcampAlbumIE(InfoExtractor): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' 
_TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -220,6 +224,12 @@ class BandcampAlbumIE(InfoExtractor): 'playlist_count': 2, }] + @classmethod + def suitable(cls, url): + return (False + if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) + else super(BandcampAlbumIE, cls).suitable(url)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader_id = mobj.group('subdomain') @@ -232,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor): raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ - self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) + self.url_result( + compat_urlparse.urljoin(url, t_path), + ie=BandcampIE.ie_key(), + video_title=self._search_regex( + r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', + elem_content, 'track title', fatal=False)) for elem_content, t_path in track_elements if self._html_search_meta('duration', elem_content, default=None)] @@ -248,3 +263,92 @@ class BandcampAlbumIE(InfoExtractor): 'title': title, 'entries': entries, } + + +class BandcampWeeklyIE(InfoExtractor): + IE_NAME = 'Bandcamp:weekly' + _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://bandcamp.com/?show=224', + 'md5': 'b00df799c733cf7e0c567ed187dea0fd', + 'info_dict': { + 'id': '224', + 'ext': 'opus', + 'title': 'BC Weekly April 4th 2017 - Magic Moments', + 'description': 'md5:5d48150916e8e02d030623a48512c874', + 'duration': 5829.77, + 'release_date': '20170404', + 'series': 'Bandcamp Weekly', + 'episode': 'Magic Moments', + 'episode_number': 208, + 'episode_id': '224', + } + }, { + 'url': 'https://bandcamp.com/?blah/blah@&show=228', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + blob = self._parse_json( + self._search_regex( + 
r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, + 'blob', group='blob'), + video_id, transform_source=unescapeHTML) + + show = blob['bcw_show'] + + # This is desired because any invalid show id redirects to `bandcamp.com` + # which happens to expose the latest Bandcamp Weekly episode. + show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + + formats = [] + for format_id, format_url in show['audio_stream'].items(): + if not isinstance(format_url, compat_str): + continue + for known_ext in KNOWN_EXTENSIONS: + if known_ext in format_id: + ext = known_ext + break + else: + ext = None + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': ext, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = show.get('audio_title') or 'Bandcamp Weekly' + subtitle = show.get('subtitle') + if subtitle: + title += ' - %s' % subtitle + + episode_number = None + seq = blob.get('bcw_seq') + + if seq and isinstance(seq, list): + try: + episode_number = next( + int_or_none(e.get('episode_number')) + for e in seq + if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) + except StopIteration: + pass + + return { + 'id': video_id, + 'title': title, + 'description': show.get('desc') or show.get('short_desc'), + 'duration': float_or_none(show.get('audio_duration')), + 'is_live': False, + 'release_date': unified_strdate(show.get('published_date')), + 'series': 'Bandcamp Weekly', + 'episode': show.get('subtitle'), + 'episode_number': episode_number, + 'episode_id': compat_str(video_id), + 'formats': formats + } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index dd65b8d86..8b20c03d6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -6,14 +6,18 @@ import itertools from .common import InfoExtractor from ..utils import ( + clean_html, dict_get, ExtractorError, float_or_none, + get_element_by_class, int_or_none, parse_duration, parse_iso8601, try_get, unescapeHTML, + urlencode_postdata, + 
urljoin, ) from ..compat import ( compat_etree_fromstring, @@ -25,19 +29,23 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pb][\da-z]{7}' + _ID_REGEX = r'[pbw][\da-z]{7}' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ (?: programmes/(?!articles/)| iplayer(?:/[^/]+)?/(?:episode/|playlist/)| - music/clips[/#]| - radio/player/ + music/(?:clips|audiovideo/popular)[/#]| + radio/player/| + events/[^/]+/play/[^/]+/ ) (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX + _LOGIN_URL = 'https://account.bbc.com/signin' + _NETRC_MACHINE = 'bbc' + _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. @@ -222,11 +230,49 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', 'only_matching': True, - } - ] + }, { + 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, + }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading signin page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url')) + + response, urlh = self._download_webpage_handle( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + if self._LOGIN_URL in urlh.geturl(): + error = clean_html(get_element_by_class('form-message', response)) + if error: + raise ExtractorError( + 
'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + class MediaSelectionError(Exception): def __init__(self, id): self.id = id @@ -483,6 +529,12 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') + error = self._search_regex( + r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + programme_id = None duration = None diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py index f3a9e3278..2eaec1ab4 100644 --- a/youtube_dl/extractor/beampro.py +++ b/youtube_dl/extractor/beampro.py @@ -6,18 +6,33 @@ from ..utils import ( ExtractorError, clean_html, compat_str, + float_or_none, int_or_none, parse_iso8601, try_get, + urljoin, ) -class BeamProLiveIE(InfoExtractor): - IE_NAME = 'Beam:live' - _VALID_URL = r'https?://(?:\w+\.)?beam\.pro/(?P<id>[^/?#&]+)' +class BeamProBaseIE(InfoExtractor): + _API_BASE = 'https://mixer.com/api/v1' _RATINGS = {'family': 0, 'teen': 13, '18+': 18} + + def _extract_channel_info(self, chan): + user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) + return { + 'uploader': chan.get('token') or try_get( + chan, lambda x: x['user']['username'], compat_str), + 'uploader_id': compat_str(user_id) if user_id else None, + 'age_limit': self._RATINGS.get(chan.get('audience')), + } + + +class BeamProLiveIE(BeamProBaseIE): + IE_NAME = 'Mixer:live' + _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.beam.pro/niterhayven', + 'url': 'http://mixer.com/niterhayven', 'info_dict': { 'id': '261562', 'ext': 'mp4', @@ -38,11 +53,17 @@ class BeamProLiveIE(InfoExtractor): }, } + _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE + + @classmethod + def suitable(cls, url): + return False if 
BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) + def _real_extract(self, url): channel_name = self._match_id(url) chan = self._download_json( - 'https://beam.pro/api/v1/channels/%s' % channel_name, channel_name) + '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) if chan.get('online') is False: raise ExtractorError( @@ -50,24 +71,118 @@ class BeamProLiveIE(InfoExtractor): channel_id = chan['id'] + def manifest_url(kind): + return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) + formats = self._extract_m3u8_formats( - 'https://beam.pro/api/v1/channels/%s/manifest.m3u8' % channel_id, - channel_name, ext='mp4', m3u8_id='hls', fatal=False) + manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', + fatal=False) + formats.extend(self._extract_smil_formats( + manifest_url('smil'), channel_name, fatal=False)) self._sort_formats(formats) - user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - - return { + info = { 'id': compat_str(chan.get('id') or channel_name), 'title': self._live_title(chan.get('name') or channel_name), 'description': clean_html(chan.get('description')), - 'thumbnail': try_get(chan, lambda x: x['thumbnail']['url'], compat_str), + 'thumbnail': try_get( + chan, lambda x: x['thumbnail']['url'], compat_str), 'timestamp': parse_iso8601(chan.get('updatedAt')), - 'uploader': chan.get('token') or try_get( - chan, lambda x: x['user']['username'], compat_str), - 'uploader_id': compat_str(user_id) if user_id else None, - 'age_limit': self._RATINGS.get(chan.get('audience')), 'is_live': True, 'view_count': int_or_none(chan.get('viewersTotal')), 'formats': formats, } + info.update(self._extract_channel_info(chan)) + + return info + + +class BeamProVodIE(BeamProBaseIE): + IE_NAME = 'Mixer:vod' + _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>\d+)' + _TEST = { + 'url': 'https://mixer.com/willow8714?vod=2259830', + 'md5': 'b2431e6e8347dc92ebafb565d368b76b', + 
'info_dict': { + 'id': '2259830', + 'ext': 'mp4', + 'title': 'willow8714\'s Channel', + 'duration': 6828.15, + 'thumbnail': r're:https://.*source\.png$', + 'timestamp': 1494046474, + 'upload_date': '20170506', + 'uploader': 'willow8714', + 'uploader_id': '6085379', + 'age_limit': 13, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + } + + @staticmethod + def _extract_format(vod, vod_type): + if not vod.get('baseUrl'): + return [] + + if vod_type == 'hls': + filename, protocol = 'manifest.m3u8', 'm3u8_native' + elif vod_type == 'raw': + filename, protocol = 'source.mp4', 'https' + else: + assert False + + data = vod.get('data') if isinstance(vod.get('data'), dict) else {} + + format_id = [vod_type] + if isinstance(data.get('Height'), compat_str): + format_id.append('%sp' % data['Height']) + + return [{ + 'url': urljoin(vod['baseUrl'], filename), + 'format_id': '-'.join(format_id), + 'ext': 'mp4', + 'protocol': protocol, + 'width': int_or_none(data.get('Width')), + 'height': int_or_none(data.get('Height')), + 'fps': int_or_none(data.get('Fps')), + 'tbr': int_or_none(data.get('Bitrate'), 1000), + }] + + def _real_extract(self, url): + vod_id = self._match_id(url) + + vod_info = self._download_json( + '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) + + state = vod_info.get('state') + if state != 'AVAILABLE': + raise ExtractorError( + 'VOD %s is not available (state: %s)' % (vod_id, state), + expected=True) + + formats = [] + thumbnail_url = None + + for vod in vod_info['vods']: + vod_type = vod.get('format') + if vod_type in ('hls', 'raw'): + formats.extend(self._extract_format(vod, vod_type)) + elif vod_type == 'thumbnail': + thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') + + self._sort_formats(formats) + + info = { + 'id': vod_id, + 'title': vod_info.get('name') or vod_id, + 'duration': float_or_none(vod_info.get('duration')), + 'thumbnail': thumbnail_url, + 'timestamp': parse_iso8601(vod_info.get('createdAt')), + 'view_count': 
int_or_none(vod_info.get('viewsTotal')), + 'formats': formats, + } + info.update(self._extract_channel_info(vod_info.get('channel') or {})) + + return info diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 80dd8382e..1e57310d6 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -54,6 +54,22 @@ class BiliBiliIE(InfoExtractor): 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', }, 'skip': 'Geo-restricted to China', + }, { + # Title with double quotes + 'url': 'http://www.bilibili.com/video/av8903802/', + 'info_dict': { + 'id': '8903802', + 'ext': 'mp4', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382620, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, }] _APP_KEY = '84956560bc028eb7' @@ -122,6 +138,11 @@ class BiliBiliIE(InfoExtractor): 'preference': -2 if 'hd.mp4' in backup_url else -3, }) + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + self._sort_formats(formats) entries.append({ @@ -130,7 +151,7 @@ class BiliBiliIE(InfoExtractor): 'formats': formats, }) - title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') + title = self._html_search_regex('<h1[^>]*>([^<]+)</h1>', webpage, 'title') description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None)) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 9661ade4f..07833532e 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,18 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', 
webpage, 'title') video_info_dicts = re.findall( - r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: - video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - quality = video_info['quality'] - video_url = video_info['src'] + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 3f017a2b1..0ed59bcbc 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -5,6 +5,7 @@ import re import json from .common import InfoExtractor +from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, compat_parse_qs, @@ -448,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor): return info -class BrightcoveNewIE(InfoExtractor): +class BrightcoveNewIE(AdobePassIE): IE_NAME = 'brightcove:new' _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)' _TESTS = [{ @@ -602,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor): raise ExtractorError(message, expected=True) raise + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + 
title = json_data['name'].strip() formats = [] @@ -667,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor): }) formats.append(f) - errors = json_data.get('errors') if not formats and errors: error = errors[0] raise ExtractorError( @@ -684,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor): is_live = False duration = float_or_none(json_data.get('duration'), 1000) - if duration and duration < 0: + if duration is not None and duration <= 0: is_live = True return { diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index 75fa92d7c..ec411091e 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -84,9 +84,10 @@ class BuzzFeedIE(InfoExtractor): continue entries.append(self.url_result(video['url'])) - facebook_url = FacebookIE._extract_url(webpage) - if facebook_url: - entries.append(self.url_result(facebook_url)) + facebook_urls = FacebookIE._extract_urls(webpage) + entries.extend([ + self.url_result(facebook_url) + for facebook_url in facebook_urls]) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 87ad14e91..9faf40227 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -200,6 +200,7 @@ class CBCWatchBaseIE(InfoExtractor): 'media': 'http://search.yahoo.com/mrss/', 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } + _GEO_COUNTRIES = ['CA'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path @@ -287,6 +288,11 @@ class CBCWatchBaseIE(InfoExtractor): class CBCWatchVideoIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch:video' _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TEST = { + # geo-restricted to Canada, bypassable + 'url': 
'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', + 'only_matching': True, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -323,9 +329,10 @@ class CBCWatchIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch' _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' _TESTS = [{ + # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', 'info_dict': { - 'id': '38e815a-009e3ab12e4', + 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', 'ext': 'mp4', 'title': 'Customer (Dis)Service', 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', @@ -337,8 +344,8 @@ class CBCWatchIE(CBCWatchBaseIE): 'skip_download': True, 'format': 'bestvideo', }, - 'skip': 'Geo-restricted to Canada', }, { + # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', 'info_dict': { 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', @@ -346,7 +353,6 @@ class CBCWatchIE(CBCWatchBaseIE): 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', }, 'playlist_mincount': 30, - 'skip': 'Geo-restricted to Canada', }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 58f258c54..1268e38ef 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -49,13 +49,13 @@ class CBSIE(CBSBaseIE): 'only_matching': True, }] - def _extract_video_info(self, content_id): + def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): items_data = self._download_xml( 'http://can.cbs.com/thunder/player/videoPlayerService.php', - content_id, query={'partner': 'cbs', 'contentId': content_id}) + content_id, query={'partner': site, 'contentId': content_id}) video_data = xpath_element(items_data, './/item') title = 
xpath_text(video_data, 'videoTitle', 'title', True) - tp_path = 'dJ5BDC/media/guid/2198311517/%s' % content_id + tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id) tp_release_url = 'http://link.theplatform.com/s/' + tp_path asset_types = [] diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 57b18e81d..681d63e29 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -3,17 +3,18 @@ from __future__ import unicode_literals import re -from .theplatform import ThePlatformIE +from .cbs import CBSIE from ..utils import int_or_none -class CBSInteractiveIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video/share)/(?P<id>[^/?]+)' +class CBSInteractiveIE(CBSIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { - 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', - 'ext': 'flv', + 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', + 'display_id': 'hands-on-with-microsofts-windows-8-1-update', + 'ext': 'mp4', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', @@ -22,13 +23,19 @@ class CBSInteractiveIE(ThePlatformIE): 'timestamp': 1396479627, 'upload_date': '20140402', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', 'info_dict': { - 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', - 'ext': 'flv', + 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', + 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', + 'ext': 
'mp4', 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', - 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', + 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'duration': 1482, @@ -38,23 +45,28 @@ class CBSInteractiveIE(ThePlatformIE): }, { 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', 'info_dict': { - 'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0', + 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', + 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', 'ext': 'mp4', 'title': 'Video: Keeping Android smartphones and tablets secure', 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', 'uploader': 'Adrian Kingsley-Hughes', - 'timestamp': 1448961720, - 'upload_date': '20151201', + 'duration': 731, + 'timestamp': 1449129925, + 'upload_date': '20151203', }, 'params': { # m3u8 download 'skip_download': True, - } + }, + }, { + 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', + 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' + MPX_ACCOUNTS = { - 'cnet': 2288573011, + 'cnet': 2198311517, 'zdnet': 2387448114, } @@ -68,7 +80,8 @@ class CBSInteractiveIE(ThePlatformIE): data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] - video_id = vdata['id'] + video_id = vdata['mpxRefId'] + title = vdata['title'] author = vdata.get('author') if author: @@ -78,20 +91,7 @@ class CBSInteractiveIE(ThePlatformIE): uploader = None uploader_id = None - media_guid_path = 'media/guid/%d/%s' % 
(self.MPX_ACCOUNTS[site], vdata['mpxRefId']) - formats, subtitles = [], {} - for (fkey, vid) in vdata['files'].items(): - if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: - continue - release_url = self.TP_RELEASE_URL_TEMPLATE % vid - if fkey == 'hds': - release_url += '&manifest=f4m' - tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id) + info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) info.update({ 'id': video_id, 'display_id': display_id, @@ -99,7 +99,5 @@ class CBSInteractiveIE(ThePlatformIE): 'duration': int_or_none(vdata.get('duration')), 'uploader': uploader, 'uploader_id': uploader_id, - 'subtitles': subtitles, - 'formats': formats, }) return info diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 17bb9af4f..51df15fac 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -15,19 +15,23 @@ class CBSNewsIE(CBSIE): _TESTS = [ { - 'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/', + # 60 minutes + 'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/', 'info_dict': { - 'id': 'tesla-and-spacex-elon-musks-industrial-empire', - 'ext': 'flv', - 'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire', - 'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg', - 'duration': 791, + 'id': '_B6Ga3VJrI4iQNKsir_cdFo9Re_YJHE_', + 'ext': 'mp4', + 'title': 'Artificial Intelligence', + 'description': 'md5:8818145f9974431e0fb58a1b8d69613c', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1606, + 'uploader': 'CBSI-NEW', + 'timestamp': 1498431900, + 'upload_date': 
'20170625', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, - 'skip': 'Subscribers only', }, { 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -52,6 +56,22 @@ class CBSNewsIE(CBSIE): 'skip_download': True, }, }, + { + # 48 hours + 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', + 'info_dict': { + 'id': 'QpM5BJjBVEAUFi7ydR9LusS69DPLqPJ1', + 'ext': 'mp4', + 'title': 'Cold as Ice', + 'description': 'Can a childhood memory of a friend\'s murder solve a 1957 cold case? "48 Hours" correspondent Erin Moriarty has the latest.', + 'upload_date': '20170604', + 'timestamp': 1496538000, + 'uploader': 'CBSI-NEW', + }, + 'params': { + 'skip_download': True, + }, + }, ] def _real_extract(self, url): @@ -60,12 +80,18 @@ class CBSNewsIE(CBSIE): webpage = self._download_webpage(url, video_id) video_info = self._parse_json(self._html_search_regex( - r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', - webpage, 'video JSON info'), video_id) + r'(?:<ul class="media-list items" id="media-related-items"[^>]*><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', + webpage, 'video JSON info', default='{}'), video_id, fatal=False) + + if video_info: + item = video_info['item'] if 'item' in video_info else video_info + else: + state = self._parse_json(self._search_regex( + r'data-cbsvideoui-options=(["\'])(?P<json>{.+?})\1', webpage, + 'playlist JSON info', group='json'), video_id)['state'] + item = state['playlist'][state['pid']] - item = video_info['item'] if 'item' in video_info else video_info - guid = item['mpxRefId'] - return self._extract_video_info(guid) + return self._extract_video_info(item['mpxRefId'], 'cbsnews') class CBSNewsLiveVideoIE(InfoExtractor): diff --git 
a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 78b7a923c..0c3af23d5 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -124,7 +124,7 @@ class CDAIE(InfoExtractor): } def extract_format(page, version): - json_str = self._search_regex( + json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, '%s player_json' % version, fatal=False, group='player_data') if not json_str: diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index 2d517f231..42c9af263 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -5,7 +5,7 @@ from ..utils import remove_end class CharlieRoseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://charlierose.com/videos/27996', 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', @@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor): }, { 'url': 'https://charlierose.com/videos/27996', 'only_matching': True, + }, { + 'url': 'https://charlierose.com/episodes/30887?autoplay=true', + 'only_matching': True, }] _PLAYER_BASE = 'https://charlierose.com/video/player/%s' diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 0206d96db..d4769da75 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -5,6 +5,7 @@ import base64 import json from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( clean_html, ExtractorError @@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor): # If nativePlatform is None a fallback mechanism is used (i.e. 
youtube embed) if native_platform is None: - youtube_url = self._html_search_regex( - r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - webpage, 'fallback video URL', default=None) - if youtube_url is not None: - return self.url_result(youtube_url, ie='Youtube') + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or # the own CDN diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py index 562c9bbbb..b861d54b0 100644 --- a/youtube_dl/extractor/cinchcast.py +++ b/youtube_dl/extractor/cinchcast.py @@ -9,12 +9,20 @@ from ..utils import ( class CinchcastIE(InfoExtractor): - _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', + 'info_dict': { + 'id': '5258197', + 'ext': 'mp3', + 'title': 'Train Your Brain to Up Your Game with Coach Mandy', + 'upload_date': '20130816', + }, + }, { # Actual test is run in generic, look for undergroundwellness 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', 'only_matching': True, - } + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py new file mode 100644 index 000000000..505bdbe16 --- /dev/null +++ b/youtube_dl/extractor/cjsw.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + unescapeHTML, +) + + +class CJSWIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' + _TESTS = [{ + 'url': 
'http://cjsw.com/program/freshly-squeezed/episode/20170620', + 'md5': 'cee14d40f1e9433632c56e3d14977120', + 'info_dict': { + 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', + 'ext': 'mp3', + 'title': 'Freshly Squeezed – Episode June 20, 2017', + 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', + 'series': 'Freshly Squeezed', + 'episode_id': '20170620', + }, + }, { + # no description + 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + program, episode_id = mobj.group('program', 'id') + audio_id = '%s/%s' % (program, episode_id) + + webpage = self._download_webpage(url, episode_id) + + title = unescapeHTML(self._search_regex( + (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)', + r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title')) + + audio_url = self._search_regex( + r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'audio url', group='url') + + audio_id = self._search_regex( + r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', + audio_url, 'audio id', default=audio_id) + + formats = [{ + 'url': audio_url, + 'ext': determine_ext(audio_url, 'mp3'), + 'vcodec': 'none', + }] + + description = self._html_search_regex( + r'<p>(?P<description>.+?)</p>', webpage, 'description', + default=None) + series = self._search_regex( + r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, + 'series', default=program, group='name') + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'formats': formats, + 'series': series, + 'episode_id': episode_id, + } diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py deleted file mode 100644 index 0920f6219..000000000 --- a/youtube_dl/extractor/clipfish.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import 
InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) - - -class ClipfishIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', - 'md5': 'b9a5dc46294154c1193e2d10e0c95693', - 'info_dict': { - 'id': '4343170', - 'ext': 'mp4', - 'title': 'S01 E01 - Ugly Americans - Date in der Hölle', - 'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.', - 'upload_date': '20161005', - 'duration': 1291, - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_info = self._download_json( - 'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id, - video_id)['items'][0] - - formats = [] - - m3u8_url = video_info.get('media_videourl_hls') - if m3u8_url: - formats.append({ - 'url': m3u8_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), - 'ext': 'mp4', - 'format_id': 'hls', - }) - - mp4_url = video_info.get('media_videourl') - if mp4_url: - formats.append({ - 'url': mp4_url, - 'format_id': 'mp4', - 'width': int_or_none(video_info.get('width')), - 'height': int_or_none(video_info.get('height')), - 'tbr': int_or_none(video_info.get('bitrate')), - }) - - descr = video_info.get('descr') - if descr: - descr = descr.strip() - - return { - 'id': video_id, - 'title': video_info['title'], - 'description': descr, - 'formats': formats, - 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), - 'duration': int_or_none(video_info.get('media_length')), - 'upload_date': unified_strdate(video_info.get('pubDate')), - 'view_count': int_or_none(video_info.get('media_views')) - } diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py new file mode 100644 index 000000000..a1a7a774c 
--- /dev/null +++ b/youtube_dl/extractor/clippit.py @@ -0,0 +1,74 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + qualities, +) + +import re + + +class ClippitIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)' + _TEST = { + 'url': 'https://www.clippituser.tv/c/evmgm', + 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09', + 'info_dict': { + 'id': 'evmgm', + 'ext': 'mp4', + 'title': 'Bye bye Brutus. #BattleBots - Clippit', + 'uploader': 'lizllove', + 'uploader_url': 'https://www.clippituser.tv/p/lizllove', + 'timestamp': 1472183818, + 'upload_date': '20160826', + 'description': 'BattleBots | ABC', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title.*>(.+?)</title>', webpage, 'title') + + FORMATS = ('sd', 'hd') + quality = qualities(FORMATS) + formats = [] + for format_id in FORMATS: + url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, + webpage, 'url', fatal=False) + if not url: + continue + match = re.search(r'/(?P<height>\d+)\.mp4', url) + formats.append({ + 'url': url, + 'format_id': format_id, + 'quality': quality(format_id), + 'height': int(match.group('height')) if match else None, + }) + + uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n', + webpage, 'uploader', fatal=False) + uploader_url = ('https://www.clippituser.tv/p/' + uploader + if uploader else None) + + timestamp = self._html_search_regex(r'datetime="(.+?)"', + webpage, 'date', fatal=False) + thumbnail = self._html_search_regex(r'data-image="(.+?)"', + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'timestamp': parse_iso8601(timestamp), + 'description': 
self._og_search_description(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9bc8dbea4..85ca20ecc 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -30,7 +30,11 @@ class CloudyIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id) + 'https://www.cloudy.ec/embed.php', video_id, query={ + 'id': video_id, + 'playerPage': 1, + 'autoplay': 1, + }) info = self._parse_html5_media_entries(url, webpage, video_id)[0] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 76b5378e9..317a9a76f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..downloader.f4m import remove_encrypted_media from ..utils import ( @@ -376,7 +377,7 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) m = cls._VALID_URL_RE.match(url) assert m - return m.group('id') + return compat_str(m.group('id')) @classmethod def working(cls): @@ -420,7 +421,7 @@ class InfoExtractor(object): if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._downloader.params.get('verbose', False): - self._downloader.to_stdout( + self._downloader.to_screen( '[debug] Using fake IP %s (%s) as X-Forwarded-For.' 
% (self._x_forwarded_for_ip, country_code.upper())) @@ -646,15 +647,29 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal) + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: xml_string = transform_source(xml_string) - return compat_etree_fromstring(xml_string.encode('utf-8')) + try: + return compat_etree_fromstring(xml_string.encode('utf-8')) + except compat_xml_parse_error as ve: + errmsg = '%s: Failed to parse XML ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -730,12 +745,12 @@ class InfoExtractor(object): video_info['title'] = video_title return video_info - def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): - urlrs = orderedSet( + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): + urls = orderedSet( self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) for m in matches) return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) + urls, playlist_id=playlist_id, playlist_title=playlist_title) 
@staticmethod def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): @@ -940,7 +955,8 @@ class InfoExtractor(object): def _family_friendly_search(self, html): # See http://schema.org/VideoObject - family_friendly = self._html_search_meta('isFamilyFriendly', html) + family_friendly = self._html_search_meta( + 'isFamilyFriendly', html, default=None) if not family_friendly: return None @@ -1002,17 +1018,17 @@ class InfoExtractor(object): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: return info - if item_type == 'TVEpisode': + if item_type in ('TVEpisode', 'Episode'): info.update({ 'episode': unescapeHTML(e.get('name')), 'episode_number': int_or_none(e.get('episodeNumber')), 'description': unescapeHTML(e.get('description')), }) part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) elif item_type == 'Article': info.update({ @@ -1022,10 +1038,10 @@ class InfoExtractor(object): }) elif item_type == 'VideoObject': extract_video_object(e) - elif item_type == 'WebPage': - video = e.get('video') - if isinstance(video, dict) and video.get('@type') == 'VideoObject': - extract_video_object(video) + continue + video = e.get('video') + if isinstance(video, dict) and video.get('@type') == 'VideoObject': + extract_video_object(video) break return dict((k, v) for k, v in info.items() if v is not None) @@ -1785,7 +1801,7 @@ 
class InfoExtractor(object): ms_info['timescale'] = int(timescale) segment_duration = source.get('duration') if segment_duration: - ms_info['segment_duration'] = int(segment_duration) + ms_info['segment_duration'] = float(segment_duration) def extract_Initialization(source): initialization = source.find(_add_ns('Initialization')) @@ -1892,9 +1908,13 @@ class InfoExtractor(object): 'Bandwidth': bandwidth, } + def location_key(location): + return 'url' if re.match(r'^https?://', location) else 'path' + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) + media_location_key = location_key(media_template) # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time @@ -1904,7 +1924,7 @@ class InfoExtractor(object): segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['fragments'] = [{ - 'url': media_template % { + media_location_key: media_template % { 'Number': segment_number, 'Bandwidth': bandwidth, }, @@ -1928,7 +1948,7 @@ class InfoExtractor(object): 'Number': segment_number, } representation_ms_info['fragments'].append({ - 'url': segment_url, + media_location_key: segment_url, 'duration': float_or_none(segment_d, representation_ms_info['timescale']), }) @@ -1952,8 +1972,9 @@ class InfoExtractor(object): for s in representation_ms_info['s']: duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): + segment_uri = representation_ms_info['segment_urls'][segment_index] fragments.append({ - 'url': representation_ms_info['segment_urls'][segment_index], + location_key(segment_uri): segment_uri, 'duration': duration, }) segment_index += 1 @@ -1962,6 +1983,7 @@ class InfoExtractor(object): # No fragments key is present in this 
case. if 'fragments' in representation_ms_info: f.update({ + 'fragment_base_url': base_url, 'fragments': [], 'protocol': 'http_dash_segments', }) @@ -1969,10 +1991,8 @@ class InfoExtractor(object): initialization_url = representation_ms_info['initialization_url'] if not f.get('url'): f['url'] = initialization_url - f['fragments'].append({'url': initialization_url}) + f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - for fragment in f['fragments']: - fragment['url'] = urljoin(base_url, fragment['url']) try: existing_format = next( fo for fo in formats @@ -2001,6 +2021,12 @@ class InfoExtractor(object): compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + """ + Parse formats from ISM manifest. + References: + 1. [MS-SSTR]: Smooth Streaming Protocol, + https://msdn.microsoft.com/en-us/library/ff469518.aspx + """ if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: return [] @@ -2022,8 +2048,11 @@ class InfoExtractor(object): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 - width = int_or_none(track.get('MaxWidth')) - height = int_or_none(track.get('MaxHeight')) + # [1] does not mention Width and Height attributes. 
However, + # they're often present while MaxWidth and MaxHeight are + # missing, so should be used as fallbacks + width = int_or_none(track.get('MaxWidth') or track.get('Width')) + height = int_or_none(track.get('MaxHeight') or track.get('Height')) sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) @@ -2101,19 +2130,19 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type): + def _media_formats(src, cur_media_type, type_info={}): full_url = absolute_url(src) - ext = determine_ext(full_url) + ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference) + preference=preference, fatal=False) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id) + full_url, video_id, mpd_id=mpd_id, fatal=False) else: is_plain_url = True formats = [{ @@ -2123,15 +2152,18 @@ class InfoExtractor(object): return is_plain_url, formats entries = [] + # amp-video and amp-audio are very similar to their HTML5 counterparts + # so we wll include them right here (see + # https://www.ampproject.org/docs/reference/components/amp-video) media_tags = [(media_tag, media_type, '') for media_tag, media_type - in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] + in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see # https://github.com/rg3/youtube-dl/issues/11979, example URL: # http://www.porntrex.com/maps/videositemap.xml). 
- r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) + r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], @@ -2149,9 +2181,15 @@ class InfoExtractor(object): src = source_attributes.get('src') if not src: continue - is_plain_url, formats = _media_formats(src, media_type) + f = parse_content_type(source_attributes.get('type')) + is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: - f = parse_content_type(source_attributes.get('type')) + # res attribute is not standard but seen several times + # in the wild + f.update({ + 'height': int_or_none(source_attributes.get('res')), + 'format_id': source_attributes.get('label'), + }) f.update(formats[0]) media_info['formats'].append(f) else: @@ -2174,7 +2212,7 @@ class InfoExtractor(object): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') if hds_host: f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) @@ -2196,8 +2234,9 @@ class InfoExtractor(object): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') - http_base_url = 'http' + url_base + url_base = self._search_regex( + r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') + http_base_url = '%s:%s' % ('http', url_base) formats = [] if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( @@ -2231,7 +2270,7 @@ class InfoExtractor(object): for protocol in 
('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': protocol + url_base, + 'url': '%s:%s' % (protocol, url_base), 'format_id': protocol, 'protocol': protocol, }) @@ -2289,6 +2328,8 @@ class InfoExtractor(object): tracks = video_data.get('tracks') if tracks and isinstance(tracks, list): for track in tracks: + if not isinstance(track, dict): + continue if track.get('kind') != 'captions': continue track_url = urljoin(base_url, track.get('file')) @@ -2318,6 +2359,8 @@ class InfoExtractor(object): urls = [] formats = [] for source in jwplayer_sources_data: + if not isinstance(source, dict): + continue source_url = self._proto_relative_url(source.get('file')) if not source_url: continue diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index d3463b874..ed278fefc 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -16,7 +16,6 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, - remove_end, ) @@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ + (?: + (?: + embed(?:js)?| + (?:script|inline)/video + )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?| + (?P<type>watch|series|video)/(?P<display_id>[^/?#]+) + )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys()) + EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' 
% '|'.join(_SITES.keys()) _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor): 'upload_date': '20150916', 'timestamp': 1442434955, } + }, { + 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', + 'only_matching': True, + }, { + 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', + 'only_matching': True, }] def _extract_series(self, url, webpage): @@ -104,16 +116,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video(self, webpage, url_type): - query = {} - params = self._search_regex( - r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) - if params: - query.update({ - 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), - 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), - 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), - }) + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) else: params = extract_attributes(self._search_regex( r'(<[^>]+data-js="video-player"[^>]+>)', @@ -123,17 +135,40 @@ class CondeNastIE(InfoExtractor): 'playerId': params['data-player'], 'target': params['id'], }) - video_id = query['videoId'] + return query + + def _extract_video(self, params): + video_id = params['videoId'] + video_info = None + + # New API path + query = params.copy() + query['embedType'] = 'inline' info_page = 
self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=query) + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) if info_page: video_info = info_page.get('video') if not video_info: info_page = self._download_webpage( 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=query) + video_id, 'Downloading loader info', query=params) + if not video_info: + info_page = self._download_webpage( + 'https://player.cnevids.com/inline/video/%s.js' % video_id, + video_id, 'Downloading inline info', query={ + 'target': params.get('target', 'embedplayer') + }) + + if not video_info: video_info = self._parse_json( self._search_regex( r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), @@ -161,9 +196,7 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) - info = self._search_json_ld( - webpage, video_id, fatal=False) if url_type != 'embed' else {} - info.update({ + return { 'id': video_id, 'formats': formats, 'title': title, @@ -174,22 +207,26 @@ class CondeNastIE(InfoExtractor): 'series': video_info.get('series_title'), 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), - }) - return info + 'categories': video_info.get('categories'), + } def _real_extract(self, url): - site, url_type, item_id = re.match(self._VALID_URL, url).groups() + video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups() - # Convert JS embed to regular embed - if url_type == 'embedjs': - parsed_url = compat_urlparse.urlparse(url) - url = compat_urlparse.urlunparse(parsed_url._replace( - path=remove_end(parsed_url.path, 
'.js').replace('/embedjs/', '/embed/'))) - url_type = 'embed' + if video_id: + return self._extract_video({ + 'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) - webpage = self._download_webpage(url, item_id) + webpage = self._download_webpage(url, display_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage, url_type) + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) + info.update(self._extract_video(params)) + return info diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py index 7b2f5008b..807a29eea 100644 --- a/youtube_dl/extractor/corus.py +++ b/youtube_dl/extractor/corus.py @@ -8,7 +8,16 @@ from ..utils import int_or_none class CorusIE(ThePlatformFeedIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:globaltv|etcanada)\.com|(?:hgtv|foodnetwork|slice)\.ca)/(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=))(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?P<domain> + (?:globaltv|etcanada)\.com| + (?:hgtv|foodnetwork|slice|history|showcase)\.ca + ) + /(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=)) + (?P<id>\d+) + ''' _TESTS = [{ 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', 'md5': '05dcbca777bf1e58c2acbb57168ad3a6', @@ -27,6 +36,12 @@ class CorusIE(ThePlatformFeedIE): }, { 'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/', 'only_matching': True, + }, { + 'url': 'http://www.history.ca/the-world-without-canada/video/full-episodes/natural-resources/video.html?v=955054659646#video', + 'only_matching': True, + }, { + 'url': 'http://www.showcase.ca/eyewitness/video/eyewitness++106/video.html?v=955070531919&p=1&s=da#video', + 'only_matching': True, }] _TP_FEEDS = { @@ -50,6 +65,14 @@ class CorusIE(ThePlatformFeedIE): 'feed_id': '5tUJLgV2YNJ5', 'account_id': 2414427935, }, + 'history': { + 'feed_id': 'tQFx_TyyEq4J', + 'account_id': 2369613659, + }, + 'showcase': { + 'feed_id': '9H6qyshBZU3E', + 'account_id': 2414426607, + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index 94d03ce2a..f77a68ece 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( parse_iso8601, str_to_int, @@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - youtube_url = self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', - webpage, 'youtube url', default=None) + youtube_url = YoutubeIE._extract_url(webpage) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) video_url = self._html_search_regex( [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', 
r'<video\s+src="([^"]+)"'], diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f919ed208..13f425b2b 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor): 'season_number': 8, 'episode_number': 4, 'subtitles': { - 'en-US': [{ - 'ext': 'ttml', - }] + 'en-US': [ + {'ext': 'vtt'}, + {'ext': 'tt'}, + ] }, }, 'params': { diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 2ffa4a7f8..8bdaf0c2c 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -510,7 +510,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text # webpage provide more accurate data than series_title from XML series = self._html_search_regex( - r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)', + r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', webpage, 'series', fatal=False) season = xpath_text(metadata, 'series_title') @@ -518,7 +518,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text episode_number = int_or_none(xpath_text(metadata, 'episode_number')) season_number = int_or_none(self._search_regex( - r'(?s)<h4[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h4>\s*<h4>\s*Season (\d+)', + r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', webpage, 'season number', default=None)) return { diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d4576160b..171820e27 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,6 +10,7 @@ from ..utils import ( smuggle_url, determine_ext, ExtractorError, + extract_attributes, ) from .senateisvp import SenateISVPIE from .ustream import UstreamIE @@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor): 'uploader_id': '12987475', }, }] + BRIGHTCOVE_URL_TEMPLATE = 
'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' def _real_extract(self, url): video_id = self._match_id(url) @@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor): if ustream_url: return self.url_result(ustream_url, UstreamIE.ie_key()) + if '&vod' not in url: + bc = self._search_regex( + r"(<[^>]+id='brightcove-player-embed'[^>]+>)", + webpage, 'brightcove embed', default=None) + if bc: + bc_attr = extract_attributes(bc) + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % ( + bc_attr.get('data-bcaccountid', '3162030207001'), + bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'), + bc_attr.get('data-newbcplayerid', 'default'), + bc_attr['data-bcid']) + return self.url_result(smuggle_url(bc_url, {'source_url': url})) + # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index 98c835bf1..af3978035 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -1,17 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, determine_protocol, + try_get, unescapeHTML, ) class DailyMailIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { @@ -20,7 +24,16 @@ class DailyMailIE(InfoExtractor): 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', 
} - } + }, { + 'url': 'http://www.dailymail.co.uk/embed/video/1295863.html', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', + webpage) def _real_extract(self, url): video_id = self._match_id(url) @@ -28,8 +41,14 @@ class DailyMailIE(InfoExtractor): video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) title = unescapeHTML(video_data['title']) - video_sources = self._download_json(video_data.get( - 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + sources_url = (try_get( + video_data, + (lambda x: x['plugins']['sources']['url'], + lambda x: x['sources']['url']), compat_str) or + 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) + + video_sources = self._download_json(sources_url, video_id) formats = [] for rendition in video_sources['renditions']: diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 441114d19..e9d0dd19c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)' + _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)' IE_NAME = 'dailymotion' _FORMATS = [ @@ -49,87 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor): ('stream_h264_hd1080_url', 'hd180'), ] - _TESTS = [ - { - 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', - 'md5': '074b95bdee76b9e3654137aee9c79dfe', - 'info_dict': { - 'id': 'x5kesuj', - 
'ext': 'mp4', - 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', - 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', - 'duration': 187, - 'timestamp': 1493651285, - 'upload_date': '20170501', - 'uploader': 'Deadline', - 'uploader_id': 'x1xm8ri', - 'age_limit': 0, - 'view_count': int, - }, + _TESTS = [{ + 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', + 'md5': '074b95bdee76b9e3654137aee9c79dfe', + 'info_dict': { + 'id': 'x5kesuj', + 'ext': 'mp4', + 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 187, + 'timestamp': 1493651285, + 'upload_date': '20170501', + 'uploader': 'Deadline', + 'uploader_id': 'x1xm8ri', + 'age_limit': 0, + 'view_count': int, }, - { - 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', - 'md5': '2137c41a8e78554bb09225b8eb322406', - 'info_dict': { - 'id': 'x2iuewm', - 'ext': 'mp4', - 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', - 'description': 'Several come bundled with the Steam Controller.', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', - 'duration': 74, - 'timestamp': 1425657362, - 'upload_date': '20150306', - 'uploader': 'IGN', - 'uploader_id': 'xijv66', - 'age_limit': 0, - 'view_count': int, - }, - 'skip': 'video gone', + }, { + 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', + 'md5': '2137c41a8e78554bb09225b8eb322406', + 'info_dict': { + 'id': 'x2iuewm', + 'ext': 'mp4', + 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', + 'description': 'Several come bundled 
with the Steam Controller.', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 74, + 'timestamp': 1425657362, + 'upload_date': '20150306', + 'uploader': 'IGN', + 'uploader_id': 'xijv66', + 'age_limit': 0, + 'view_count': int, }, + 'skip': 'video gone', + }, { # Vevo video - { - 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', - 'info_dict': { - 'title': 'Roar (Official)', - 'id': 'USUV71301934', - 'ext': 'mp4', - 'uploader': 'Katy Perry', - 'upload_date': '20130905', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'VEVO is only available in some countries', + 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', + 'info_dict': { + 'title': 'Roar (Official)', + 'id': 'USUV71301934', + 'ext': 'mp4', + 'uploader': 'Katy Perry', + 'upload_date': '20130905', }, + 'params': { + 'skip_download': True, + }, + 'skip': 'VEVO is only available in some countries', + }, { # age-restricted video - { - 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', - 'md5': '0d667a7b9cebecc3c89ee93099c4159d', - 'info_dict': { - 'id': 'xyh2zz', - 'ext': 'mp4', - 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', - 'uploader': 'HotWaves1012', - 'age_limit': 18, - }, - 'skip': 'video gone', + 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', + 'md5': '0d667a7b9cebecc3c89ee93099c4159d', + 'info_dict': { + 'id': 'xyh2zz', + 'ext': 'mp4', + 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', + 'uploader': 'HotWaves1012', + 'age_limit': 18, }, + 'skip': 'video gone', + }, { # geo-restricted, player v5 - { - 'url': 'http://www.dailymotion.com/video/xhza0o', - 'only_matching': True, - }, + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, + }, { # with subtitles - { - 'url': 
'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', - 'only_matching': True, - }, - { - 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', - 'only_matching': True, - } - ] + 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', + 'only_matching': True, + }] @staticmethod def _extract_urls(webpage): @@ -152,7 +147,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): view_count_str = self._search_regex( (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"', r'video_views_count[^>]+>\s+([\s\d\,.]+)'), - webpage, 'view count', fatal=False) + webpage, 'view count', default=None) if view_count_str: view_count_str = re.sub(r'\s', '', view_count_str) view_count = str_to_int(view_count_str) @@ -164,7 +159,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): [r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826 r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', r'buildPlayer\(({.+?})\);', - r'var\s+config\s*=\s*({.+?});'], + r'var\s+config\s*=\s*({.+?});', + # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580) + r'__PLAYER_CONFIG__\s*=\s*({.+?});'], webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) @@ -328,7 +325,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' _MORE_PAGES_INDICATOR = r'(?s)<div 
class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' _TESTS = [{ diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py index 939d1338c..968c4c7fd 100644 --- a/youtube_dl/extractor/disney.py +++ b/youtube_dl/extractor/disney.py @@ -15,7 +15,7 @@ from ..utils import ( class DisneyIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))''' + https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr|channel\.de)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))''' _TESTS = [{ # Disney.EmbedVideo 'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', @@ -69,6 +69,9 @@ class DisneyIE(InfoExtractor): 'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo', 'only_matching': True, }, { + 'url': 'http://disneychannel.de/sehen/soy-luna-folge-118-5518518987ba27f3cc729268', + 'only_matching': True, + }, { 'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue', 'only_matching': True, }] diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index a78cb8a2a..c05f601e2 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -13,7 +13,7 @@ from ..utils import ( class DigitallySpeakingIE(InfoExtractor): - _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' + _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml' _TESTS = [{ # From 
http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface @@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml', 'only_matching': True, + }, { + # From http://www.gdcvault.com/play/1013700/Advanced-Material + 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index d22133d24..9757f4422 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals import time import hashlib +import re from .common import InfoExtractor from ..utils import ( ExtractorError, unescapeHTML, + unified_strdate, + urljoin, ) @@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor): 'uploader': uploader, 'is_live': True, } + + +class DouyuShowIE(InfoExtractor): + _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)' + + _TESTS = [{ + 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw', + 'md5': '0c2cfd068ee2afe657801269b2d86214', + 'info_dict': { + 'id': 'rjNBdvnVXNzvE2yw', + 'ext': 'mp4', + 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场', + 'duration': 7150.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': '陈一发儿', + 'uploader_id': 'XrZwYelr5wbK', + 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK', + 'upload_date': '20170402', + }, + }, { + 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw', + 'only_matching': True, + }] + + def _real_extract(self, url): + url = url.replace('vmobile.', 'v.') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + room_info = self._parse_json(self._search_regex( + r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id) + + video_info = None + + for trial in range(5): + # 
Sometimes Douyu rejects our request. Let's try it more times + try: + video_info = self._download_json( + 'https://vmobile.douyu.com/video/getInfo', video_id, + query={'vid': video_id}, + headers={ + 'Referer': url, + 'x-requested-with': 'XMLHttpRequest', + }) + break + except ExtractorError: + self._sleep(1, video_id) + + if not video_info: + raise ExtractorError('Can\'t fetch video info') + + formats = self._extract_m3u8_formats( + video_info['data']['video_url'], video_id, + entry_protocol='m3u8_native', ext='mp4') + + upload_date = unified_strdate(self._html_search_regex( + r'<em>上传时间:</em><span>([^<]+)</span>', webpage, + 'upload date', fatal=False)) + + uploader = uploader_id = uploader_url = None + mobj = re.search( + r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"', + webpage) + if mobj: + uploader_id, uploader = mobj.groups() + uploader_url = urljoin(url, '/author/' + uploader_id) + + return { + 'id': video_id, + 'title': room_info['name'], + 'formats': formats, + 'duration': room_info.get('duration'), + 'thumbnail': room_info.get('pic'), + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 87c5dd63e..76e784105 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -7,16 +7,18 @@ import time from .common import InfoExtractor from ..compat import ( - compat_urlparse, compat_HTTPError, + compat_str, + compat_urlparse, ) from ..utils import ( - USER_AGENTS, ExtractorError, int_or_none, - unified_strdate, remove_end, + try_get, + unified_strdate, update_url_query, + USER_AGENTS, ) @@ -183,28 +185,44 @@ class DPlayItIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - info_url = self._search_regex( - r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', - webpage, 'video id') - title = remove_end(self._og_search_title(webpage), ' 
| Dplay') - try: - info = self._download_json( - info_url, display_id, headers={ - 'Authorization': 'Bearer %s' % self._get_cookies(url).get( - 'dplayit_token').value, - 'Referer': url, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - if error.get('code') == 'access.denied.geoblocked': - self.raise_geo_restricted( - msg=error.get('detail'), countries=self._GEO_COUNTRIES) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - raise + video_id = None + + info = self._search_regex( + r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")', + webpage, 'playback JSON', default=None) + if info: + for _ in range(2): + info = self._parse_json(info, display_id, fatal=False) + if not info: + break + else: + video_id = try_get(info, lambda x: x['data']['id']) + + if not info: + info_url = self._search_regex( + r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', + webpage, 'info url') + + video_id = info_url.rpartition('/')[-1] + + try: + info = self._download_json( + info_url, display_id, headers={ + 'Authorization': 'Bearer %s' % self._get_cookies(url).get( + 'dplayit_token').value, + 'Referer': url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): + info = self._parse_json(e.cause.read().decode('utf-8'), display_id) + error = info['errors'][0] + if error.get('code') == 'access.denied.geoblocked': + self.raise_geo_restricted( + msg=error.get('detail'), countries=self._GEO_COUNTRIES) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + raise hls_url = info['data']['attributes']['streaming']['hls']['url'] @@ -230,7 +248,7 @@ class DPlayItIE(InfoExtractor): season_number = episode_number = upload_date = None return { - 'id': info_url.rpartition('/')[-1], + 'id': compat_str(video_id or display_id), 
'display_id': display_id, 'title': title, 'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index e7abc8889..9a498d72a 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, clean_html, int_or_none, + remove_end, sanitized_Request, urlencode_postdata ) @@ -72,15 +73,15 @@ class DramaFeverIE(DramaFeverBaseIE): 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { 'id': '4512.1', - 'ext': 'mp4', - 'title': 'Cooking with Shin 4512.1', + 'ext': 'flv', + 'title': 'Cooking with Shin', 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', 'episode': 'Episode 1', 'episode_number': 1, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1404336058, 'upload_date': '20140702', - 'duration': 343, + 'duration': 344, }, 'params': { # m3u8 download @@ -90,15 +91,15 @@ class DramaFeverIE(DramaFeverBaseIE): 'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1', 'info_dict': { 'id': '4826.4', - 'ext': 'mp4', - 'title': 'Mnet Asian Music Awards 2015 4826.4', + 'ext': 'flv', + 'title': 'Mnet Asian Music Awards 2015', 'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91', 'episode': 'Mnet Asian Music Awards 2015 - Part 3', 'episode_number': 4, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1450213200, 'upload_date': '20151215', - 'duration': 5602, + 'duration': 5359, }, 'params': { # m3u8 download @@ -122,6 +123,10 @@ class DramaFeverIE(DramaFeverBaseIE): countries=self._GEO_COUNTRIES) raise + # title is postfixed with video id for some reason, removing + if info.get('title'): + info['title'] = remove_end(info['title'], video_id).strip() + series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode diff --git a/youtube_dl/extractor/drbonanza.py 
b/youtube_dl/extractor/drbonanza.py index 79ec212c8..164e97c36 100644 --- a/youtube_dl/extractor/drbonanza.py +++ b/youtube_dl/extractor/drbonanza.py @@ -1,135 +1,59 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( - int_or_none, - parse_iso8601, + js_to_json, + parse_duration, + unescapeHTML, ) class DRBonanzaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/(?:[^/]+/)+(?:[^/])+?(?:assetId=(?P<id>\d+))?(?:[#&]|$)' - - _TESTS = [{ - 'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517', + _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-', 'info_dict': { - 'id': '65517', + 'id': '40312', + 'display_id': 'matador---0824-komme-fremmede-', 'ext': 'mp4', - 'title': 'Talkshowet - Leonard Cohen', - 'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca', - 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', - 'timestamp': 1295537932, - 'upload_date': '20110120', - 'duration': 3664, - }, - 'params': { - 'skip_download': True, # requires rtmp - }, - }, { - 'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410', - 'md5': '6dfe039417e76795fb783c52da3de11d', - 'info_dict': { - 'id': '59410', - 'ext': 'mp3', - 'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission', - 'description': 'md5:501e5a195749480552e214fbbed16c4e', + 'title': 'MATADOR - 08:24. 
"Komme fremmede".', + 'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84', 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', - 'timestamp': 1223274900, - 'upload_date': '20081006', - 'duration': 7369, + 'duration': 4613, }, - }] + } def _real_extract(self, url): - url_id = self._match_id(url) - webpage = self._download_webpage(url, url_id) - - if url_id: - info = json.loads(self._html_search_regex(r'({.*?%s.*})' % url_id, webpage, 'json')) - else: - # Just fetch the first video on that page - info = json.loads(self._html_search_regex(r'bonanzaFunctions.newPlaylist\(({.*})\)', webpage, 'json')) - - asset_id = str(info['AssetId']) - title = info['Title'].rstrip(' \'\"-,.:;!?') - duration = int_or_none(info.get('Duration'), scale=1000) - # First published online. "FirstPublished" contains the date for original airing. - timestamp = parse_iso8601( - re.sub(r'\.\d+$', '', info['Created'])) - - def parse_filename_info(url): - match = re.search(r'/\d+_(?P<width>\d+)x(?P<height>\d+)x(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url) - if match: - return { - 'width': int(match.group('width')), - 'height': int(match.group('height')), - 'vbr': int(match.group('bitrate')), - 'ext': match.group('ext') - } - match = re.search(r'/\d+_(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url) - if match: - return { - 'vbr': int(match.group('bitrate')), - 'ext': match.group(2) - } - return {} + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') - video_types = ['VideoHigh', 'VideoMid', 'VideoLow'] - preferencemap = { - 'VideoHigh': -1, - 'VideoMid': -2, - 'VideoLow': -3, - 'Audio': -4, - } + webpage = self._download_webpage(url, display_id) - formats = [] - for file in info['Files']: - if info['Type'] == 'Video': - if file['Type'] in video_types: - format = parse_filename_info(file['Location']) - format.update({ - 'url': file['Location'], - 'format_id': file['Type'].replace('Video', ''), - 'preference': preferencemap.get(file['Type'], -10), - }) - if 
format['url'].startswith('rtmp'): - rtmp_url = format['url'] - format['rtmp_live'] = True # --resume does not work - if '/bonanza/' in rtmp_url: - format['play_path'] = rtmp_url.split('/bonanza/')[1] - formats.append(format) - elif file['Type'] == 'Thumb': - thumbnail = file['Location'] - elif info['Type'] == 'Audio': - if file['Type'] == 'Audio': - format = parse_filename_info(file['Location']) - format.update({ - 'url': file['Location'], - 'format_id': file['Type'], - 'vcodec': 'none', - }) - formats.append(format) - elif file['Type'] == 'Thumb': - thumbnail = file['Location'] + info = self._parse_html5_media_entries( + url, webpage, display_id, m3u8_id='hls', + m3u8_entry_protocol='m3u8_native')[0] + self._sort_formats(info['formats']) - description = '%s\n%s\n%s\n' % ( - info['Description'], info['Actors'], info['Colophon']) + asset = self._parse_json( + self._search_regex( + r'(?s)currentAsset\s*=\s*({.+?})\s*</script', webpage, 'asset'), + display_id, transform_source=js_to_json) - self._sort_formats(formats) + title = unescapeHTML(asset['AssetTitle']).strip() - display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id - display_id = re.sub(r'-+', '-', display_id) + def extract(field): + return self._search_regex( + r'<div[^>]+>\s*<p>%s:<p>\s*</div>\s*<div[^>]+>\s*<p>([^<]+)</p>' % field, + webpage, field, default=None) - return { - 'id': asset_id, + info.update({ + 'id': asset.get('AssetId') or video_id, 'display_id': display_id, 'title': title, - 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - } + 'description': extract('Programinfo'), + 'duration': parse_duration(extract('Tid')), + 'thumbnail': asset.get('AssetImageUrl'), + }) + return info diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 1eca82b3b..c5d56a9ad 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -44,8 +44,23 @@ 
class DrTuberIE(InfoExtractor): webpage = self._download_webpage( 'http://www.drtuber.com/video/%s' % video_id, display_id) - video_url = self._html_search_regex( - r'<source src="([^"]+)"', webpage, 'video URL') + video_data = self._download_json( + 'http://www.drtuber.com/player_config_json/', video_id, query={ + 'vid': video_id, + 'embed': 0, + 'aid': 0, + 'domain_id': 0, + }) + + formats = [] + for format_id, video_url in video_data['files'].items(): + if video_url: + formats.append({ + 'format_id': format_id, + 'quality': 2 if format_id == 'hq' else 1, + 'url': video_url + }) + self._sort_formats(formats) title = self._html_search_regex( (r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<', @@ -75,7 +90,7 @@ class DrTuberIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'formats': formats, 'title': title, 'thumbnail': thumbnail, 'like_count': like_count, diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e4917014a..69effba58 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor): IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', - 'md5': '25e659cccc9a2ed956110a299fdf5983', + 'md5': '7ae17b4e18eb5d29212f424a7511c184', 'info_dict': { 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', @@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor): 'upload_date': '20160823', 'duration': 606.84, }, - 'params': { - 'skip_download': True, - }, }, { + # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', - 'md5': '2c37175c718155930f939ef59952474a', 'info_dict': { 'id': 'christiania-pusher-street-ryddes-drdkrjpo', 'ext': 'mp4', 'title': 'LIVE Christianias rydning af Pusher Street er i gang', - 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'description': 
'md5:2a71898b15057e9b97334f61d04e6eb5', 'timestamp': 1472800279, 'upload_date': '20160902', 'duration': 131.4, }, + 'params': { + 'skip_download': True, + }, + }, { + # with SignLanguage formats + 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', + 'info_dict': { + 'id': 'historien-om-danmark-stenalder', + 'ext': 'mp4', + 'title': 'Historien om Danmark: Stenalder (1)', + 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', + 'timestamp': 1490401996, + 'upload_date': '20170325', + 'duration': 3502.04, + 'formats': 'mincount:20', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor): elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') - spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + asset_target = asset.get('Target') for link in asset.get('Links', []): uri = link.get('Uri') if not uri: @@ -96,13 +112,13 @@ class DRTVIE(InfoExtractor): target = link.get('Target') format_id = target or '' preference = None - if spoken_subtitles: + if asset_target in ('SpokenSubtitles', 'SignLanguage'): preference = -1 - format_id += '-spoken-subtitles' + format_id += '-%s' % asset_target if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', - video_id, preference, f4m_id=format_id) + video_id, preference, f4m_id=format_id, fatal=False) if kind == 'AudioResource': for f in f4m_formats: f['vcodec'] = 'none' @@ -110,7 +126,8 @@ class DRTVIE(InfoExtractor): elif target == 'HLS': formats.extend(self._extract_m3u8_formats( uri, video_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, m3u8_id=format_id)) + preference=preference, m3u8_id=format_id, + fatal=False)) else: bitrate = link.get('Bitrate') if bitrate: diff --git a/youtube_dl/extractor/dvtv.py 
b/youtube_dl/extractor/dvtv.py index 974c69dbc..e85c58bd5 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -5,9 +5,12 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, js_to_json, + mimetype2ext, unescapeHTML, - ExtractorError, ) @@ -24,14 +27,7 @@ class DVTVIE(InfoExtractor): 'id': 'dc0768de855511e49e4b0025900fea04', 'ext': 'mp4', 'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně', - } - }, { - 'url': 'http://video.aktualne.cz/dvtv/stropnicky-policie-vrbetice-preventivne-nekontrolovala/r~82ed4322849211e4a10c0025900fea04/', - 'md5': '6388f1941b48537dbd28791f712af8bf', - 'info_dict': { - 'id': '72c02230849211e49f60002590604f2e', - 'ext': 'mp4', - 'title': 'Stropnický: Policie Vrbětice preventivně nekontrolovala', + 'duration': 1484, } }, { 'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/', @@ -44,55 +40,100 @@ class DVTVIE(InfoExtractor): 'info_dict': { 'id': 'b0b40906854d11e4bdad0025900fea04', 'ext': 'mp4', - 'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne' + 'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne', + 'description': 'md5:0916925dea8e30fe84222582280b47a0', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }, { 'md5': '5f7652a08b05009c1292317b449ffea2', 'info_dict': { 'id': '420ad9ec854a11e4bdad0025900fea04', 'ext': 'mp4', - 'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka' + 'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka', + 'description': 'md5:ff2f9f6de73c73d7cef4f756c1c1af42', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }, { 'md5': '498eb9dfa97169f409126c617e2a3d64', 'info_dict': { 'id': '95d35580846a11e4b6d20025900fea04', 'ext': 'mp4', - 'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?' 
+ 'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?', + 'description': 'md5:889fe610a70fee5511dc3326a089188e', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }, { 'md5': 'b8dc6b744844032dab6ba3781a7274b9', 'info_dict': { 'id': '6fe14d66853511e4833a0025900fea04', 'ext': 'mp4', - 'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády' + 'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády', + 'description': 'md5:544f86de6d20c4815bea11bf2ac3004f', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }], }, { + 'url': 'https://video.aktualne.cz/dvtv/zeman-si-jen-leci-mindraky-sobotku-nenavidi-a-babis-se-mu-te/r~960cdb3a365a11e7a83b0025900fea04/', + 'md5': 'f8efe9656017da948369aa099788c8ea', + 'info_dict': { + 'id': '3c496fec365911e7a6500025900fea04', + 'ext': 'mp4', + 'title': 'Zeman si jen léčí mindráky, Sobotku nenávidí a Babiš se mu teď hodí, tvrdí Kmenta', + 'duration': 1103, + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, }] def _parse_video_metadata(self, js, video_id): - metadata = self._parse_json(js, video_id, transform_source=js_to_json) + data = self._parse_json(js, video_id, transform_source=js_to_json) - formats = [] - for video in metadata['sources']: - ext = video['type'][6:] - formats.append({ - 'url': video['file'], - 'ext': ext, - 'format_id': '%s-%s' % (ext, video['label']), - 'height': int(video['label'].rstrip('p')), - 'fps': 25, - }) + title = unescapeHTML(data['title']) + formats = [] + for video in data['sources']: + video_url = video.get('file') + if not video_url: + continue + video_type = video.get('type') + ext = determine_ext(video_url, mimetype2ext(video_type)) + if video_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + 
m3u8_id='hls', fatal=False)) + elif video_type == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + else: + label = video.get('label') + height = self._search_regex( + r'^(\d+)[pP]', label or '', 'height', default=None) + format_id = ['http'] + for f in (ext, label): + if f: + format_id.append(f) + formats.append({ + 'url': video_url, + 'format_id': '-'.join(format_id), + 'height': int_or_none(height), + }) self._sort_formats(formats) return { - 'id': metadata['mediaid'], - 'title': unescapeHTML(metadata['title']), - 'thumbnail': self._proto_relative_url(metadata['image'], 'http:'), + 'id': data.get('mediaid') or video_id, + 'title': title, + 'description': data.get('description'), + 'thumbnail': data.get('image'), + 'duration': int_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('pubtime')), 'formats': formats } @@ -103,7 +144,7 @@ class DVTVIE(InfoExtractor): # single video item = self._search_regex( - r"(?s)embedData[0-9a-f]{32}\['asset'\]\s*=\s*(\{.+?\});", + r'(?s)embedData[0-9a-f]{32}\[["\']asset["\']\]\s*=\s*(\{.+?\});', webpage, 'video', default=None, fatal=False) if item: @@ -113,6 +154,8 @@ class DVTVIE(InfoExtractor): items = re.findall( r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);", webpage) + if not items: + items = re.findall(r'(?s)var\s+asset\s*=\s*({.+?});\n', webpage) if items: return { diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 76d39adac..42789278e 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -11,6 +11,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + unsmuggle_url, ) @@ -50,6 +51,10 @@ class EaglePlatformIE(InfoExtractor): 'view_count': int, }, 'skip': 'Georestricted', + }, { + # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/) + 'url': 
'eagleplatform:tvrainru.media.eagleplatform.com:582306', + 'only_matching': True, }] @staticmethod @@ -60,16 +65,40 @@ class EaglePlatformIE(InfoExtractor): webpage) if mobj is not None: return mobj.group('url') - # Basic usage embedding (see http://dultonmedia.github.io/eplayer/) + PLAYER_JS_RE = r''' + <script[^>]+ + src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) + .+? + ''' + # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/) mobj = re.search( r'''(?xs) - <script[^>]+ - src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1) - .+? + %s <div[^>]+ - class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+ + class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+ data-id=["\'](?P<id>\d+) - ''', webpage) + ''' % PLAYER_JS_RE, webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + # Generalization of "Javascript code usage", "Combined usage" and + # "Usage without attaching to DOM" embeddings (see + # http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + %s + <script> + .+? + new\s+EaglePlayer\( + (?:[^,]+\s*,\s*)? + { + .+? + \bid\s*:\s*["\']?(?P<id>\d+) + .+? + } + \s*\) + .+? 
+ </script> + ''' % PLAYER_JS_RE, webpage) if mobj is not None: return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() @@ -79,9 +108,10 @@ class EaglePlatformIE(InfoExtractor): if status != 200: raise ExtractorError(' '.join(response['errors']), expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs): + def _download_json(self, url_or_request, video_id, *args, **kwargs): try: - response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) + response = super(EaglePlatformIE, self)._download_json( + url_or_request, video_id, *args, **kwargs) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError): response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) @@ -93,11 +123,24 @@ class EaglePlatformIE(InfoExtractor): return self._download_json(url_or_request, video_id, note)['data'][0] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + mobj = re.match(self._VALID_URL, url) host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') + headers = {} + query = { + 'id': video_id, + } + + referrer = smuggled_data.get('referrer') + if referrer: + headers['Referer'] = referrer + query['referrer'] = referrer + player_data = self._download_json( - 'http://%s/api/player_data?id=%s' % (host, video_id), video_id) + 'http://%s/api/player_data' % host, video_id, + headers=headers, query=query) media = player_data['data']['playlist']['viewports'][0]['medialist'][0] diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index db921465e..e4a3046af 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -1,15 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) class EggheadCourseIE(InfoExtractor): IE_DESC = 'egghead.io course' IE_NAME = 
'egghead:course' - _VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)' + _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, @@ -22,18 +25,60 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title') - ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list') + course = self._download_json( + 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) + + entries = [ + self.url_result( + 'wistia:%s' % lesson['wistia_id'], ie='Wistia', + video_id=lesson['wistia_id'], video_title=lesson.get('title')) + for lesson in course['lessons'] if lesson.get('wistia_id')] + + return self.playlist_result( + entries, playlist_id, course.get('title'), + course.get('description')) + + +class EggheadLessonIE(InfoExtractor): + IE_DESC = 'egghead.io lesson' + IE_NAME = 'egghead:lesson' + _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', + 'info_dict': { + 'id': 'fv5yotjxcg', + 'ext': 'mp4', + 'title': 'Create linear data flow with container style types (Box)', + 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', + 'thumbnail': r're:^https?:.*\.jpg$', + 'timestamp': 1481296768, + 'upload_date': '20161209', + 'duration': 304, + 'view_count': 0, + 'tags': ['javascript', 'free'], + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + lesson_id = self._match_id(url) - found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul) - entries = [self.url_result(m) for m in found] + lesson = self._download_json( + 
'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id) return { - '_type': 'playlist', - 'id': playlist_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'entries': entries, + '_type': 'url_transparent', + 'ie_key': 'Wistia', + 'url': 'wistia:%s' % lesson['wistia_id'], + 'id': lesson['wistia_id'], + 'title': lesson.get('title'), + 'description': lesson.get('summary'), + 'thumbnail': lesson.get('thumb_nail'), + 'timestamp': unified_timestamp(lesson.get('published_at')), + 'duration': int_or_none(lesson.get('duration')), + 'view_count': int_or_none(lesson.get('plays_count')), + 'tags': try_get(lesson, lambda x: x['tag_list'], list), } diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 8795e0ddf..7a7436068 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -10,7 +10,25 @@ from ..utils import ( class ESPNIE(InfoExtractor): - _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/video/clip(?:\?.*?\bid=|/_/id/)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:\w+\.)+)?espn\.go| + (?:www\.)?espn + )\.com/ + (?: + (?: + video/clip| + watch/player + ) + (?: + \?.*?\bid=| + /_/id/ + ) + ) + (?P<id>\d+) + ''' + _TESTS = [{ 'url': 'http://espn.go.com/video/clip?id=10365079', 'info_dict': { @@ -25,21 +43,35 @@ class ESPNIE(InfoExtractor): 'skip_download': True, }, }, { - # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season - 'url': 'http://espn.go.com/video/clip?id=2743663', + 'url': 'https://broadband.espn.go.com/video/clip?id=18910086', 'info_dict': { - 'id': '2743663', + 'id': '18910086', 'ext': 'mp4', - 'title': 'Must-See Moments: Best of the MLS season', - 'description': 'md5:4c2d7232beaea572632bec41004f0aeb', - 'timestamp': 1449446454, - 'upload_date': '20151207', + 'title': 'Kyrie spins around defender for two', + 'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b', + 'timestamp': 1489539155, + 
'upload_date': '20170315', }, 'params': { 'skip_download': True, }, 'expected_warnings': ['Unable to download f4m manifest'], }, { + 'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672', + 'only_matching': True, + }, { + 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?id=19141491', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player/_/id/19141491', + 'only_matching': True, + }, { 'url': 'http://www.espn.com/video/clip?id=10365079', 'only_matching': True, }, { diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c0020dd7d..a3a97e940 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -45,6 +45,7 @@ from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( @@ -71,6 +72,10 @@ from .arte import ( TheOperaPlatformIE, ArteTVPlaylistIE, ) +from .asiancrush import ( + AsianCrushIE, + AsianCrushPlaylistIE, +) from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE from .atvat import ATVAtIE @@ -90,7 +95,7 @@ from .azmedien import ( ) from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE -from .bandcamp import BandcampIE, BandcampAlbumIE +from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, @@ -98,7 +103,10 @@ from .bbc import ( BBCCoUkPlaylistIE, BBCIE, ) -from .beampro import BeamProLiveIE +from .beampro import ( + BeamProLiveIE, + BeamProVodIE, +) from .beeg import BeegIE from .behindkink import BehindKinkIE from .bellmedia import BellMediaIE @@ -178,8 +186,9 
@@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .clipfish import ClipfishIE +from .cjsw import CJSWIE from .cliphunter import CliphunterIE +from .clippit import ClippitIE from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE @@ -251,7 +260,10 @@ from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE -from .douyutv import DouyuTVIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) from .dplay import ( DPlayIE, DPlayItIE, @@ -287,7 +299,10 @@ from .dw import ( from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE -from .egghead import EggheadCourseIE +from .egghead import ( + EggheadCourseIE, + EggheadLessonIE, +) from .ehow import EHowIE from .eighttracks import EightTracksIE from .einthusan import EinthusanIE @@ -337,7 +352,12 @@ from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE -from .fourtube import FourTubeIE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) from .fox import FOXIE from .fox9 import FOX9IE from .foxgay import FoxgayIE @@ -350,9 +370,9 @@ from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( - PluzzIE, - FranceTvInfoIE, FranceTVIE, + FranceTVEmbedIE, + FranceTVInfoIE, GenerationQuoiIE, CultureboxIE, ) @@ -386,7 +406,6 @@ from .globo import ( from .go import GoIE from .go90 import Go90IE from .godtube import GodTubeIE -from .godtv import GodTVIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE @@ -460,6 +479,7 @@ from .jamendo import ( ) from .jeuxvideo import JeuxVideoIE from .jove import JoveIE +from .joj import JojIE from .jwplatform import JWPlatformIE from 
.jpopsukitv import JpopsukiIE from .kaltura import KalturaIE @@ -490,6 +510,7 @@ from .la7 import LA7IE from .laola1tv import ( Laola1TvEmbedIE, Laola1TvIE, + ITTFIE, ) from .lci import LCIIE from .lcp import ( @@ -517,7 +538,10 @@ from .limelight import ( LimelightChannelListIE, ) from .litv import LiTVIE -from .liveleak import LiveLeakIE +from .liveleak import ( + LiveLeakIE, + LiveLeakEmbedIE, +) from .livestream import ( LivestreamIE, LivestreamOriginalIE, @@ -540,9 +564,12 @@ from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, ) +from .manyvids import ManyVidsIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .mediaset import MediasetIE from .medici import MediciIE +from .megaphone import MegaphoneIE from .meipai import MeipaiIE from .melonvod import MelonVODIE from .meta import METAIE @@ -569,7 +596,6 @@ from .mixcloud import ( ) from .mlb import MLBIE from .mnet import MnetIE -from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE @@ -630,7 +656,10 @@ from .neteasemusic import ( NetEaseMusicProgramIE, NetEaseMusicDjRadioIE, ) -from .newgrounds import NewgroundsIE +from .newgrounds import ( + NewgroundsIE, + NewgroundsPlaylistIE, +) from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, @@ -638,6 +667,10 @@ from .nextmedia import ( AppleDailyIE, NextTVIE, ) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .nfb import NFBIE from .nfl import NFLIE from .nhk import NhkVodIE @@ -651,6 +684,7 @@ from .nick import ( NickIE, NickDeIE, NickNightIE, + NickRuIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE from .ninecninemedia import ( @@ -663,6 +697,7 @@ from .nintendo import NintendoIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE +from .nonktube import NonkTubeIE from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE @@ -732,8 +767,9 @@ from .openload 
import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, - ORFOE1IE, ORFFM4IE, + ORFFM4StoryIE, + ORFOE1IE, ORFIPTVIE, ) from .packtpub import ( @@ -745,6 +781,7 @@ from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE +from .pearvideo import PearVideoIE from .people import PeopleIE from .periscope import ( PeriscopeIE, @@ -810,11 +847,16 @@ from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import ( RaiPlayIE, + RaiPlayLiveIE, RaiIE, ) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import RedBullTVIE +from .reddit import ( + RedditIE, + RedditRIE, +) from .redtube import RedTubeIE from .regiotv import RegioTVIE from .rentv import ( @@ -858,9 +900,11 @@ from .rutube import ( RutubeEmbedIE, RutubeMovieIE, RutubePersonIE, + RutubePlaylistIE, ) from .rutv import RUTVIE from .ruutu import RuutuIE +from .ruv import RuvIE from .sandia import SandiaIE from .safari import ( SafariIE, @@ -907,8 +951,9 @@ from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE, + SoundcloudTrackStationIE, SoundcloudPlaylistIE, - SoundcloudSearchIE + SoundcloudSearchIE, ) from .soundgasm import ( SoundgasmIE, @@ -957,6 +1002,7 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE +from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachertube import ( @@ -965,7 +1011,6 @@ from .teachertube import ( ) from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE -from .teamfourstar import TeamFourStarIE from .techtalks import TechTalksIE from .ted import TEDIE from .tele13 import Tele13IE @@ -1014,11 +1059,6 @@ from .trilulilu import TriluliluIE from .trutv import TruTVIE from .tube8 import Tube8IE from .tubitv import TubiTvIE -from .tudou import ( - TudouIE, - TudouPlaylistIE, - TudouAlbumIE, -) from .tumblr import TumblrIE 
from .tunein import ( TuneInClipIE, @@ -1098,6 +1138,10 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) +from .upskill import ( + UpskillIE, + UpskillCourseIE, +) from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE @@ -1125,6 +1169,7 @@ from .vgtv import ( from .vh1 import VH1IE from .vice import ( ViceIE, + ViceArticleIE, ViceShowIE, ) from .viceland import VicelandIE @@ -1187,12 +1232,14 @@ from .vk import ( ) from .vlive import ( VLiveIE, - VLiveChannelIE + VLiveChannelIE, + VLivePlaylistIE ) from .vodlocker import VodlockerIE from .vodpl import VODPlIE from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE +from .voot import VootIE from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE @@ -1214,6 +1261,7 @@ from .washingtonpost import ( WashingtonPostArticleIE, ) from .wat import WatIE +from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, @@ -1263,12 +1311,12 @@ from .yahoo import ( YahooIE, YahooSearchIE, ) -from .yam import YamIE from .yandexmusic import ( YandexMusicTrackIE, YandexMusicAlbumIE, YandexMusicPlaylistIE, ) +from .yandexdisk import YandexDiskIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .ynet import YnetIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index b69c1ede0..4b3f6cc86 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -203,19 +203,19 @@ class FacebookIE(InfoExtractor): }] @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) - if mobj is not None: - return mobj.group('url') - + def _extract_urls(webpage): + urls = [] + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', + webpage): + 
urls.append(mobj.group('url')) # Facebook API embed # see https://developers.facebook.com/docs/plugins/embedded-video-player - mobj = re.search(r'''(?x)<div[^>]+ + for mobj in re.finditer(r'''(?x)<div[^>]+ class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ - data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) - if mobj is not None: - return mobj.group('url') + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): + urls.append(mobj.group('url')) + return urls def _login(self): (useremail, password) = self._get_login_info() diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 081c71842..4803a22c8 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -102,6 +102,8 @@ class FirstTVIE(InfoExtractor): 'format_id': f.get('name'), 'tbr': tbr, 'source_preference': quality(f.get('name')), + # quality metadata of http formats may be incorrect + 'preference': -1, }) # m3u8 URL format is reverse engineered from [1] (search for # master.m3u8). 
dashEdges (that is currently balancer-vod.1tv.ru) diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py index 15736c9fe..9f9863746 100644 --- a/youtube_dl/extractor/fivetv.py +++ b/youtube_dl/extractor/fivetv.py @@ -43,7 +43,7 @@ class FiveTVIE(InfoExtractor): 'info_dict': { 'id': 'glavnoe', 'ext': 'mp4', - 'title': 'Итоги недели с 8 по 14 июня 2015 года', + 'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -70,7 +70,8 @@ class FiveTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"', + [r'<div[^>]+?class="flowplayer[^>]+?data-href="([^"]+)"', + r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') title = self._og_search_title(webpage, default=None) or self._search_regex( diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index a8e1bf42a..9f166efd4 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) from ..utils import ( ExtractorError, int_or_none, @@ -81,7 +84,7 @@ class FlickrIE(InfoExtractor): formats = [] for stream in streams['stream']: - stream_type = str(stream.get('type')) + stream_type = compat_str(stream.get('type')) formats.append({ 'format_id': stream_type, 'url': stream['_content'], diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 9776c8422..ad273a0e7 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -3,39 +3,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( parse_duration, parse_iso8601, - sanitized_Request, 
str_to_int, ) -class FourTubeIE(InfoExtractor): - IE_NAME = '4tube' - _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)' +class FourTubeBaseIE(InfoExtractor): + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') - _TEST = { - 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', - 'md5': '6516c8ac63b03de06bc8eac14362db4f', - 'info_dict': { - 'id': '209733', - 'ext': 'mp4', - 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', - 'uploader': 'WCP Club', - 'uploader_id': 'wcp-club', - 'upload_date': '20131031', - 'timestamp': 1383263892, - 'duration': 583, - 'view_count': int, - 'like_count': int, - 'categories': list, - 'age_limit': 18, - } - } + if kind == 'm' or not display_id: + url = self._URL_TEMPLATE % video_id - def _real_extract(self, url): - video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_meta('name', webpage) @@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', + r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', + r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">', webpage, 'uploader', fatal=False) categories_html = self._search_regex( @@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor): view_count = str_to_int(self._search_regex( r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', - webpage, 'view count', fatal=False)) + webpage, 'view count', 
default=None)) like_count = str_to_int(self._search_regex( r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', - webpage, 'like count', fatal=False)) + webpage, 'like count', default=None)) duration = parse_duration(self._html_search_meta('duration', webpage)) media_id = self._search_regex( @@ -85,14 +68,14 @@ class FourTubeIE(InfoExtractor): media_id = params[0] sources = ['%s' % p for p in params[2]] - token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format( + token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format( media_id, '+'.join(sources)) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - b'Origin': b'http://www.4tube.com', - } - token_req = sanitized_Request(token_url, b'{}', headers) - tokens = self._download_json(token_req, video_id) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) formats = [{ 'url': tokens[format]['token'], 'format_id': format + 'p', @@ -115,3 +98,126 @@ class FourTubeIE(InfoExtractor): 'duration': duration, 'age_limit': 18, } + + +class FourTubeIE(FourTubeBaseIE): + IE_NAME = '4tube' + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' 
+ _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video' + _TESTS = [{ + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'timestamp': 1383263892, + 'duration': 583, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + }, { + 'url': 'http://www.4tube.com/embed/209733', + 'only_matching': True, + }, { + 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'only_matching': True, + }] + + +class FuxIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.fux.com/video/%s/video' + _TESTS = [{ + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'info_dict': { + 'id': '195359', + 'ext': 'mp4', + 'title': 'Awesome fucking in the kitchen ends with cum swallow', + 'uploader': 'alenci2342', + 'uploader_id': 'alenci2342', + 'upload_date': '20131230', + 'timestamp': 1388361660, + 'duration': 289, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.fux.com/embed/195359', + 'only_matching': True, + }, { + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'only_matching': True, + }] + + +class PornTubeIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' + _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', + 'info_dict': { + 'id': '7089759', + 
'ext': 'mp4', + 'title': 'Teen couple doing anal', + 'uploader': 'Alexy', + 'uploader_id': 'Alexy', + 'upload_date': '20150606', + 'timestamp': 1433595647, + 'duration': 5052, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/embed/7089759', + 'only_matching': True, + }, { + 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759', + 'only_matching': True, + }] + + +class PornerBrosIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)' + _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '181369', + 'ext': 'mp4', + 'title': 'Skinny brunette takes big cock down her anal hole', + 'uploader': 'PornerBros HD', + 'uploader_id': 'pornerbros-hd', + 'upload_date': '20130130', + 'timestamp': 1359527401, + 'duration': 1224, + 'view_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.pornerbros.com/embed/181369', + 'only_matching': True, + }, { + 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 159fdf9c4..facc665f6 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -3,56 +3,99 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - smuggle_url, - update_url_query, + int_or_none, + parse_age_limit, + parse_duration, + try_get, + unified_timestamp, ) class FOXIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)' - _TEST = { - 'url': 
'http://www.fox.com/watch/255180355939/7684182528', + _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _TESTS = [{ + # clip + 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', 'md5': 'ebd296fcc41dd4b19f8115d8461a3165', 'info_dict': { - 'id': '255180355939', + 'id': '4b765a60490325103ea69888fb2bd4e8', 'ext': 'mp4', - 'title': 'Official Trailer: Gotham', - 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', - 'duration': 129, - 'timestamp': 1400020798, - 'upload_date': '20140513', - 'uploader': 'NEWA-FNG-FOXCOM', + 'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'description': 'md5:549cd9c70d413adb32ce2a779b53b486', + 'duration': 102, + 'timestamp': 1504291893, + 'upload_date': '20170901', + 'creator': 'FOX', + 'series': 'Gotham', }, - 'add_ie': ['ThePlatform'], - } + 'params': { + 'skip_download': True, + }, + }, { + # episode, geo-restricted + 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/', + 'only_matching': True, + }, { + # episode, geo-restricted, tv provided required + 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), video_id) - fox_pdk_player = settings['fox_pdk_player'] - release_url = fox_pdk_player['release_url'] - query = { - 'mbr': 'true', - 'switch': 'http' - } - if fox_pdk_player.get('access') == 'locked': - ap_p = settings['foxAdobePassProvider'] - rating = ap_p.get('videoRating') - if rating == 'n/a': - rating = None - resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) 
- - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'id': video_id, - }) - return info + video = self._download_json( + 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id, + video_id, headers={ + 'apikey': 'abdcbed02c124d393b39e818a4312055', + 'Content-Type': 'application/json', + 'Referer': url, + }) + + title = video['name'] + + m3u8_url = self._download_json( + video['videoRelease']['url'], video_id)['playURL'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = video.get('description') + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( + video.get('duration')) or parse_duration(video.get('duration')) + timestamp = unified_timestamp(video.get('datePublished')) + age_limit = parse_age_limit(video.get('contentRating')) + + data = try_get( + video, lambda x: x['trackingData']['properties'], dict) or {} + + creator = data.get('brand') or data.get('network') or video.get('network') + + series = video.get('seriesName') or data.get( + 'seriesName') or data.get('show') + season_number = int_or_none(video.get('seasonNumber')) + episode = video.get('name') + episode_number = int_or_none(video.get('episodeNumber')) + release_year = int_or_none(video.get('releaseYear')) + + if data.get('authRequired'): + # TODO: AP + pass + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'creator': creator, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'release_year': release_year, + 'formats': formats, + } diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 
e887ae488..512a10645 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -5,6 +5,7 @@ import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, + int_or_none, remove_end, ) @@ -46,7 +47,7 @@ class FoxgayIE(InfoExtractor): formats = [{ 'url': source, - 'height': resolution, + 'height': int_or_none(resolution), } for source, resolution in zip( video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 48d43ae58..2bcbb3e39 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -21,11 +21,13 @@ from .dailymotion import ( class FranceTVBaseInfoExtractor(InfoExtractor): - def _extract_video(self, video_id, catalogue): + def _extract_video(self, video_id, catalogue=None): info = self._download_json( - 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s' - % (video_id, catalogue), - video_id, 'Downloading video JSON') + 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', + video_id, 'Downloading video JSON', query={ + 'idDiffusion': video_id, + 'catalogue': catalogue or '', + }) if info.get('status') == 'NOK': raise ExtractorError( @@ -109,27 +111,100 @@ class FranceTVBaseInfoExtractor(InfoExtractor): } -class PluzzIE(FranceTVBaseInfoExtractor): - IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html' +class FranceTVIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html' - # Can't use tests, videos expire in 7 days + _TESTS = [{ + 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', + 'info_dict': { + 'id': '157550144', + 'ext': 'mp4', + 'title': '13h15, le dimanche... 
- Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1494156300, + 'upload_date': '20170507', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + # france3 + 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', + 'only_matching': True, + }, { + # france4 + 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html', + 'only_matching': True, + }, { + # france5 + 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html', + 'only_matching': True, + }, { + # franceo + 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', + 'only_matching': True, + }, { + # france2 live + 'url': 'https://www.france.tv/france-2/direct.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html', + 'only_matching': True, + }, { + 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/142749-rouge-sang.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_meta( - 'id_video', webpage, 'video id', default=None) + catalogue = None + video_id = self._search_regex( + r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'video id', default=None, group='id') + if not video_id: - video_id = self._search_regex( - r'data-diffusion=["\'](\d+)', webpage, 'video id') + video_id, catalogue = self._html_search_regex( + r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', + webpage, 'video 
ID').split('@') + return self._extract_video(video_id, catalogue) - return self._extract_video(video_id, 'Pluzz') +class FranceTVEmbedIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' -class FranceTvInfoIE(FranceTVBaseInfoExtractor): + _TEST = { + 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', + 'info_dict': { + 'id': 'NI_983319', + 'ext': 'mp4', + 'title': 'Le Pen Reims', + 'upload_date': '20170505', + 'timestamp': 1493981780, + 'duration': 16, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, + video_id) + + return self._extract_video(video['video_id'], video.get('catalog')) + + +class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' @@ -233,124 +308,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id, catalogue) -class FranceTVIE(FranceTVBaseInfoExtractor): - IE_NAME = 'francetv' - IE_DESC = 'France 2, 3, 4, 5 and Ô' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?france[2345o]\.fr/ - (?: - emissions/[^/]+/(?:videos|diffusions)| - emission/[^/]+| - videos| - jt - ) - /| - embed\.francetv\.fr/\?ue= - ) - (?P<id>[^/?]+) - ''' - - _TESTS = [ - # france2 - { - 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - 'md5': 'c03fc87cb85429ffd55df32b9fc05523', - 'info_dict': { - 'id': '109169362', - 'ext': 'flv', - 'title': '13h15, le dimanche...', - 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7', - 'upload_date': '20140914', - 'timestamp': 1410693600, - }, - }, - # france3 - { - 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', - 'md5': '679bb8f8921f8623bd658fa2f8364da0', - 'info_dict': { - 'id': 
'000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', - 'ext': 'mp4', - 'title': 'Le scandale du prix des médicaments', - 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce', - 'upload_date': '20131113', - 'timestamp': 1384380000, - }, - }, - # france4 - { - 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c', - 'info_dict': { - 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'ext': 'mp4', - 'title': 'Hero Corp Making of - Extrait 1', - 'description': 'md5:c87d54871b1790679aec1197e73d650a', - 'upload_date': '20131106', - 'timestamp': 1383766500, - }, - }, - # france5 - { - 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1', - 'md5': 'f6c577df3806e26471b3d21631241fd0', - 'info_dict': { - 'id': '123327454', - 'ext': 'flv', - 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?', - 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4', - 'upload_date': '20150831', - 'timestamp': 1441035120, - }, - }, - # franceo - { - 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015', - 'md5': '47d5816d3b24351cdce512ad7ab31da8', - 'info_dict': { - 'id': '125377621', - 'ext': 'flv', - 'title': 'Infô soir', - 'description': 'md5:01b8c6915a3d93d8bbbd692651714309', - 'upload_date': '20150718', - 'timestamp': 1437241200, - 'duration': 414, - }, - }, - { - # francetv embed - 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', - 'info_dict': { - 'id': 'EV_30231', - 'ext': 'flv', - 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', - 'upload_date': '20150226', - 'timestamp': 1424989860, - 'duration': 5400, - }, - }, - { - 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', - 'only_matching': True, - }, - { - 'url': 
'http://www.franceo.fr/videos/125377617', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_id, catalogue = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', - webpage, 'video ID').split('@') - return self._extract_video(video_id, catalogue) - - class GenerationQuoiIE(InfoExtractor): IE_NAME = 'france2.fr:generation-quoi' _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)' diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index e44a2a87f..8c37509ec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,15 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_unquote_plus, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, int_or_none, js_to_json, - sanitized_Request, ExtractorError, urlencode_postdata ) @@ -20,6 +16,7 @@ class FunimationIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)' _NETRC_MACHINE = 'funimation' + _TOKEN = None _TESTS = [{ 'url': 'https://www.funimation.com/shows/hacksign/role-play/', @@ -38,56 +35,38 @@ class FunimationIE(InfoExtractor): }, { 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', 'info_dict': { - 'id': '9635', + 'id': '210051', 'display_id': 'broadcast-dub-preview', 'ext': 'mp4', 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', - 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, - 'skip': 'Access without user interaction is forbidden by CloudFlare', + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 
'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, }] - _LOGIN_URL = 'http://www.funimation.com/login' - - def _extract_cloudflare_session_ua(self, url): - ci_session_cookie = self._get_cookies(url).get('ci_session') - if ci_session_cookie: - ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) - # ci_session is a string serialized by PHP function serialize() - # This case is simple enough to use regular expressions only - return self._search_regex( - r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', - default=None) - def _login(self): (username, password) = self._get_login_info() if username is None: return - data = urlencode_postdata({ - 'email_field': username, - 'password_field': password, - }) - user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) - if not user_agent: - user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' - login_request = sanitized_Request(self._LOGIN_URL, data, headers={ - 'User-Agent': user_agent, - 'Content-Type': 'application/x-www-form-urlencoded' - }) - login_page = self._download_webpage( - login_request, None, 'Logging in as %s' % username) - if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')): - return - error = self._html_search_regex( - r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>', - login_page, 'error messages', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') + try: + data = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', + None, 'Logging in as %s' % username, data=urlencode_postdata({ + 'username': username, + 'password': password, + })) + self._TOKEN = data['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['error'] + raise 
ExtractorError(error, expected=True) + raise def _real_initialize(self): self._login() @@ -125,9 +104,12 @@ class FunimationIE(InfoExtractor): description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True) try: + headers = {} + if self._TOKEN: + headers['Authorization'] = 'Token %s' % self._TOKEN sources = self._download_json( 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, - video_id)['items'] + video_id, headers=headers)['items'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read(), video_id)['errors'][0] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 49409369c..f85e7de14 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -1,10 +1,14 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + unified_timestamp, +) class FunnyOrDieIE(InfoExtractor): @@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor): 'title': 'Heart-Shaped Box: Literal Video Version', 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338', 'thumbnail': r're:^http:.*\.jpg$', + 'uploader': 'DASjr', + 'timestamp': 1317904928, + 'upload_date': '20111006', + 'duration': 318.3, }, }, { 'url': 'http://www.funnyordie.com/embed/e402820827', @@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor): 'title': 'Please Use This Song (Jon Lajoie)', 'description': 'Please use this to sell something. 
www.jonlajoie.com', 'thumbnail': r're:^http:.*\.jpg$', + 'timestamp': 1398988800, + 'upload_date': '20140502', }, 'params': { 'skip_download': True, @@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor): 'url': 'http://www.funnyordie.com%s' % src, }] - post_json = self._search_regex( - r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') - post = json.loads(post_json) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp', default=None)) + + uploader = self._html_search_regex( + r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h', + webpage, 'uploader', default=None) + + title, description, thumbnail, duration = [None] * 4 + + medium = self._parse_json( + self._search_regex( + r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium', + default='{}'), + video_id, fatal=False) + if medium: + title = medium.get('title') + duration = float_or_none(medium.get('duration')) + if not timestamp: + timestamp = unified_timestamp(medium.get('publishDate')) + + post = self._parse_json( + self._search_regex( + r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details', + default='{}'), + video_id, fatal=False) + if post: + if not title: + title = post.get('name') + description = post.get('description') + thumbnail = post.get('picture') + + if not title: + title = self._og_search_title(webpage) + if not description: + description = self._og_search_description(webpage) + if not duration: + duration = int_or_none(self._html_search_meta( + ('video:duration', 'duration'), webpage, 'duration', default=False)) return { 'id': video_id, - 'title': post['name'], - 'description': post.get('description'), - 'thumbnail': post.get('picture'), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, 'formats': formats, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index 36ba7d8cf..1726a6704 100644 --- 
a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -6,62 +6,52 @@ from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, - js_to_json, unified_strdate, ) class GaskrankIE(InfoExtractor): - """InfoExtractor for gaskrank.tv""" - _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.html?' - _TESTS = [ - { - 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', - 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', - 'info_dict': { - 'id': '201601/26955', - 'ext': 'mp4', - 'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', - 'thumbnail': r're:^https?://.*\.jpg$', - 'categories': ['motorrad-fun'], - 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', - 'uploader_id': 'Bikefun', - 'upload_date': '20170110', - 'uploader_url': None, - } - }, - { - 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', - 'md5': 'c33ee32c711bc6c8224bfcbe62b23095', - 'info_dict': { - 'id': '201106/15920', - 'ext': 'mp4', - 'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', - 'thumbnail': r're:^https?://.*\.jpg$', - 'categories': ['racing'], - 'display_id': 'isle-of-man-tt-2011-michael-du-15920', - 'uploader_id': 'IOM', - 'upload_date': '20160506', - 'uploader_url': 'www.iomtt.com', - } + _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.htm' + _TESTS = [{ + 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', + 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', + 'info_dict': { + 'id': '201601/26955', + 'ext': 'mp4', + 'title': 'Strike! 
Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', + 'thumbnail': r're:^https?://.*\.jpg$', + 'categories': ['motorrad-fun'], + 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', + 'uploader_id': 'Bikefun', + 'upload_date': '20170110', + 'uploader_url': None, } - ] + }, { + 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', + 'md5': 'c33ee32c711bc6c8224bfcbe62b23095', + 'info_dict': { + 'id': '201106/15920', + 'ext': 'mp4', + 'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', + 'thumbnail': r're:^https?://.*\.jpg$', + 'categories': ['racing'], + 'display_id': 'isle-of-man-tt-2011-michael-du-15920', + 'uploader_id': 'IOM', + 'upload_date': '20170523', + 'uploader_url': 'www.iomtt.com', + } + }] def _real_extract(self, url): - """extract information from gaskrank.tv""" - def fix_json(code): - """Removes trailing comma in json: {{},} --> {{}}""" - return re.sub(r',\s*}', r'}', js_to_json(code)) - display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, fatal=True) + categories = [re.match(self._VALID_URL, url).group('categories')] - title = self._search_regex( - r'movieName\s*:\s*\'([^\']*)\'', - webpage, 'title') - thumbnail = self._search_regex( - r'poster\s*:\s*\'([^\']*)\'', - webpage, 'thumbnail', default=None) mobj = re.search( r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])', @@ -89,29 +79,14 @@ class GaskrankIE(InfoExtractor): if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) - playlist = self._parse_json( - self._search_regex( - r'playlist\s*:\s*\[([^\]]*)\]', - webpage, 'playlist', default='{}'), - display_id, transform_source=fix_json, fatal=False) - video_id = self._search_regex( 
r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', - playlist.get('0').get('src'), 'video id') - - formats = [] - for key in playlist: - formats.append({ - 'url': playlist[key]['src'], - 'format_id': key, - 'quality': playlist[key].get('quality')}) - self._sort_formats(formats, field_preference=['format_id']) + webpage, 'video id', default=display_id) - return { + entry = self._parse_html5_media_entries(url, webpage, video_id)[0] + entry.update({ 'id': video_id, 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, 'categories': categories, 'display_id': display_id, 'uploader_id': uploader_id, @@ -120,4 +95,7 @@ class GaskrankIE(InfoExtractor): 'tags': tags, 'view_count': view_count, 'average_rating': average_rating, - } + }) + self._sort_formats(entry['formats']) + + return entry diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3136427db..f71d9092e 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor): 'format': 'jp', # The japanese audio } }, + { + # gdc-player.html + 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', + 'info_dict': { + 'id': '1435', + 'display_id': 'An-American-engine-in-Tokyo', + 'ext': 'flv', + 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + }, + }, ] def _login(self, webpage_url, display_id): @@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor): 'title': title, } - PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>' + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' xml_root = self._html_search_regex( PLAYER_REGEX, start_page, 'xml root', default=None) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b06f43446..b83c18380 100644 --- 
a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -10,6 +10,7 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_etree_fromstring, + compat_str, compat_urllib_parse_unquote, compat_urlparse, compat_xml_parse_error, @@ -35,6 +36,10 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -56,6 +61,7 @@ from .dailymotion import ( DailymotionIE, DailymotionCloudIE, ) +from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .mtv import MTVServicesEmbeddedIE @@ -88,6 +94,11 @@ from .rutube import RutubeIE from .limelight import LimelightBaseIE from .anvato import AnvatoIE from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE +from .joj import JojIE +from .megaphone import MegaphoneIE +from .vzaar import VzaarIE class GenericIE(InfoExtractor): @@ -565,6 +576,19 @@ class GenericIE(InfoExtractor): }, 'skip': 'movie expired', }, + # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js + { + 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', + 'info_dict': { + 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', + 'ext': 'mp4', + 'title': 'Steampunk Fest Comes to Honesdale', + 'duration': 43.276, + }, + 'params': { + 'skip_download': True, + } + }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -756,6 +780,20 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Dailymotion'], }, + # DailyMail embed + { + 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', + 'info_dict': { + 'id': '1495629', + 'ext': 'mp4', + 'title': 'Care worker punches elderly dementia patient in head 
11 times', + 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', + }, + 'add_ie': ['DailyMail'], + 'params': { + 'skip_download': True, + }, + }, # YouTube embed { 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', @@ -1182,7 +1220,7 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - # Eagle.Platform embed (generic URL) + # EaglePlatform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1196,8 +1234,26 @@ class GenericIE(InfoExtractor): 'view_count': int, 'age_limit': 0, }, + 'params': { + 'skip_download': True, + }, + }, + # referrer protected EaglePlatform embed + { + 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', + 'info_dict': { + 'id': '582306', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3382, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, - # ClipYou (Eagle.Platform) embed (custom URL) + # ClipYou (EaglePlatform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1209,6 +1265,9 @@ class GenericIE(InfoExtractor): 'duration': 216, 'view_count': int, }, + 'params': { + 'skip_download': True, + }, }, # Pladform embed { @@ -1460,14 +1519,27 @@ class GenericIE(InfoExtractor): # LiveLeak embed { 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'md5': '7619da8c820e835bef21a1efa2a0fc71', 'info_dict': { 'id': '874_1459135191', 'ext': 'mp4', 'title': 'Man shows poor quality of new apartment building', 'description': 'The wall is like a sand pile.', 'uploader': 'Lake8737', - } + }, + 'add_ie': [LiveLeakIE.ie_key()], + }, + # Another LiveLeak embed pattern (#13336) + { + 'url': 
'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', + 'info_dict': { + 'id': '2eb_1496309988', + 'ext': 'mp4', + 'title': 'Thief robs place where everyone was armed', + 'description': 'md5:694d73ee79e535953cf2488562288eee', + 'uploader': 'brazilwtf', + }, + 'add_ie': [LiveLeakIE.ie_key()], }, # Duplicated embedded video URLs { @@ -1509,6 +1581,22 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['BrightcoveLegacy'], }, + # Nexx embed + { + 'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503', + 'info_dict': { + 'id': '247746', + 'ext': 'mp4', + 'title': "Yesterday's Jam (OV)", + 'description': 'md5:09bc0984723fed34e2581624a84e05f0', + 'timestamp': 1492594816, + 'upload_date': '20170419', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, # Facebook <iframe> embed { 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', @@ -1519,6 +1607,21 @@ class GenericIE(InfoExtractor): 'title': 'Facebook video #599637780109885', }, }, + # Facebook <iframe> embed, plugin video + { + 'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/', + 'info_dict': { + 'id': '1754168231264132', + 'ext': 'mp4', + 'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...', + 'uploader': 'Tariq Ramadan (official)', + 'timestamp': 1496758379, + 'upload_date': '20170606', + }, + 'params': { + 'skip_download': True, + }, + }, # Facebook API embed { 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', @@ -1696,6 +1799,21 @@ class GenericIE(InfoExtractor): 'playlist_mincount': 5, }, { + # Limelight embed (LimelightPlayerUtil.embed) + 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', + 'info_dict': { + 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', + 'ext': 'mp4', + 'title': '07448641', + 'timestamp': 1499890639, + 'upload_date': '20170712', + }, + 'params': 
{ + 'skip_download': True, + }, + 'add_ie': ['LimelightMedia'], + }, + { 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', 'info_dict': { 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', @@ -1718,6 +1836,49 @@ class GenericIE(InfoExtractor): }, 'add_ie': [WashingtonPostIE.ie_key()], }, + { + # Mediaset embed + 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', + 'info_dict': { + 'id': '720642', + 'ext': 'mp4', + 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [MediasetIE.ie_key()], + }, + { + # JOJ.sk embeds + 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'info_dict': { + 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'title': 'Slovenskom sa prehnala vlna silných búrok', + }, + 'playlist_mincount': 5, + 'add_ie': [JojIE.ie_key()], + }, + { + # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) + 'url': 'https://tvrain.ru/amp/418921/', + 'md5': 'cc00413936695987e8de148b67d14f1d', + 'info_dict': { + 'id': '418921', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + }, + }, + { + # vzaar embed + 'url': 'http://help.vzaar.com/article/165-embedding-video', + 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', + 'info_dict': { + 'id': '8707641', + 'ext': 'mp4', + 'title': 'Building A Business Online: Principal Chairs Q & A', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1867,7 +2028,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = head_response.geturl() + new_url = compat_str(head_response.geturl()) if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -1892,14 +2053,14 @@ class 
GenericIE(InfoExtractor): content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: - format_id = m.group('format_id') + format_id = compat_str(m.group('format_id')) if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: formats = [{ - 'format_id': m.group('format_id'), + 'format_id': format_id, 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] @@ -1968,7 +2129,7 @@ class GenericIE(InfoExtractor): elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, video_id, - mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict @@ -2017,6 +2178,13 @@ class GenericIE(InfoExtractor): video_description = self._og_search_description(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None) + info_dict.update({ + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit, + }) + # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -2038,6 +2206,16 @@ class GenericIE(InfoExtractor): if bc_urls: return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') + # Look for Nexx embeds + nexx_urls = NexxIE._extract_urls(webpage) + if nexx_urls: + return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) + + # Look for Nexx iFrame embeds + nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) + if nexx_embed_urls: + return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) 
+ # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) if tp_urls: @@ -2065,36 +2243,11 @@ class GenericIE(InfoExtractor): if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') - # Look for embedded YouTube player - matches = re.findall(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) - \1''', webpage) - if matches: + # Look for YouTube embeds + youtube_urls = YoutubeIE._extract_urls(webpage) + if youtube_urls: return self.playlist_from_matches( - matches, video_id, video_title, lambda m: unescapeHTML(m[1])) - - # Look for lazyYT YouTube embed - matches = re.findall( - r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) - - # Look for Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) + youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) matches = DailymotionIE._extract_urls(webpage) if matches: @@ -2110,58 +2263,27 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - # Look for embedded Wistia player - match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) - if match: - embed_url = self._proto_relative_url( - unescapeHTML(match.group('url'))) - return { - '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': 'Wistia', - 'uploader': video_uploader, - } + # Look 
for DailyMail embeds + dailymail_urls = DailyMailIE._extract_urls(webpage) + if dailymail_urls: + return self.playlist_from_matches( + dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) - if match: + # Look for embedded Wistia player + wistia_url = WistiaIE._extract_url(webpage) + if wistia_url: return { '_type': 'url_transparent', - 'url': 'wistia:%s' % match.group('id'), - 'ie_key': 'Wistia', + 'url': self._proto_relative_url(wistia_url), + 'ie_key': WistiaIE.ie_key(), 'uploader': video_uploader, } - match = re.search( - r'''(?sx) - <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? - <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 - ''', webpage) - if match: - return self.url_result(self._proto_relative_url( - 'wistia:%s' % match.group('id')), 'Wistia') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: return self.url_result(svt_url, 'SVT') - # Look for embedded condenast player - matches = re.findall( - r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")', - webpage) - if matches: - return { - '_type': 'playlist', - 'entries': [{ - '_type': 'url', - 'ie_key': 'CondeNast', - 'url': ma, - } for ma in matches], - 'title': video_title, - 'id': video_id, - } - # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -2198,6 +2320,7 @@ class GenericIE(InfoExtractor): # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or + re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or 
re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: @@ -2243,9 +2366,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for embedded Facebook player - facebook_url = FacebookIE._extract_url(webpage) - if facebook_url is not None: - return self.url_result(facebook_url, 'Facebook') + facebook_urls = FacebookIE._extract_urls(webpage) + if facebook_urls: + return self.playlist_from_matches(facebook_urls, video_id, video_title) # Look for embedded VK player mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -2442,12 +2565,12 @@ class GenericIE(InfoExtractor): if kaltura_url: return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) - # Look for Eagle.Platform embeds + # Look for EaglePlatform embeds eagleplatform_url = EaglePlatformIE._extract_url(webpage) if eagleplatform_url: - return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) + return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - # Look for ClipYou (uses Eagle.Platform) embeds + # Look for ClipYou (uses EaglePlatform) embeds mobj = re.search( r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) if mobj is not None: @@ -2555,29 +2678,6 @@ class GenericIE(InfoExtractor): return self.playlist_result( limelight_urls, video_id, video_title, video_description) - mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) - if mobj: - lm = { - 'Media': 'media', - 'Channel': 'channel', - 'ChannelList': 'channel_list', - } - return self.url_result(smuggle_url('limelight:%s:%s' % ( - lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), - 'Limelight%s' % mobj.group(1), mobj.group(2)) - - mobj = re.search( - r'''(?sx) 
- <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*? - <param[^>]+ - name=(["\'])flashVars\2[^>]+ - value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) - ''', webpage) - if mobj: - return self.url_result(smuggle_url( - 'limelight:media:%s' % mobj.group('id'), - {'source_url': url}), 'LimelightMedia', mobj.group('id')) - # Look for Anvato embeds anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) if anvato_urls: @@ -2645,9 +2745,9 @@ class GenericIE(InfoExtractor): self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) # Look for LiveLeak embeds - liveleak_url = LiveLeakIE._extract_url(webpage) - if liveleak_url: - return self.url_result(liveleak_url, 'LiveLeak') + liveleak_urls = LiveLeakIE._extract_urls(webpage) + if liveleak_urls: + return self.playlist_from_matches(liveleak_urls, video_id, video_title) # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) @@ -2699,7 +2799,7 @@ class GenericIE(InfoExtractor): rutube_urls = RutubeIE._extract_urls(webpage) if rutube_urls: return self.playlist_from_matches( - rutube_urls, ie=RutubeIE.ie_key()) + rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) # Look for WashingtonPost embeds wapo_urls = WashingtonPostIE._extract_urls(webpage) @@ -2707,18 +2807,44 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - info_dict.update({ - 'title': video_title or info_dict['title'], - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit - }) - info_dict.update(json_ld) - return info_dict + # Look for Mediaset embeds + mediaset_urls = MediasetIE._extract_urls(webpage) + if mediaset_urls: + return self.playlist_from_matches( + mediaset_urls, video_id, video_title, 
ie=MediasetIE.ie_key()) + + # Look for JOJ.sk embeds + joj_urls = JojIE._extract_urls(webpage) + if joj_urls: + return self.playlist_from_matches( + joj_urls, video_id, video_title, ie=JojIE.ie_key()) + + # Look for megaphone.fm embeds + mpfn_urls = MegaphoneIE._extract_urls(webpage) + if mpfn_urls: + return self.playlist_from_matches( + mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) + + # Look for vzaar embeds + vzaar_urls = VzaarIE._extract_urls(webpage) + if vzaar_urls: + return self.playlist_from_matches( + vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + + def merge_dicts(dict1, dict2): + merged = {} + for k, v in dict1.items(): + if v is not None: + merged[k] = v + for k, v in dict2.items(): + if v is None: + continue + if (k not in merged or + (isinstance(v, compat_str) and v and + isinstance(merged[k], compat_str) and + not merged[k])): + merged[k] = v + return merged # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') @@ -2736,9 +2862,13 @@ class GenericIE(InfoExtractor): if jwplayer_data: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) - if not info.get('title'): - info['title'] = video_title - return info + return merge_dicts(info, info_dict) + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + return merge_dicts(json_ld, info_dict) def check_video(vurl): if YoutubeIE.suitable(vurl): diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index 884700c52..45ccc11c1 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -82,7 +82,7 @@ class GfycatIE(InfoExtractor): video_url = gfy.get('%sUrl' % format_id) if not video_url: continue - filesize = gfy.get('%sSize' % format_id) + filesize = int_or_none(gfy.get('%sSize' % format_id)) formats.append({ 'url': video_url, 
'format_id': format_id, diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py index 29b684d35..6a1b1e96e 100644 --- a/youtube_dl/extractor/giantbomb.py +++ b/youtube_dl/extractor/giantbomb.py @@ -5,9 +5,10 @@ import json from .common import InfoExtractor from ..utils import ( - unescapeHTML, - qualities, + determine_ext, int_or_none, + qualities, + unescapeHTML, ) @@ -15,7 +16,7 @@ class GiantBombIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)' _TEST = { 'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/', - 'md5': '57badeface303ecf6b98b812de1b9018', + 'md5': 'c8ea694254a59246a42831155dec57ac', 'info_dict': { 'id': '2300-9782', 'display_id': 'quick-look-destiny-the-dark-below', @@ -51,11 +52,16 @@ class GiantBombIE(InfoExtractor): for format_id, video_url in video['videoStreams'].items(): if format_id == 'f4m_stream': continue - if video_url.endswith('.f4m'): + ext = determine_ext(video_url) + if ext == 'f4m': f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id) if f4m_formats: f4m_formats[0]['quality'] = quality(format_id) formats.extend(f4m_formats) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, display_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) else: formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/godtv.py b/youtube_dl/extractor/godtv.py deleted file mode 100644 index c5d3b4e6a..000000000 --- a/youtube_dl/extractor/godtv.py +++ /dev/null @@ -1,66 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .ooyala import OoyalaIE -from ..utils import js_to_json - - -class GodTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?god\.tv(?:/[^/]+)*/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://god.tv/jesus-image/video/jesus-conference-2016/randy-needham', - 'info_dict': { - 'id': 
'lpd3g2MzE6D1g8zFAKz8AGpxWcpu6o_3', - 'ext': 'mp4', - 'title': 'Randy Needham', - 'duration': 3615.08, - }, - 'params': { - 'skip_download': True, - } - }, { - 'url': 'http://god.tv/playlist/bible-study', - 'info_dict': { - 'id': 'bible-study', - }, - 'playlist_mincount': 37, - }, { - 'url': 'http://god.tv/node/15097', - 'only_matching': True, - }, { - 'url': 'http://god.tv/live/africa', - 'only_matching': True, - }, { - 'url': 'http://god.tv/liveevents', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - settings = self._parse_json( - self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'settings', default='{}'), - display_id, transform_source=js_to_json, fatal=False) - - ooyala_id = None - - if settings: - playlist = settings.get('playlist') - if playlist and isinstance(playlist, list): - entries = [ - OoyalaIE._build_url_result(video['content_id']) - for video in playlist if video.get('content_id')] - if entries: - return self.playlist_result(entries, display_id) - ooyala_id = settings.get('ooyala', {}).get('content_id') - - if not ooyala_id: - ooyala_id = self._search_regex( - r'["\']content_id["\']\s*:\s*(["\'])(?P<id>[\w-]+)\1', - webpage, 'ooyala id', group='id') - - return OoyalaIE._build_url_result(ooyala_id) diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py index 2bfb99040..47a068e74 100644 --- a/youtube_dl/extractor/golem.py +++ b/youtube_dl/extractor/golem.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( + compat_str, compat_urlparse, ) from ..utils import ( @@ -46,7 +47,7 @@ class GolemIE(InfoExtractor): continue formats.append({ - 'format_id': e.tag, + 'format_id': compat_str(e.tag), 'url': compat_urlparse.urljoin(self._PREFIX, url), 'height': self._int(e.get('height'), 'height'), 'width': self._int(e.get('width'), 
'width'), diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index fec36cbbb..3bf462d63 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,17 +4,30 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, lowercase_escape, + update_url_query, ) class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:docs|drive)\.google\.com/ + (?: + (?:uc|open)\?.*?id=| + file/d/ + )| + video\.google\.com/get_player\?.*?docid= + ) + (?P<id>[a-zA-Z0-9_-]{28,}) + ''' _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'md5': 'd109872761f7e7ecf353fa108c0dbe1e', + 'md5': '5c602afbbf2c1db91831f5d82f678554', 'info_dict': { 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', @@ -22,8 +35,30 @@ class GoogleDriveIE(InfoExtractor): 'duration': 45, } }, { + # video can't be watched anonymously due to view count limit reached, + # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046) + 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', + 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', + 'info_dict': { + 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', + 'ext': 'mp4', + 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', + } + }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', + 'info_dict': { + 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', + 'ext': 'mp4', + 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', + 'duration': 189, + }, + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, + }, { + 'url': 
'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', 'only_matching': True, }] _FORMATS_EXT = { @@ -44,6 +79,13 @@ class GoogleDriveIE(InfoExtractor): '46': 'webm', '59': 'mp4', } + _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' + _CAPTIONS_ENTRY_TAG = { + 'subtitles': 'track', + 'automatic_captions': 'target', + } + _caption_formats_ext = [] + _captions_xml = None @staticmethod def _extract_url(webpage): @@ -53,41 +95,183 @@ class GoogleDriveIE(InfoExtractor): if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') + def _download_subtitles_xml(self, video_id, subtitles_id, hl): + if self._captions_xml: + return + self._captions_xml = self._download_xml( + self._BASE_URL_CAPTIONS, video_id, query={ + 'id': video_id, + 'vid': subtitles_id, + 'hl': hl, + 'v': video_id, + 'type': 'list', + 'tlangs': '1', + 'fmts': '1', + 'vssids': '1', + }, note='Downloading subtitles XML', + errnote='Unable to download subtitles XML', fatal=False) + if self._captions_xml: + for f in self._captions_xml.findall('format'): + if f.attrib.get('fmt_code') and not f.attrib.get('default'): + self._caption_formats_ext.append(f.attrib['fmt_code']) + + def _get_captions_by_type(self, video_id, subtitles_id, caption_type, + origin_lang_code=None): + if not subtitles_id or not caption_type: + return + captions = {} + for caption_entry in self._captions_xml.findall( + self._CAPTIONS_ENTRY_TAG[caption_type]): + caption_lang_code = caption_entry.attrib.get('lang_code') + if not caption_lang_code: + continue + caption_format_data = [] + for caption_format in self._caption_formats_ext: + query = { + 'vid': subtitles_id, + 'v': video_id, + 'fmt': caption_format, + 'lang': (caption_lang_code if origin_lang_code is None + else origin_lang_code), + 'type': 'track', + 'name': '', + 'kind': '', + } + if origin_lang_code is not None: + query.update({'tlang': caption_lang_code}) + caption_format_data.append({ + 'url': update_url_query(self._BASE_URL_CAPTIONS, query), + 
'ext': caption_format, + }) + captions[caption_lang_code] = caption_format_data + return captions + + def _get_subtitles(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + return self._get_captions_by_type(video_id, subtitles_id, 'subtitles') + + def _get_automatic_captions(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + track = self._captions_xml.find('track') + if track is None: + return + origin_lang_code = track.attrib.get('lang_code') + if not origin_lang_code: + return + return self._get_captions_by_type( + video_id, subtitles_id, 'automatic_captions', origin_lang_code) + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://docs.google.com/file/d/%s' % video_id, video_id) - reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason) - - title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + title = self._search_regex( + r'"title"\s*,\s*"([^"]+)', webpage, 'title', + default=None) or self._og_search_title(webpage) duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') - fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', + default=None)) formats = [] - for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): - fmt_id, fmt_url = fmt_stream.split('|') - resolution = fmt.split('/')[1] - width, height = resolution.split('x') - formats.append({ - 'url': lowercase_escape(fmt_url), - 'format_id': 
fmt_id, - 'resolution': resolution, - 'width': int_or_none(width), - 'height': int_or_none(height), - 'ext': self._FORMATS_EXT[fmt_id], + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, + 'fmt stream map', default='').split(',') + fmt_list = self._search_regex( + r'"fmt_list"\s*,\s*"([^"]+)', webpage, + 'fmt_list', default='').split(',') + if fmt_stream_map and fmt_list: + resolutions = {} + for fmt in fmt_list: + mobj = re.search( + r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt) + if mobj: + resolutions[mobj.group('format_id')] = ( + int(mobj.group('width')), int(mobj.group('height'))) + + for fmt_stream in fmt_stream_map: + fmt_stream_split = fmt_stream.split('|') + if len(fmt_stream_split) < 2: + continue + format_id, format_url = fmt_stream_split[:2] + f = { + 'url': lowercase_escape(format_url), + 'format_id': format_id, + 'ext': self._FORMATS_EXT[format_id], + } + resolution = resolutions.get(format_id) + if resolution: + f.update({ + 'width': resolution[0], + 'height': resolution[1], + }) + formats.append(f) + + source_url = update_url_query( + 'https://drive.google.com/uc', { + 'id': video_id, + 'export': 'download', }) + urlh = self._request_webpage( + source_url, video_id, note='Requesting source file', + errnote='Unable to request source file', fatal=False) + if urlh: + def add_source_format(src_url): + formats.append({ + 'url': src_url, + 'ext': determine_ext(title, 'mp4').lower(), + 'format_id': 'source', + 'quality': 1, + }) + if urlh.headers.get('Content-Disposition'): + add_source_format(source_url) + else: + confirmation_webpage = self._webpage_read_content( + urlh, url, video_id, note='Downloading confirmation page', + errnote='Unable to confirm download', fatal=False) + if confirmation_webpage: + confirm = self._search_regex( + r'confirm=([^&"\']+)', confirmation_webpage, + 'confirmation code', fatal=False) + if confirm: + add_source_format(update_url_query(source_url, { + 'confirm': confirm, + 
})) + + if not formats: + reason = self._search_regex( + r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason, expected=True) + self._sort_formats(formats) + hl = self._search_regex( + r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) + subtitles_id = None + ttsurl = self._search_regex( + r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + if ttsurl: + # the video Id for subtitles will be the last value in the ttsurl + # query string + subtitles_id = ttsurl.encode('utf-8').decode( + 'unicode_escape').split('=')[-1] + return { 'id': video_id, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, 'formats': formats, + 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), + 'automatic_captions': self.extract_automatic_captions( + video_id, subtitles_id, hl), } diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py index e854300c7..a4f332565 100644 --- a/youtube_dl/extractor/hgtv.py +++ b/youtube_dl/extractor/hgtv.py @@ -7,14 +7,19 @@ from .common import InfoExtractor class HGTVComShowIE(InfoExtractor): IE_NAME = 'hgtv.com:show' _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos', + _TESTS = [{ + # data-module="video" + 'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-season-4-videos', 'info_dict': { - 'id': 'flip-or-flop-full-episodes-videos', + 'id': 'flip-or-flop-full-episodes-season-4-videos', 'title': 'Flip or Flop Full Episodes', }, 'playlist_mincount': 15, - } + }, { + # data-deferred-module="video" + 'url': 'http://www.hgtv.com/shows/good-bones/episodes/an-old-victorian-house-gets-a-new-facelift', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -23,7 +28,7 @@ class HGTVComShowIE(InfoExtractor): config = self._parse_json( 
self._search_regex( - r'(?s)data-module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script', + r'(?s)data-(?:deferred-)?module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script', webpage, 'video config'), display_id)['channels'][0] diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index e21ebb8fb..1d905dc81 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -16,8 +16,8 @@ from ..utils import ( class HitboxIE(InfoExtractor): IE_NAME = 'hitbox' - _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.hitbox.tv/video/203213', 'info_dict': { 'id': '203213', @@ -38,13 +38,15 @@ class HitboxIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'https://www.smashcast.tv/hitboxlive/videos/203213', + 'only_matching': True, + }] def _extract_metadata(self, url, video_id): thumb_base = 'https://edge.sf.hitbox.tv' metadata = self._download_json( - '%s/%s' % (url, video_id), video_id, - 'Downloading metadata JSON') + '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON') date = 'media_live_since' media_type = 'livestream' @@ -63,14 +65,15 @@ class HitboxIE(InfoExtractor): views = int_or_none(video_meta.get('media_views')) timestamp = parse_iso8601(video_meta.get(date), ' ') categories = [video_meta.get('category_name')] - thumbs = [ - {'url': thumb_base + video_meta.get('media_thumbnail'), - 'width': 320, - 'height': 180}, - {'url': thumb_base + video_meta.get('media_thumbnail_large'), - 'width': 768, - 'height': 432}, - ] + thumbs = [{ + 'url': thumb_base + video_meta.get('media_thumbnail'), + 'width': 320, + 'height': 180 + }, { + 'url': thumb_base + video_meta.get('media_thumbnail_large'), + 'width': 768, + 'height': 432 + }] return { 'id': video_id, @@ -90,7 
+93,7 @@ class HitboxIE(InfoExtractor): video_id = self._match_id(url) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/video/%s' % video_id, + 'https://www.smashcast.tv/api/player/config/video/%s' % video_id, video_id, 'Downloading video JSON') formats = [] @@ -121,8 +124,7 @@ class HitboxIE(InfoExtractor): self._sort_formats(formats) metadata = self._extract_metadata( - 'https://www.hitbox.tv/api/media/video', - video_id) + 'https://www.smashcast.tv/api/media/video', video_id) metadata['formats'] = formats return metadata @@ -130,8 +132,8 @@ class HitboxIE(InfoExtractor): class HitboxLiveIE(HitboxIE): IE_NAME = 'hitbox:live' - _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'http://www.hitbox.tv/dimak', 'info_dict': { 'id': 'dimak', @@ -146,13 +148,20 @@ class HitboxLiveIE(HitboxIE): # live 'skip_download': True, }, - } + }, { + 'url': 'https://www.smashcast.tv/dimak', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url) def _real_extract(self, url): video_id = self._match_id(url) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/live/%s' % video_id, + 'https://www.smashcast.tv/api/player/config/live/%s' % video_id, video_id) formats = [] @@ -197,8 +206,7 @@ class HitboxLiveIE(HitboxIE): self._sort_formats(formats) metadata = self._extract_metadata( - 'https://www.hitbox.tv/api/media/live', - video_id) + 'https://www.smashcast.tv/api/media/live', video_id) metadata['formats'] = formats metadata['is_live'] = True metadata['title'] = self._live_title(metadata.get('title')) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index c45c68c1d..c1367cf51 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -89,6 +89,11 @@ class 
IGNIE(InfoExtractor): 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, }, + { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, ] def _find_video_id(self, webpage): @@ -98,6 +103,8 @@ class IGNIE(InfoExtractor): r'data-video-id="(.+?)"', r'<object id="vid_(.+?)"', r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', + r'videoId"\s*:\s*"(.+?)"', + r'videoId["\']\s*:\s*["\']([^"\']+?)["\']', ] return self._search_regex(res_id, webpage, 'video id', default=None) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f95c00c73..3ff672a89 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -13,7 +13,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/videoplayer/vi1562949145', 'only_matching': True, + }, { + 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index f3156804d..26c48e4b8 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -59,12 +59,18 @@ class ITVIE(InfoExtractor): def _add_sub_element(element, name): return etree.SubElement(element, _add_ns(name)) + production_id = ( + params.get('data-video-autoplay-id') or + '%s#001' % ( + params.get('data-video-episode-id') or + video_id.replace('a', '/'))) + req_env = etree.Element(_add_ns('soapenv:Envelope')) 
_add_sub_element(req_env, 'soapenv:Header') body = _add_sub_element(req_env, 'soapenv:Body') get_playlist = _add_sub_element(body, ('tem:GetPlaylist')) request = _add_sub_element(get_playlist, 'tem:request') - _add_sub_element(request, 'itv:ProductionId').text = params['data-video-id'] + _add_sub_element(request, 'itv:ProductionId').text = production_id _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper() vodcrid = _add_sub_element(request, 'itv:Vodcrid') _add_sub_element(vodcrid, 'com:Id') diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py new file mode 100755 index 000000000..a764023e9 --- /dev/null +++ b/youtube_dl/extractor/joj.py @@ -0,0 +1,100 @@ +# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ try_get,
+)
+
+
class JojIE(InfoExtractor):
    """Extractor for videos served by the media.joj.sk embed player.

    Matches both full ``https://media.joj.sk/embed/<uuid>`` URLs and the
    ``joj:<uuid>`` shorthand.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        joj:|
                        https?://media\.joj\.sk/embed/
                    )
                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
                '''
    _TESTS = [{
        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
        'info_dict': {
            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
            'ext': 'mp4',
            'title': 'NOVÉ BÝVANIE',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 3118,
        }
    }, {
        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the media.joj.sk embed URLs found in <iframe src=...> tags.

        Protocol-relative URLs (``//media.joj.sk/...``) are matched as well
        and returned exactly as they appear in ``webpage``.
        """
        return re.findall(
            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
            webpage)

    def _real_extract(self, url):
        """Build the info dict for the video identified by ``url``.

        Formats are read from the ``bitrates`` JavaScript object on the embed
        page; when that yields nothing, the ``Video.php`` XML service is
        queried instead.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical embed page, so the ``joj:<id>``
        # shorthand resolves to the same document as a full embed URL.
        webpage = self._download_webpage(
            'https://media.joj.sk/embed/%s' % video_id, video_id)

        title = self._search_regex(
            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'<title>(?P<title>[^<]+)'), webpage, 'title',
            default=None, group='title') or self._og_search_title(webpage)

        # default='{}' plus fatal=False: a missing or malformed ``bitrates``
        # object simply produces no formats here and triggers the fallback.
        bitrates = self._parse_json(
            self._search_regex(
                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
                default='{}'),
            video_id, transform_source=js_to_json, fatal=False)

        formats = []
        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
            if isinstance(format_url, compat_str):
                height = self._search_regex(
                    r'(\d+)[pP]\.', format_url, 'height', default=None)
                formats.append({
                    'url': format_url,
                    'format_id': '%sp' % height if height else None,
                    # The height marker is optional (default=None above), so
                    # int() would raise TypeError for URLs that lack it.
                    'height': int_or_none(height),
                })
        if not formats:
            # Fallback: XML playlist service listing the stored files.
            playlist = self._download_xml(
                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
                video_id)
            for file_el in playlist.findall('./files/file'):
                path = file_el.get('path')
                if not path:
                    continue
                format_id = file_el.get('id') or file_el.get('label')
                formats.append({
                    # Only the first 'dat/' occurrence is dropped from the
                    # path before it is appended to the storage host.
                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
                        'dat/', '', 1),
                    'format_id': format_id,
                    'height': int_or_none(self._search_regex(
                        r'(\d+)[pP]', format_id or path, 'height',
                        default=None)),
                })
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        duration = int_or_none(self._search_regex(
            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py index f9a034b78..27e0e37f6 100644 --- a/youtube_dl/extractor/jove.py +++ b/youtube_dl/extractor/jove.py @@ -65,9 +65,9 @@ class JoveIE(InfoExtractor): webpage, 'description', fatal=False) publish_date = unified_strdate(self._html_search_meta( 'citation_publication_date', webpage, 'publish date', fatal=False)) - comment_count = self._html_search_regex( + comment_count = int(self._html_search_regex( r'<meta name="num_comments" content="(\d+) Comments?"', - webpage, 'comment count', fatal=False) + webpage, 'comment count', fatal=False)) return { 'id': video_id, diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 41c1f3d96..138d4844d 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -324,7 +324,7 @@ class KalturaIE(InfoExtractor): if captions: for caption in captions.get('objects', []): # Continue if caption is not ready - if f.get('status') != 2: + if caption.get('status') != 2: continue if not caption.get('id'): continue diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index 4e9eb67bf..f236a2f78 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -48,7 +48,7 @@ class KarriereVideosIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = (self._html_search_meta('title', webpage, default=None) or - self._search_regex(r'<h1 class="title">([^<]+)</h1>')) + self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'video title')) video_id = self._search_regex( r'/config/video/(.+?)\.xml', webpage, 'video id') diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 3190b187c..c7f813370 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import json + from .common import InfoExtractor 
from ..utils import ( ExtractorError, @@ -8,15 +10,15 @@ from ..utils import ( urlencode_postdata, xpath_element, xpath_text, - urljoin, update_url_query, + js_to_json, ) class Laola1TvEmbedIE(InfoExtractor): IE_NAME = 'laola1tv:embed' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' - _TEST = { + _TESTS = [{ # flashvars.premium = "false"; 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', 'info_dict': { @@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor): 'uploader': 'ITTF - International Table Tennis Federation', 'upload_date': '20161211', }, - } + }] + + def _extract_token_url(self, stream_access_url, video_id, data): + return self._download_json( + stream_access_url, video_id, headers={ + 'Content-Type': 'application/json', + }, data=json.dumps(data).encode())['data']['stream-access'][0] + + def _extract_formats(self, token_url, video_id): + token_doc = self._download_xml( + token_url, video_id, 'Downloading token', + headers=self.geo_verification_headers()) + + token_attrib = xpath_element(token_doc, './/token').attrib + + if token_attrib['status'] != '0': + raise ExtractorError( + 'Token error: %s' % token_attrib['comment'], expected=True) + + formats = self._extract_akamai_formats( + '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), + video_id) + self._sort_formats(formats) + return formats def _real_extract(self, url): video_id = self._match_id(url) @@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor): else: data_abo = urlencode_postdata( dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) - token_url = self._download_json( - 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', - video_id, query={ + stream_access_url = update_url_query( + 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', { 'videoId': _v('id'), 'target': self._search_regex(r'vs_target = (\d+);', 
webpage, 'vs target'), 'label': _v('label'), 'area': _v('area'), - }, data=data_abo)['data']['stream-access'][0] - - token_doc = self._download_xml( - token_url, video_id, 'Downloading token', - headers=self.geo_verification_headers()) - - token_attrib = xpath_element(token_doc, './/token').attrib + }) + token_url = self._extract_token_url(stream_access_url, video_id, data_abo) - if token_attrib['status'] != '0': - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) - - formats = self._extract_akamai_formats( - '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), - video_id) - self._sort_formats(formats) + formats = self._extract_formats(token_url, video_id) categories_str = _v('meta_sports') categories = categories_str.split(',') if categories_str else [] @@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(InfoExtractor): +class Laola1TvIE(Laola1TvEmbedIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -164,13 +176,60 @@ class Laola1TvIE(InfoExtractor): if 'Dieser Livestream ist bereits beendet.' 
in webpage: raise ExtractorError('This live stream has already finished.', expected=True) - iframe_url = urljoin(url, self._search_regex( - r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"', - webpage, 'iframe url')) + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, js_to_json) + + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) return { - '_type': 'url', + 'id': video_id, 'display_id': display_id, - 'url': iframe_url, - 'ie_key': 'Laola1TvEmbed', + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, } + + +class ITTFIE(InfoExtractor): + _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802', + 'only_matching': True, + } + + def _real_extract(self, url): + return self.url_result( + update_url_query('https://www.laola1.tv/titanplayer.php', { + 'videoid': self._match_id(url), + 'type': 'V', + 'lang': 'en', + 'portal': 'int', + 'customer': 1024, + }), Laola1TvEmbedIE.ie_key()) diff --git a/youtube_dl/extractor/limelight.py 
b/youtube_dl/extractor/limelight.py index 0a5a3956c..ad65b2759 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -26,14 +26,16 @@ class LimelightBaseIE(InfoExtractor): 'Channel': 'channel', 'ChannelList': 'channel_list', } + + def smuggle(url): + return smuggle_url(url, {'source_url': source_url}) + entries = [] for kind, video_id in re.findall( r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage): entries.append(cls.url_result( - smuggle_url( - 'limelight:%s:%s' % (lm[kind], video_id), - {'source_url': source_url}), + smuggle('limelight:%s:%s' % (lm[kind], video_id)), 'Limelight%s' % kind, video_id)) for mobj in re.finditer( # As per [1] class attribute should be exactly equal to @@ -49,10 +51,15 @@ class LimelightBaseIE(InfoExtractor): ''', webpage): kind, video_id = mobj.group('kind'), mobj.group('id') entries.append(cls.url_result( - smuggle_url( - 'limelight:%s:%s' % (kind, video_id), - {'source_url': source_url}), + smuggle('limelight:%s:%s' % (kind, video_id)), 'Limelight%s' % kind.capitalize(), video_id)) + # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page) + for video_id in re.findall( + r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})', + webpage): + entries.append(cls.url_result( + smuggle('limelight:media:%s' % video_id), + LimelightMediaIE.ie_key(), video_id)) return entries def _call_playlist_service(self, item_id, method, fatal=True, referer=None): diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index c7de65353..246aac576 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor): _VALID_URL = 
r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'md5': '50f79e05ba149149c1b4ea961223d5b3', + 'md5': '0813c2430bea7a46bf13acf3406992f4', 'info_dict': { 'id': '757_1364311680', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident', @@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor): } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'md5': 'b13a29626183c9d33944e6a04f41aafc', + 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', 'info_dict': { 'id': 'f93_1390833151', 'ext': 'mp4', @@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$' } }, { + # Prochan embed 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', 'md5': '42c6d97d54f1db107958760788c5f48f', 'info_dict': { @@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor): 'uploader': 'CapObveus', 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', 'age_limit': 18, - } + }, + 'skip': 'Video is dead', }, { # Covers https://github.com/rg3/youtube-dl/pull/5983 + # Multiple resolutions 'url': 'http://www.liveleak.com/view?i=801_1409392012', - 'md5': '0b3bec2d888c20728ca2ad3642f0ef15', + 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', 'info_dict': { 'id': '801_1409392012', 'ext': 'mp4', @@ -70,15 +72,20 @@ class LiveLeakIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.liveleak.com/view?i=677_1439397581', + 'info_dict': { + 'id': '677_1439397581', + 'title': 'Fuel Depot in China Explosion caught on video', + }, + 'playlist_count': 3, }] @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)', + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"', webpage) - 
if mobj: - return 'http://www.liveleak.com/view?i=%s' % mobj.group('id') def _real_extract(self, url): video_id = self._match_id(url) @@ -93,57 +100,70 @@ class LiveLeakIE(InfoExtractor): webpage, 'age limit', default=None)) video_thumbnail = self._og_search_thumbnail(webpage) - sources_raw = self._search_regex( - r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) - if sources_raw is None: - alt_source = self._search_regex( - r'(file: ".*?"),', webpage, 'video URL', default=None) - if alt_source: - sources_raw = '[{ %s}]' % alt_source + entries = self._parse_html5_media_entries(url, webpage, video_id) + if not entries: + # Maybe an embed? + embed_url = self._search_regex( + r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', + webpage, 'embed URL') + return { + '_type': 'url_transparent', + 'url': embed_url, + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + } + + for idx, info_dict in enumerate(entries): + for a_format in info_dict['formats']: + if not a_format.get('height'): + a_format['height'] = int_or_none(self._search_regex( + r'([0-9]+)p\.mp4', a_format['url'], 'height label', + default=None)) + + self._sort_formats(info_dict['formats']) + + # Don't append entry ID for one-video pages to keep backward compatibility + if len(entries) > 1: + info_dict['id'] = '%s_%s' % (video_id, idx + 1) else: - # Maybe an embed? 
- embed_url = self._search_regex( - r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } - - sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) - sources = json.loads(sources_json) - - formats = [{ - 'format_id': '%s' % i, - 'format_note': s.get('label'), - 'url': s['file'], - } for i, s in enumerate(sources)] - - for i, s in enumerate(sources): - # Removing '.h264_*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/rg3/youtube-dl/pull/4768) - orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) - if s['file'] != orig_url: - formats.append({ - 'format_id': 'original-%s' % i, - 'format_note': s.get('label'), - 'url': orig_url, - 'preference': 1, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'formats': formats, - 'age_limit': age_limit, - 'thumbnail': video_thumbnail, - } + info_dict['id'] = video_id + + info_dict.update({ + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + 'thumbnail': video_thumbnail, + }) + + return self.playlist_result(entries, video_id, video_title) + + +class LiveLeakEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)' + + # See generic.py for actual test cases + _TESTS = [{ + 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191', + 'only_matching': True, + }, { + 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id = mobj.group('kind', 
'id') + + if kind == 'f': + webpage = self._download_webpage(url, video_id) + liveleak_url = self._search_regex( + r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL, + webpage, 'LiveLeak URL', group='url') + elif kind == 'i': + liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id + + return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py new file mode 100644 index 000000000..b94b3c2ab --- /dev/null +++ b/youtube_dl/extractor/manyvids.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class ManyVidsIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', + 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'info_dict': { + 'id': '133957', + 'ext': 'mp4', + 'title': 'everthing about me (Preview)', + 'view_count': int, + 'like_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video URL', group='url') + + title = '%s (Preview)' % self._html_search_regex( + r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + + like_count = int_or_none(self._search_regex( + r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) + view_count = int_or_none(self._html_search_regex( + r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage, + 'view count', default=None)) + + return { + 'id': video_id, + 'title': title, + 'view_count': view_count, + 'like_count': like_count, + 'formats': [{ + 'url': video_url, + }], + } diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index 6e067474b..4c32fbc2c 
100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -17,7 +17,7 @@ from ..utils import ( class MedialaanIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?:www\.)? + (?:www\.|nieuws\.)? (?: (?P<site_id>vtm|q2|vtmkzoom)\.be/ (?: @@ -85,6 +85,22 @@ class MedialaanIE(InfoExtractor): # clip 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', 'only_matching': True, + }, { + # http/s redirect + 'url': 'https://vtmkzoom.be/video?aid=45724', + 'info_dict': { + 'id': '257136373657000', + 'ext': 'mp4', + 'title': 'K3 Dansstudio Ushuaia afl.6', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # nieuws.vtm.be + 'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma', + 'only_matching': True, }] def _real_initialize(self): @@ -146,6 +162,8 @@ class MedialaanIE(InfoExtractor): video_id, transform_source=lambda s: '[%s]' % s, fatal=False) if player: video = player[-1] + if video['videoUrl'] in ('http', 'https'): + return self.url_result(video['url'], MedialaanIE.ie_key()) info = { 'id': video_id, 'url': video['videoUrl'], diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py new file mode 100644 index 000000000..9760eafd5 --- /dev/null +++ b/youtube_dl/extractor/mediaset.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + parse_duration, + try_get, + unified_strdate, +) + + +class MediasetIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + mediaset:| + https?:// + (?:www\.)?video\.mediaset\.it/ + (?: + (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + ) + )(?P<id>[0-9]+) + ''' + _TESTS = [{ + # full episode + 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'md5': 
'9b75534d42c44ecef7bf1ffeacb7f85d', + 'info_dict': { + 'id': '661824', + 'ext': 'mp4', + 'title': 'Quarta puntata', + 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1414, + 'creator': 'mediaset', + 'upload_date': '20161107', + 'series': 'Hello Goodbye', + 'categories': ['reality'], + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # clip + 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'only_matching': True, + }, { + # iframe simple + 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'only_matching': True, + }, { + # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) + 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'only_matching': True, + }, { + 'url': 'mediaset:661824', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_list = self._download_json( + 'http://cdnsel01.mediaset.net/GetCdn.aspx', + video_id, 'Downloading video CDN JSON', query={ + 'streamid': video_id, + 'format': 'json', + })['videoList'] + + formats = [] + for format_url in video_list: + if '.ism' in format_url: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': determine_ext(format_url), + }) + self._sort_formats(formats) + + mediainfo = self._download_json( + 'http://plr.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading video info JSON', query={ + 'id': video_id, + 
})['video'] + + title = mediainfo['title'] + + creator = try_get( + mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + category = try_get( + mediainfo, lambda x: x['brand-info']['category'], compat_str) + categories = [category] if category else None + + return { + 'id': video_id, + 'title': title, + 'description': mediainfo.get('short-description'), + 'thumbnail': mediainfo.get('thumbnail'), + 'duration': parse_duration(mediainfo.get('duration')), + 'creator': creator, + 'upload_date': unified_strdate(mediainfo.get('production-date')), + 'webpage_url': mediainfo.get('url'), + 'series': mediainfo.get('brand-value'), + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py new file mode 100644 index 000000000..60e3caf0d --- /dev/null +++ b/youtube_dl/extractor/megaphone.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class MegaphoneIE(InfoExtractor): + IE_NAME = 'megaphone.fm' + IE_DESC = 'megaphone.fm embedded players' + _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)' + _TEST = { + 'url': 'https://player.megaphone.fm/GLT9749789991?"', + 'md5': '4816a0de523eb3e972dc0dda2c191f96', + 'info_dict': { + 'id': 'GLT9749789991', + 'ext': 'mp3', + 'title': '#97 What Kind Of Idiot Gets Phished?', + 'thumbnail': 're:^https://.*\.png.*$', + 'duration': 1776.26375, + 'author': 'Reply All', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_property('audio:title', webpage) + author = self._og_search_property('audio:artist', webpage) + thumbnail = self._og_search_thumbnail(webpage) + + episode_json = self._search_regex(r'(?s)var\s+episode\s*=\s*(\{.+?\});', webpage, 'episode JSON') + episode_data = self._parse_json(episode_json, video_id, js_to_json) + 
video_url = self._proto_relative_url(episode_data['mediaUrl'], 'https:') + + formats = [{ + 'url': video_url, + }] + + return { + 'id': video_id, + 'thumbnail': thumbnail, + 'title': title, + 'author': author, + 'duration': episode_data['duration'], + 'formats': formats, + } + + @classmethod + def _extract_urls(cls, webpage): + return [m[0] for m in re.findall( + r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)] diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 28b743cca..964dc542c 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -136,11 +136,9 @@ class MiTeleIE(InfoExtractor): video_id, 'Downloading gigya script') # Get a appKey/uuid for getting the session key - appKey_var = self._search_regex( - r'value\s*\(\s*["\']appGridApplicationKey["\']\s*,\s*([0-9a-f]+)', - gigya_sc, 'appKey variable') appKey = self._search_regex( - r'var\s+%s\s*=\s*["\']([0-9a-f]+)' % appKey_var, gigya_sc, 'appKey') + r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)', + gigya_sc, 'appKey') session_json = self._download_json( 'https://appgrid-api.cloud.accedo.tv/session', diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 0efbe660a..f6360cce6 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, + compat_str, compat_urllib_parse_unquote, compat_urlparse, ) @@ -53,16 +54,27 @@ class MixcloudIE(InfoExtractor): 'only_matching': True, }] - # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js - @staticmethod - def _decrypt_play_info(play_info): - KEY = 'pleasedontdownloadourmusictheartistswontgetpaid' + _keys = [ + 'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };', + 'pleasedontdownloadourmusictheartistswontgetpaid', + 'window.addEventListener = 
window.addEventListener || function() {};', + '(function() { return new Date().toLocaleDateString(); })()' + ] + _current_key = None + # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js + def _decrypt_play_info(self, play_info, video_id): play_info = base64.b64decode(play_info.encode('ascii')) - - return ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)])) - for idx, ch in enumerate(play_info)]) + for num, key in enumerate(self._keys, start=1): + try: + return self._parse_json( + ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)])) + for idx, ch in enumerate(play_info)]), + video_id) + except ExtractorError: + if num == len(self._keys): + raise def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -72,14 +84,30 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) + if not self._current_key: + js_url = self._search_regex( + r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', + webpage, 'js url', default=None) + if js_url: + js = self._download_webpage(js_url, track_id, fatal=False) + if js: + KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1' + for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'): + key = self._search_regex( + KEY_RE_TEMPLATE % key_name, js, 'key', + default=None, group='key') + if key and isinstance(key, compat_str): + self._keys.insert(0, key) + self._current_key = key + message = self._html_search_regex( r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) encrypted_play_info = self._search_regex( r'm-play-info="([^"]+)"', webpage, 'play info') - play_info = self._parse_json( - self._decrypt_play_info(encrypted_play_info), track_id) + + play_info = self._decrypt_play_info(encrypted_play_info, track_id) if message and 'stream_url' not in play_info: raise 
ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 59cd4b838..675ff6873 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -15,7 +15,7 @@ class MLBIE(InfoExtractor): (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: - (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v| + (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| @@ -84,7 +84,7 @@ class MLBIE(InfoExtractor): }, { 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 'b190e70141fb9a1552a85426b4da1b5d', + 'md5': 'aafaf5b0186fee8f32f20508092f8111', 'info_dict': { 'id': '75609783', 'ext': 'mp4', @@ -95,6 +95,10 @@ class MLBIE(InfoExtractor): } }, { + 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694', + 'only_matching': True, + }, + { 'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb', 'only_matching': True, }, diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py deleted file mode 100644 index 5a1bee5c8..000000000 --- a/youtube_dl/extractor/mpora.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import int_or_none - - -class MporaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)' - IE_NAME = 'MPORA' - - _TEST = { - 'url': 'http://mpora.de/videos/AAdo8okx4wiz/embed?locale=de', - 'md5': 'a7a228473eedd3be741397cf452932eb', - 'info_dict': { - 'id': 'AAdo8okx4wiz', - 'ext': 'mp4', - 'title': 'Katy Curd - Winter in the Forest', - 'duration': 416, - 'uploader': 'Peter Newman Media', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, 
video_id) - - data_json = self._search_regex( - [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", - r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"], - webpage, 'json') - data = self._parse_json(data_json, video_id) - - uploader = data['info_overlay'].get('username') - duration = data['video']['duration'] // 1000 - thumbnail = data['video']['encodings']['sd']['poster'] - title = data['info_overlay']['title'] - - formats = [] - for encoding_id, edata in data['video']['encodings'].items(): - for src in edata['sources']: - width_str = self._search_regex( - r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'], - False, default=None) - vcodec = src['type'].partition('/')[2] - - formats.append({ - 'format_id': encoding_id + '-' + vcodec, - 'url': src['src'], - 'vcodec': vcodec, - 'width': int_or_none(width_str), - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'uploader': uploader, - 'duration': duration, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index 1473bcf48..650731fdc 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -68,10 +68,6 @@ class MSNIE(InfoExtractor): format_url = file_.get('url') if not format_url: continue - ext = determine_ext(format_url) - if ext == 'ism': - formats.extend(self._extract_ism_formats( - format_url + '/Manifest', display_id, 'mss', fatal=False)) if 'm3u8' in format_url: # m3u8_native should not be used here until # https://github.com/rg3/youtube-dl/issues/9913 is fixed @@ -79,6 +75,9 @@ class MSNIE(InfoExtractor): format_url, display_id, 'mp4', m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) + elif determine_ext(format_url) == 'ism': + formats.extend(self._extract_ism_formats( + format_url + '/Manifest', display_id, 'mss', fatal=False)) else: formats.append({ 'url': format_url, diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 8acea1461..25af5ddfd 100644 --- 
a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -50,8 +50,7 @@ class MTVServicesInfoExtractor(InfoExtractor): thumb_node = itemdoc.find(search_path) if thumb_node is None: return None - else: - return thumb_node.attrib['url'] + return thumb_node.get('url') or thumb_node.text or None def _extract_mobile_video_formats(self, mtvn_id): webpage_url = self._MOBILE_TEMPLATE % mtvn_id @@ -83,7 +82,7 @@ class MTVServicesInfoExtractor(InfoExtractor): hls_url = rendition.find('./src').text formats.extend(self._extract_m3u8_formats( hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls')) + m3u8_id='hls', fatal=False)) else: # fms try: @@ -106,7 +105,8 @@ class MTVServicesInfoExtractor(InfoExtractor): }]) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') - self._sort_formats(formats) + if formats: + self._sort_formats(formats) return formats def _extract_subtitles(self, mdoc, mtvn_id): @@ -133,8 +133,11 @@ class MTVServicesInfoExtractor(InfoExtractor): mediagen_url += 'acceptMethods=' mediagen_url += 'hls' if use_hls else 'fms' - mediagen_doc = self._download_xml(mediagen_url, video_id, - 'Downloading video urls') + mediagen_doc = self._download_xml( + mediagen_url, video_id, 'Downloading video urls', fatal=False) + + if mediagen_doc is False: + return None item = mediagen_doc.find('./video/item') if item is not None and item.get('type') == 'text': @@ -174,6 +177,13 @@ class MTVServicesInfoExtractor(InfoExtractor): formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id) + # Some parts of complete video may be missing (e.g. 
missing Act 3 in + # http://www.southpark.de/alle-episoden/s14e01-sexual-healing) + if not formats: + return None + + self._sort_formats(formats) + return { 'title': title, 'formats': formats, @@ -205,9 +215,14 @@ class MTVServicesInfoExtractor(InfoExtractor): title = xpath_text(idoc, './channel/title') description = xpath_text(idoc, './channel/description') + entries = [] + for item in idoc.findall('.//item'): + info = self._get_video_info(item, use_hls) + if info: + entries.append(info) + return self.playlist_result( - [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')], - playlist_title=title, playlist_description=description) + entries, playlist_title=title, playlist_description=description) def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): triforce_feed = self._parse_json(self._search_regex( diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index f281238c9..e164d5940 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -12,64 +12,62 @@ from ..utils import ( class MySpaceIE(InfoExtractor): - _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + myspace\.com/[^/]+/ + (?P<mediatype> + video/[^/]+/(?P<video_id>\d+)| + music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$) + ) + ''' - _TESTS = [ - { - 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', - 'md5': '9c1483c106f4a695c47d2911feed50a7', - 'info_dict': { - 'id': '109594919', - 'ext': 'mp4', - 'title': 'Little Big Town', - 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', - 'uploader': 'Five Minutes to the Stage', - 'uploader_id': 'fiveminutestothestage', - 'timestamp': 1414108751, - 'upload_date': '20141023', - }, + _TESTS = [{ + 'url': 
'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', + 'info_dict': { + 'id': '109594919', + 'ext': 'mp4', + 'title': 'Little Big Town', + 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', + 'uploader': 'Five Minutes to the Stage', + 'uploader_id': 'fiveminutestothestage', + 'timestamp': 1414108751, + 'upload_date': '20141023', }, + }, { # songs - { - 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', - 'md5': '1d7ee4604a3da226dd69a123f748b262', - 'info_dict': { - 'id': '93388656', - 'ext': 'm4a', - 'title': 'Of weakened soul...', - 'uploader': 'Killsorrow', - 'uploader_id': 'killsorrow', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', - 'info_dict': { - 'id': 'xqds0B_meys', - 'ext': 'webm', - 'title': 'Three Days Grace - Animal I Have Become', - 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', - 'uploader': 'ThreeDaysGraceVEVO', - 'uploader_id': 'ThreeDaysGraceVEVO', - 'upload_date': '20091002', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', - 'info_dict': { - 'id': 'ypWvQgnJrSU', - 'ext': 'mp4', - 'title': 'Starset - First Light', - 'description': 'md5:2d5db6c9d11d527683bcda818d332414', - 'uploader': 'Yumi K', - 'uploader_id': 'SorenPromotions', - 'upload_date': '20140725', - } + 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', + 'info_dict': { + 'id': '93388656', + 'ext': 'm4a', + 'title': 'Of weakened soul...', + 'uploader': 'Killsorrow', + 'uploader_id': 'killsorrow', }, - ] + }, { + 'add_ie': ['Youtube'], + 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', + 'info_dict': 
{ + 'id': 'xqds0B_meys', + 'ext': 'webm', + 'title': 'Three Days Grace - Animal I Have Become', + 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', + 'uploader': 'ThreeDaysGraceVEVO', + 'uploader_id': 'ThreeDaysGraceVEVO', + 'upload_date': '20091002', + }, + }, { + 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', + 'only_matching': True, + }, { + 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('video_id') or mobj.group('song_id') is_song = mobj.group('mediatype').startswith('music/song') webpage = self._download_webpage(url, video_id) player_url = self._search_regex( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index d2a44d05d..62db70b43 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -5,10 +5,8 @@ import re from .common import InfoExtractor from .theplatform import ThePlatformIE from .adobepass import AdobePassIE -from ..compat import compat_urllib_parse_urlparse from ..utils import ( find_xpath_attr, - lowercase_escape, smuggle_url, unescapeHTML, update_url_query, @@ -17,7 +15,7 @@ from ..utils import ( class NBCIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' + _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))' _TESTS = [ { @@ -37,16 +35,6 @@ class NBCIE(AdobePassIE): }, }, { - 'url': 'http://www.nbc.com/the-tonight-show/episodes/176', - 'info_dict': { - 'id': '176', - 'ext': 'flv', - 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', - 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', - }, - 'skip': '404 Not Found', - }, - { 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', 'info_dict': { 
'id': '2832821', @@ -64,11 +52,6 @@ class NBCIE(AdobePassIE): 'skip': 'Only works from US', }, { - # This video has expired but with an escaped embedURL - 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', - 'only_matching': True, - }, - { # HLS streams requires the 'hdnea3' cookie 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', 'info_dict': { @@ -88,59 +71,38 @@ class NBCIE(AdobePassIE): ] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - info = { + permalink, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://api.nbc.com/v3/videos', video_id, query={ + 'filter[permalink]': permalink, + })['data'][0]['attributes'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + video_id = video_data['guid'] + title = video_data['title'] + if video_data.get('entitlement') == 'auth': + resource = self._get_mvpd_resource( + 'nbcentertainment', title, video_id, + video_data.get('vChipRating')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'nbcentertainment', resource) + theplatform_url = smuggle_url(update_url_query( + 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, + query), {'force_smil_url': True}) + return { '_type': 'url_transparent', - 'ie_key': 'ThePlatform', 'id': video_id, + 'title': title, + 'url': theplatform_url, + 'description': video_data.get('description'), + 'keywords': video_data.get('keywords'), + 'season_number': int_or_none(video_data.get('seasonNumber')), + 'episode_number': int_or_none(video_data.get('episodeNumber')), + 'series': video_data.get('showName'), + 'ie_key': 'ThePlatform', } - video_data = None - preload = self._search_regex( - r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None) - if preload: - preload_data = self._parse_json(preload, video_id) - path = compat_urllib_parse_urlparse(url).path.rstrip('/') - entity_id = preload_data.get('xref', 
{}).get(path) - video_data = preload_data.get('entities', {}).get(entity_id) - if video_data: - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - video_id = video_data['guid'] - title = video_data['title'] - if video_data.get('entitlement') == 'auth': - resource = self._get_mvpd_resource( - 'nbcentertainment', title, video_id, - video_data.get('vChipRating')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, 'nbcentertainment', resource) - theplatform_url = smuggle_url(update_url_query( - 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, - query), {'force_smil_url': True}) - info.update({ - 'id': video_id, - 'title': title, - 'url': theplatform_url, - 'description': video_data.get('description'), - 'keywords': video_data.get('keywords'), - 'season_number': int_or_none(video_data.get('seasonNumber')), - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('showName'), - }) - else: - theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( - [ - r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', - r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"', - r'"embedURL"\s*:\s*"([^"]+)"' - ], - webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) - if theplatform_url.startswith('//'): - theplatform_url = 'http:' + theplatform_url - info['url'] = smuggle_url(theplatform_url, {'source_url': url}) - return info class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 9bea610c8..0e26f8399 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -1,6 +1,15 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_duration, + parse_filesize, + unified_timestamp, +) class NewgroundsIE(InfoExtractor): @@ -13,7 
+22,10 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp3', 'title': 'B7 - BusMode', 'uploader': 'Burn7', - } + 'timestamp': 1378878540, + 'upload_date': '20130911', + 'duration': 143, + }, }, { 'url': 'https://www.newgrounds.com/portal/view/673111', 'md5': '3394735822aab2478c31b1004fe5e5bc', @@ -22,25 +34,133 @@ class NewgroundsIE(InfoExtractor): 'ext': 'mp4', 'title': 'Dancin', 'uploader': 'Squirrelman82', + 'timestamp': 1460256780, + 'upload_date': '20160410', + }, + }, { + # source format unavailable, additional mp4 formats + 'url': 'http://www.newgrounds.com/portal/view/689400', + 'info_dict': { + 'id': '689400', + 'ext': 'mp4', + 'title': 'ZTV News Episode 8', + 'uploader': 'BennettTheSage', + 'timestamp': 1487965140, + 'upload_date': '20170224', + }, + 'params': { + 'skip_download': True, }, }] def _real_extract(self, url): media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) title = self._html_search_regex( r'<title>([^>]+)</title>', webpage, 'title') - uploader = self._html_search_regex( - r'Author\s*<a[^>]+>([^<]+)', webpage, 'uploader', fatal=False) + media_url = self._parse_json(self._search_regex( + r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id) + + formats = [{ + 'url': media_url, + 'format_id': 'source', + 'quality': 1, + }] + + max_resolution = int_or_none(self._search_regex( + r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution', + default=None)) + if max_resolution: + url_base = media_url.rpartition('.')[0] + for resolution in (360, 720, 1080): + if resolution > max_resolution: + break + formats.append({ + 'url': '%s.%dp.mp4' % (url_base, resolution), + 'format_id': '%dp' % resolution, + 'height': resolution, + }) + + self._check_formats(formats, media_id) + self._sort_formats(formats) - music_url = self._parse_json(self._search_regex( - r'"url":("[^"]+"),', webpage, ''), media_id) + uploader = self._search_regex( + r'(?:Author|Writer)\s*<a[^>]+>([^<]+)', webpage, 'uploader', + fatal=False) + + timestamp = 
unified_timestamp(self._search_regex( + r'<dt>Uploaded</dt>\s*<dd>([^<]+)', webpage, 'timestamp', + default=None)) + duration = parse_duration(self._search_regex( + r'<dd>Song\s*</dd><dd>.+?</dd><dd>([^<]+)', webpage, 'duration', + default=None)) + + filesize_approx = parse_filesize(self._html_search_regex( + r'<dd>Song\s*</dd><dd>(.+?)</dd>', webpage, 'filesize', + default=None)) + if len(formats) == 1: + formats[0]['filesize_approx'] = filesize_approx + + if '<dd>Song' in webpage: + formats[0]['vcodec'] = 'none' return { 'id': media_id, 'title': title, - 'url': music_url, 'uploader': uploader, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, } + + +class NewgroundsPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.newgrounds.com/collection/cats', + 'info_dict': { + 'id': 'cats', + 'title': 'Cats', + }, + 'playlist_mincount': 46, + }, { + 'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA', + 'info_dict': { + 'id': 'ZONE-SAMA', + 'title': 'Portal Search: ZONE-SAMA', + }, + 'playlist_mincount': 47, + }, { + 'url': 'http://www.newgrounds.com/audio/search/title/cats', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._search_regex( + r'<title>([^>]+)</title>', webpage, 'title', default=None) + + # cut left menu + webpage = self._search_regex( + r'(?s)<div[^>]+\bclass=["\']column wide(.+)', + webpage, 'wide column', default=webpage) + + entries = [] + for a, path, media_id in re.findall( + r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)', + webpage): + a_class = extract_attributes(a).get('class') + if a_class not in ('item-portalsubmission', 'item-audiosubmission'): + continue + entries.append( + self.url_result( + 'https://www.newgrounds.com/%s' % path, + 
ie=NewgroundsIE.ie_key(), video_id=media_id)) + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py new file mode 100644 index 000000000..d0235fdfe --- /dev/null +++ b/youtube_dl/extractor/nexx.py @@ -0,0 +1,271 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hashlib +import random +import re +import time + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + parse_duration, + try_get, + urlencode_postdata, +) + + +class NexxIE(InfoExtractor): + _VALID_URL = r'https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/(?P<id>\d+)' + _TESTS = [{ + # movie + 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907', + 'md5': '16746bfc28c42049492385c989b26c4a', + 'info_dict': { + 'id': '128907', + 'ext': 'mp4', + 'title': 'Stiftung Warentest', + 'alt_title': 'Wie ein Test abläuft', + 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2', + 'release_year': 2013, + 'creator': 'SPIEGEL TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2509, + 'timestamp': 1384264416, + 'upload_date': '20131112', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + # episode + 'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858', + 'info_dict': { + 'id': '247858', + 'ext': 'mp4', + 'title': 'Return of the Golden Child (OV)', + 'description': 'md5:5d969537509a92b733de21bae249dc63', + 'release_year': 2017, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1397, + 'timestamp': 1495033267, + 'upload_date': '20170517', + 'episode_number': 2, + 'season_number': 2, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + # Reference: + # 1. 
https://nx-s.akamaized.net/files/201510/44.pdf + + entries = [] + + # JavaScript Integration + mobj = re.search( + r'<script\b[^>]+\bsrc=["\']https?://require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)', + webpage) + if mobj: + domain_id = mobj.group('id') + for video_id in re.findall( + r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)', + webpage): + entries.append( + 'https://api.nexx.cloud/v3/%s/videos/byid/%s' + % (domain_id, video_id)) + + # TODO: support more embed formats + + return entries + + @staticmethod + def _extract_url(webpage): + return NexxIE._extract_urls(webpage)[0] + + def _handle_error(self, response): + status = int_or_none(try_get( + response, lambda x: x['metadata']['status']) or 200) + if 200 <= status < 300: + return + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']), + expected=True) + + def _call_api(self, domain_id, path, video_id, data=None, headers={}): + headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' + result = self._download_json( + 'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id, + 'Downloading %s JSON' % path, data=urlencode_postdata(data), + headers=headers) + self._handle_error(result) + return result['result'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + domain_id, video_id = mobj.group('domain_id', 'id') + + # Reverse engineered from JS code (see getDeviceID function) + device_id = '%d:%d:%d%d' % ( + random.randint(1, 4), int(time.time()), + random.randint(1e4, 99999), random.randint(1, 9)) + + result = self._call_api(domain_id, 'session/init', video_id, data={ + 'nxp_devh': device_id, + 'nxp_userh': '', + 'precid': '0', + 'playlicense': '0', + 'screenx': '1920', + 'screeny': '1080', + 'playerversion': '6.0.00', + 'gateway': 'html5', + 'adGateway': '', + 'explicitlanguage': 'en-US', + 'addTextTemplates': '1', + 'addDomainData': '1', + 'addAdModel': '1', + }, headers={ + 'X-Request-Enable-Auth-Fallback': 
'1', + }) + + cid = result['general']['cid'] + + # As described in [1] X-Request-Token generation algorithm is + # as follows: + # md5( operation + domain_id + domain_secret ) + # where domain_secret is a static value that will be given by nexx.tv + # as per [1]. Here is how this "secret" is generated (reversed + # from _play.api.init function, search for clienttoken). So it's + # actually not static and not that much of a secret. + # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf + secret = result['device']['clienttoken'][int(device_id[0]):] + secret = secret[0:len(secret) - int(device_id[-1])] + + op = 'byid' + + # Reversed from JS code for _play.api.call function (search for + # X-Request-Token) + request_token = hashlib.md5( + ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() + + video = self._call_api( + domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ + 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', + 'addInteractionOptions': '1', + 'addStatusDetails': '1', + 'addStreamDetails': '1', + 'addCaptions': '1', + 'addScenes': '1', + 'addHotSpots': '1', + 'addBumpers': '1', + 'captionFormat': 'data', + }, headers={ + 'X-Request-CID': cid, + 'X-Request-Token': request_token, + }) + + general = video['general'] + title = general['title'] + + stream_data = video['streamdata'] + language = general.get('language_raw') or '' + + # TODO: reverse more cdns and formats + + cdn = stream_data['cdnType'] + assert cdn == 'azure' + + azure_locator = stream_data['azureLocator'] + + AZURE_URL = 'http://nx-p%02d.akamaized.net/' + + for secure in ('s', ''): + cdn_shield = stream_data.get('cdnShieldHTTP%s' % secure.upper()) + if cdn_shield: + azure_base = 'http%s://%s' % (secure, cdn_shield) + break + else: + azure_base = AZURE_URL % int(stream_data['azureAccount'].replace('nexxplayplus', '')) + + is_ml = ',' in language + azure_m3u8_url = '%s%s/%s_src%s.ism/Manifest(format=m3u8-aapl)' % ( + 
azure_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + + protection_token = try_get( + video, lambda x: x['protectiondata']['token'], compat_str) + if protection_token: + azure_m3u8_url += '?hdnts=%s' % protection_token + + formats = self._extract_m3u8_formats( + azure_m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='%s-hls' % cdn) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'alt_title': general.get('subtitle'), + 'description': general.get('description'), + 'release_year': int_or_none(general.get('year')), + 'creator': general.get('studio') or general.get('studio_adref'), + 'thumbnail': try_get( + video, lambda x: x['imagedata']['thumb'], compat_str), + 'duration': parse_duration(general.get('runtime')), + 'timestamp': int_or_none(general.get('uploaded')), + 'episode_number': int_or_none(try_get( + video, lambda x: x['episodedata']['episode'])), + 'season_number': int_or_none(try_get( + video, lambda x: x['episodedata']['season'])), + 'formats': formats, + } + + +class NexxEmbedIE(InfoExtractor): + _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', + 'md5': '16746bfc28c42049492385c989b26c4a', + 'info_dict': { + 'id': '161464', + 'ext': 'mp4', + 'title': 'Nervenkitzel Achterbahn', + 'alt_title': 'Karussellbauer in Deutschland', + 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', + 'release_year': 2005, + 'creator': 'SPIEGEL TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2761, + 'timestamp': 1394021479, + 'upload_date': '20140305', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + } + + @staticmethod + def _extract_urls(webpage): + # Reference: + # 1. 
https://nx-s.akamaized.net/files/201510/44.pdf + + # iFrame Embed Integration + return [mobj.group('url') for mobj in re.finditer( + r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1', + webpage)] + + def _real_extract(self, url): + embed_id = self._match_id(url) + + webpage = self._download_webpage(url, embed_id) + + return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 08a75929e..510b1c41f 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -12,6 +12,7 @@ class NickIE(MTVServicesInfoExtractor): IE_NAME = 'nick.com' _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' + _GEO_COUNTRIES = ['US'] _TESTS = [{ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', 'playlist': [ @@ -74,7 +75,7 @@ class NickIE(MTVServicesInfoExtractor): class NickDeIE(MTVServicesInfoExtractor): IE_NAME = 'nick.de' - _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.de|nickelodeon\.(?:nl|at))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', 'only_matching': True, @@ -87,6 +88,9 @@ class NickDeIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht', 'only_matching': True, + }, { + 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom', + 'only_matching': True, }] def _extract_mrss_url(self, webpage, host): @@ -124,3 +128,21 @@ class NickNightIE(NickDeIE): return 
self._search_regex( r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, 'mrss url', group='url') + + +class NickRuIE(MTVServicesInfoExtractor): + IE_NAME = 'nickelodeonru' + _VALID_URL = r'https?://(?:www\.)nickelodeon\.ru/(?:playlist|shows|videos)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 8baac23e4..026329d3e 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -1,23 +1,27 @@ # coding: utf-8 from __future__ import unicode_literals -import re import json import datetime from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urlparse, ) from ..utils import ( + determine_ext, + dict_get, ExtractorError, int_or_none, + float_or_none, parse_duration, parse_iso8601, - sanitized_Request, - xpath_text, - determine_ext, + remove_start, + try_get, + unified_timestamp, urlencode_postdata, + xpath_text, ) @@ -32,12 +36,15 @@ class NiconicoIE(InfoExtractor): 'id': 'sm22312215', 'ext': 'mp4', 'title': 'Big Buck Bunny', + 'thumbnail': r're:https?://.*', 'uploader': 'takuya0301', 'uploader_id': '2698420', 'upload_date': '20131123', 'timestamp': 1385182762, 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'duration': 33, + 'view_count': int, + 'comment_count': int, }, 'skip': 'Requires an account', }, { @@ -49,6 +56,7 @@ class NiconicoIE(InfoExtractor): 'ext': 'swf', 'title': '【鏡音リン】Dance on 
media【オリジナル】take2!', 'description': 'md5:689f066d74610b3b22e0f1739add0f58', + 'thumbnail': r're:https?://.*', 'uploader': 'りょうた', 'uploader_id': '18822557', 'upload_date': '20110429', @@ -65,9 +73,11 @@ class NiconicoIE(InfoExtractor): 'ext': 'unknown_video', 'description': 'deleted', 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', + 'thumbnail': r're:https?://.*', 'upload_date': '20071224', 'timestamp': int, # timestamp field has different value if logged in 'duration': 304, + 'view_count': int, }, 'skip': 'Requires an account', }, { @@ -77,15 +87,57 @@ class NiconicoIE(InfoExtractor): 'ext': 'mp4', 'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~', 'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1', + 'thumbnail': r're:https?://.*', 'timestamp': 1388851200, 'upload_date': '20140104', 'uploader': 'アニメロチャンネル', 'uploader_id': '312', }, 'skip': 'The viewing period of the video you were searching for has expired.', + }, { + # video not available via `getflv`; "old" HTML5 video + 'url': 'http://www.nicovideo.jp/watch/sm1151009', + 'md5': '8fa81c364eb619d4085354eab075598a', + 'info_dict': { + 'id': 'sm1151009', + 'ext': 'mp4', + 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', + 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', + 'thumbnail': r're:https?://.*', + 'duration': 184, + 'timestamp': 1190868283, + 'upload_date': '20070927', + 'uploader': 'denden2', + 'uploader_id': '1392194', + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', + }, { + # "New" HTML5 video + 'url': 'http://www.nicovideo.jp/watch/sm31464864', + 'md5': '351647b4917660986dc0fa8864085135', + 'info_dict': { + 'id': 'sm31464864', + 'ext': 'mp4', + 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質', + 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', + 'timestamp': 1498514060, + 'upload_date': '20170626', + 'uploader': 'ゲス', + 'uploader_id': '40826363', + 'thumbnail': r're:https?://.*', + 'duration': 198, + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires 
an account', + }, { + 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', + 'only_matching': True, }] - _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' + _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' def _real_initialize(self): @@ -98,19 +150,102 @@ class NiconicoIE(InfoExtractor): return True # Log in + login_ok = True login_form_strs = { - 'mail': username, + 'mail_tel': username, 'password': password, } - login_data = urlencode_postdata(login_form_strs) - request = sanitized_Request( - 'https://secure.nicovideo.jp/secure/login', login_data) - login_results = self._download_webpage( - request, None, note='Logging in', errnote='Unable to log in') - if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: + urlh = self._request_webpage( + 'https://account.nicovideo.jp/api/v1/login', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(login_form_strs)) + if urlh is False: + login_ok = False + else: + parts = compat_urlparse.urlparse(urlh.geturl()) + if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': + login_ok = False + if not login_ok: self._downloader.report_warning('unable to log in: bad username or password') - return False - return True + return login_ok + + def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): + def yesno(boolean): + return 'yes' if boolean else 'no' + + session_api_data = api_data['video']['dmcInfo']['session_api'] + session_api_endpoint = session_api_data['urls'][0] + + format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + + session_response = self._download_json( + session_api_endpoint['url'], video_id, + query={'_format': 'json'}, + headers={'Content-Type': 'application/json'}, + note='Downloading JSON metadata for %s' % format_id, + 
data=json.dumps({ + 'session': { + 'client_info': { + 'player_id': session_api_data['player_id'], + }, + 'content_auth': { + 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]], + 'content_key_timeout': session_api_data['content_key_timeout'], + 'service_id': 'nicovideo', + 'service_user_id': session_api_data['service_user_id'] + }, + 'content_id': session_api_data['content_id'], + 'content_src_id_sets': [{ + 'content_src_ids': [{ + 'src_id_to_mux': { + 'audio_src_ids': [audio_quality['id']], + 'video_src_ids': [video_quality['id']], + } + }] + }], + 'content_type': 'movie', + 'content_uri': '', + 'keep_method': { + 'heartbeat': { + 'lifetime': session_api_data['heartbeat_lifetime'] + } + }, + 'priority': session_api_data['priority'], + 'protocol': { + 'name': 'http', + 'parameters': { + 'http_parameters': { + 'parameters': { + 'http_output_download_parameters': { + 'use_ssl': yesno(session_api_endpoint['is_ssl']), + 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']), + } + } + } + } + }, + 'recipe_id': session_api_data['recipe_id'], + 'session_operation_auth': { + 'session_operation_auth_by_signature': { + 'signature': session_api_data['signature'], + 'token': session_api_data['token'], + } + }, + 'timing_constraint': 'unlimited' + } + })) + + resolution = video_quality.get('resolution', {}) + + return { + 'url': session_response['data']['session']['content_uri'], + 'format_id': format_id, + 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 + 'abr': float_or_none(audio_quality.get('bitrate'), 1000), + 'vbr': float_or_none(video_quality.get('bitrate'), 1000), + 'height': resolution.get('height'), + 'width': resolution.get('width'), + } def _real_extract(self, url): video_id = self._match_id(url) @@ -123,30 +258,84 @@ class NiconicoIE(InfoExtractor): if video_id.startswith('so'): video_id = self._match_id(handle.geturl()) - video_info = self._download_xml( - 
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, - note='Downloading video info page') - - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') - - flv_info = compat_urlparse.parse_qs(flv_info_webpage) - if 'url' not in flv_info: - if 'deleted' in flv_info: - raise ExtractorError('The video has been deleted.', - expected=True) - elif 'closed' in flv_info: - raise ExtractorError('Niconico videos now require logging in', - expected=True) - else: - raise ExtractorError('Unable to find video URL') - - video_real_url = flv_info['url'][0] + api_data = self._parse_json(self._html_search_regex( + 'data-api-data="([^"]+)"', webpage, + 'API data', default='{}'), video_id) + + def _format_id_from_url(video_url): + return 'economy' if video_real_url.endswith('low') else 'normal' + + try: + video_real_url = api_data['video']['smileInfo']['url'] + except KeyError: # Flash videos + # Get flv info + flv_info_webpage = self._download_webpage( + 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', + video_id, 'Downloading flv info') + + flv_info = compat_urlparse.parse_qs(flv_info_webpage) + if 'url' not in flv_info: + if 'deleted' in flv_info: + raise ExtractorError('The video has been deleted.', + expected=True) + elif 'closed' in flv_info: + raise ExtractorError('Niconico videos now require logging in', + expected=True) + elif 'error' in flv_info: + raise ExtractorError('%s reports error: %s' % ( + self.IE_NAME, flv_info['error'][0]), expected=True) + else: + raise ExtractorError('Unable to find video URL') + + video_info_xml = self._download_xml( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, + video_id, note='Downloading video info page') + + def get_video_info(items): + if not isinstance(items, list): + items = [items] + for item in items: + ret = xpath_text(video_info_xml, './/' + item) + if ret: + return ret + + video_real_url = 
flv_info['url'][0] + + extension = get_video_info('movie_type') + if not extension: + extension = determine_ext(video_real_url) + + formats = [{ + 'url': video_real_url, + 'ext': extension, + 'format_id': _format_id_from_url(video_real_url), + }] + else: + formats = [] + + dmc_info = api_data['video'].get('dmcInfo') + if dmc_info: # "New" HTML5 videos + quality_info = dmc_info['quality'] + for audio_quality in quality_info['audios']: + for video_quality in quality_info['videos']: + if not audio_quality['available'] or not video_quality['available']: + continue + formats.append(self._extract_format_for_quality( + api_data, video_id, audio_quality, video_quality)) + + self._sort_formats(formats) + else: # "Old" HTML5 videos + formats = [{ + 'url': video_real_url, + 'ext': 'mp4', + 'format_id': _format_id_from_url(video_real_url), + }] + + def get_video_info(items): + return dict_get(api_data['video'], items) # Start extracting information - title = xpath_text(video_info, './/title') + title = get_video_info('title') if not title: title = self._og_search_title(webpage, default=None) if not title: @@ -160,18 +349,15 @@ class NiconicoIE(InfoExtractor): watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} video_detail = watch_api_data.get('videoDetail', {}) - extension = xpath_text(video_info, './/movie_type') - if not extension: - extension = determine_ext(video_real_url) - thumbnail = ( - xpath_text(video_info, './/thumbnail_url') or + get_video_info(['thumbnail_url', 'thumbnailURL']) or self._html_search_meta('image', webpage, 'thumbnail', default=None) or video_detail.get('thumbnail')) - description = xpath_text(video_info, './/description') + description = get_video_info('description') - timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve')) + timestamp = (parse_iso8601(get_video_info('first_retrieve')) or + unified_timestamp(get_video_info('postedDateTime'))) if not timestamp: match = 
self._html_search_meta('datePublished', webpage, 'date published', default=None) if match: @@ -181,7 +367,7 @@ class NiconicoIE(InfoExtractor): video_detail['postedAt'].replace('/', '-'), delimiter=' ', timezone=datetime.timedelta(hours=9)) - view_count = int_or_none(xpath_text(video_info, './/view_counter')) + view_count = int_or_none(get_video_info(['view_counter', 'viewCount'])) if not view_count: match = self._html_search_regex( r'>Views: <strong[^>]*>([^<]+)</strong>', @@ -190,38 +376,33 @@ class NiconicoIE(InfoExtractor): view_count = int_or_none(match.replace(',', '')) view_count = view_count or video_detail.get('viewCount') - comment_count = int_or_none(xpath_text(video_info, './/comment_num')) + comment_count = (int_or_none(get_video_info('comment_num')) or + video_detail.get('commentCount') or + try_get(api_data, lambda x: x['thread']['commentCount'])) if not comment_count: match = self._html_search_regex( r'>Comments: <strong[^>]*>([^<]+)</strong>', webpage, 'comment count', default=None) if match: comment_count = int_or_none(match.replace(',', '')) - comment_count = comment_count or video_detail.get('commentCount') duration = (parse_duration( - xpath_text(video_info, './/length') or + get_video_info('length') or self._html_search_meta( 'video:duration', webpage, 'video duration', default=None)) or - video_detail.get('length')) + video_detail.get('length') or + get_video_info('duration')) - webpage_url = xpath_text(video_info, './/watch_url') or url + webpage_url = get_video_info('watch_url') or url - if video_info.find('.//ch_id') is not None: - uploader_id = video_info.find('.//ch_id').text - uploader = video_info.find('.//ch_name').text - elif video_info.find('.//user_id') is not None: - uploader_id = video_info.find('.//user_id').text - uploader = video_info.find('.//user_nickname').text - else: - uploader_id = uploader = None + owner = api_data.get('owner', {}) + uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') + uploader = 
get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') return { 'id': video_id, - 'url': video_real_url, 'title': title, - 'ext': extension, - 'format_id': 'economy' if video_real_url.endswith('low') else 'normal', + 'formats': formats, 'thumbnail': thumbnail, 'description': description, 'uploader': uploader, diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index f5e3f6815..9b5ad5a9f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + extract_attributes, get_element_by_class, urlencode_postdata, ) @@ -56,17 +57,24 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for player_url, kind in re.findall(r'<a[^>]+href="(/player[^"]+)".+?<img[^>]+src="[^"]+qf_btn_([^".]+)', webpage): - player_url = compat_urlparse.urljoin(url, player_url) - + for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage): + player = extract_attributes(mobj.group(0)) + player_path = player.get('href') + if not player_path: + continue + kind = self._search_regex( + r'(low|high)$', player.get('class') or '', 'kind', + default='low') + player_url = compat_urlparse.urljoin(url, player_path) player_page = self._download_webpage( player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native', - preference=2 if 'hq' in kind else 1) - formats.extend(entries[0]['formats']) + m3u8_entry_protocol='m3u8_native') + kind_formats = entries[0]['formats'] + for f in kind_formats: + f['quality'] = 2 if kind == 'high' else 1 + formats.extend(kind_formats) self._sort_formats(formats) diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py new file mode 100644 index 000000000..63e58aae2 --- 
/dev/null +++ b/youtube_dl/extractor/nonktube.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .nuevo import NuevoBaseIE + + +class NonkTubeIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized', + 'info_dict': { + 'id': '118636', + 'ext': 'mp4', + 'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized', + 'age_limit': 18, + 'duration': 1150.98, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.nonktube.com/embed/118636', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._extract_nuevo( + 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s' + % video_id, video_id) + + info['age_limit'] = 18 + return info diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index f7fa098a5..974de3c3e 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + js_to_json, smuggle_url, try_get, ) @@ -24,8 +25,6 @@ class NoovoIE(InfoExtractor): 'timestamp': 1491399228, 'upload_date': '20170405', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': 'RPM+', }, 'params': { @@ -37,13 +36,11 @@ class NoovoIE(InfoExtractor): 'info_dict': { 'id': '5395865725001', 'title': 'Épisode 13 : Les retrouvailles', - 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'description': 'md5:888c3330f0c1b4476c5bc99a1c040473', 'ext': 'mp4', 'timestamp': 1492019320, 'upload_date': '20170412', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': "L'amour est dans le pré", 'season_number': 5, 'episode': 'Épisode 13', @@ -58,40 +55,46 @@ class 
NoovoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, - video_id)['data'] + webpage = self._download_webpage(url, video_id) - content = try_get(data, lambda x: x['contents'][0]) + bc_url = BrightcoveNewIE._extract_url(self, webpage) - brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + data = self._parse_json( + self._search_regex( + r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + + title = try_get( + data, lambda x: x['video']['nom'], + compat_str) or self._html_search_meta( + 'dcterms.Title', webpage, 'title', fatal=True) + + description = self._html_search_meta( + ('dcterms.Description', 'description'), webpage, 'description') series = try_get( - data, ( - lambda x: x['show']['title'], - lambda x: x['season']['show']['title']), - compat_str) + data, lambda x: x['emission']['nom']) or self._search_regex( + r'<div[^>]+class="banner-card__subtitle h4"[^>]*>([^<]+)', + webpage, 'series', default=None) - episode = None - og = data.get('og') - if isinstance(og, dict) and og.get('type') == 'video.episode': - episode = og.get('title') + season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {} + season = try_get(season_el, lambda x: x['nom'], compat_str) + season_number = int_or_none(try_get(season_el, lambda x: x['numero'])) - video = content or data + episode_el = try_get(season_el, lambda x: x['episode'], dict) or {} + episode = try_get(episode_el, lambda x: x['nom'], compat_str) + episode_number = int_or_none(try_get(episode_el, lambda x: x['numero'])) return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['CA']}), - 'id': brightcove_id, - 'title': video.get('title'), - 'creator': video.get('source'), - 'view_count': 
int_or_none(video.get('viewsCount')), + 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'title': title, + 'description': description, 'series': series, - 'season_number': int_or_none(try_get( - data, lambda x: x['season']['seasonNumber'])), + 'season': season, + 'season_number': season_number, 'episode': episode, - 'episode_number': int_or_none(data.get('episodeNumber')), + 'episode_number': episode_number, } diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 79296f0ef..fa4ef20c5 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -28,17 +28,17 @@ class NPOBaseIE(InfoExtractor): class NPOIE(NPOBaseIE): IE_NAME = 'npo' - IE_DESC = 'npo.nl and ntr.nl' + IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl' _VALID_URL = r'''(?x) (?: npo:| https?:// (?:www\.)? (?: - npo\.nl/(?!live|radio)(?:[^/]+/){2}| + npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| ntr\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| - zapp\.nl/[^/]+/[^/]+/ + (?:zapp|npo3)\.nl/(?:[^/]+/){2} ) ) (?P<id>[^/?#]+) @@ -147,9 +147,15 @@ class NPOIE(NPOBaseIE): 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', 'only_matching': True, }, { + 'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870', + 'only_matching': True, + }, { # live stream 'url': 'npo:LI_NL1_4188102', 'only_matching': True, + }, { + 'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373', + 'only_matching': True, }] def _real_extract(self, url): @@ -338,7 +344,7 @@ class NPOLiveIE(NPOBaseIE): webpage = self._download_webpage(url, display_id) live_id = self._search_regex( - r'data-prid="([^"]+)"', webpage, 'live id') + [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id') return { '_type': 'url_transparent', diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7fe79cb53..18ead9426 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -148,13 +148,34 @@ class 
NRKBaseIE(InfoExtractor): vcodec = 'none' if data.get('mediaType') == 'Audio' else None - # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged - for entry in entries: entry.update(common_info) for f in entry['formats']: f['vcodec'] = vcodec + points = data.get('shortIndexPoints') + if isinstance(points, list): + chapters = [] + for next_num, point in enumerate(points, start=1): + if not isinstance(point, dict): + continue + start_time = parse_duration(point.get('startPoint')) + if start_time is None: + continue + end_time = parse_duration( + data.get('duration') + if next_num == len(points) + else points[next_num].get('startPoint')) + if end_time is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': point.get('title'), + }) + if chapters and len(entries) == 1: + entries[0]['chapters'] = chapters + return self.playlist_result(entries, video_id, title, description) @@ -216,7 +237,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? 
''' % _EPISODE_RE - _API_HOST = 'psapi-we.nrk.no' + _API_HOST = 'psapi-ne.nrk.no' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py index 87fb94d1f..be1e09d37 100644 --- a/youtube_dl/extractor/nuevo.py +++ b/youtube_dl/extractor/nuevo.py @@ -10,9 +10,10 @@ from ..utils import ( class NuevoBaseIE(InfoExtractor): - def _extract_nuevo(self, config_url, video_id): + def _extract_nuevo(self, config_url, video_id, headers={}): config = self._download_xml( - config_url, video_id, transform_source=lambda s: s.strip()) + config_url, video_id, transform_source=lambda s: s.strip(), + headers=headers) title = xpath_text(config, './title', 'title', fatal=True).strip() video_id = xpath_text(config, './mediaid', default=video_id) diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 94f57990b..58da1bc27 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -11,6 +11,7 @@ from ..utils import ( get_element_by_class, int_or_none, js_to_json, + NO_DEFAULT, parse_iso8601, remove_start, strip_or_none, @@ -199,6 +200,19 @@ class OnetPlIE(InfoExtractor): 'timestamp': 1487078046, }, }, { + # embedded via pulsembed + 'url': 'http://film.onet.pl/pensjonat-nad-rozlewiskiem-relacja-z-planu-serialu/y428n0', + 'info_dict': { + 'id': '501235.965429946', + 'ext': 'mp4', + 'title': '"Pensjonat nad rozlewiskiem": relacja z planu serialu', + 'upload_date': '20170622', + 'timestamp': 1498159955, + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3', 'only_matching': True, }, { @@ -212,13 +226,25 @@ class OnetPlIE(InfoExtractor): 'only_matching': True, }] + def _search_mvp_id(self, webpage, default=NO_DEFAULT): + return self._search_regex( + r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage, 'mvp id', + default=default) + def _real_extract(self, url): 
video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - mvp_id = self._search_regex( - r'data-params-mvp=["\'](\d+\.\d+)', webpage, 'mvp id') + mvp_id = self._search_mvp_id(webpage, default=None) + + if not mvp_id: + pulsembed_url = self._search_regex( + r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1', + webpage, 'pulsembed url', group='url') + webpage = self._download_webpage( + pulsembed_url, video_id, 'Downloading pulsembed webpage') + mvp_id = self._search_mvp_id(webpage) return self.url_result( 'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 84be2b1e3..52580baed 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -3,12 +3,14 @@ import re import base64 from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - int_or_none, - float_or_none, + determine_ext, ExtractorError, + float_or_none, + int_or_none, + try_get, unsmuggle_url, - determine_ext, ) from ..compat import compat_urllib_parse_urlencode @@ -39,13 +41,15 @@ class OoyalaBaseIE(InfoExtractor): formats = [] if cur_auth_data['authorized']: for stream in cur_auth_data['streams']: - s_url = base64.b64decode( - stream['url']['data'].encode('ascii')).decode('utf-8') - if s_url in urls: + url_data = try_get(stream, lambda x: x['url']['data'], compat_str) + if not url_data: + continue + s_url = base64.b64decode(url_data.encode('ascii')).decode('utf-8') + if not s_url or s_url in urls: continue urls.append(s_url) ext = determine_ext(s_url, None) - delivery_type = stream['delivery_type'] + delivery_type = stream.get('delivery_type') if delivery_type == 'hls' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native', @@ -65,7 +69,7 @@ class OoyalaBaseIE(InfoExtractor): else: formats.append({ 'url': s_url, - 'ext': ext or 
stream.get('delivery_type'), + 'ext': ext or delivery_type, 'vcodec': stream.get('video_codec'), 'format_id': delivery_type, 'width': int_or_none(stream.get('width')), @@ -136,6 +140,11 @@ class OoyalaIE(OoyalaBaseIE): 'title': 'Divide Tool Path.mp4', 'duration': 204.405, } + }, + { + # empty stream['url']['data'] + 'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 1e2c54e68..74fe8017e 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -2,20 +2,19 @@ from __future__ import unicode_literals import re -import calendar -import datetime from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + determine_ext, + float_or_none, HEADRequest, - unified_strdate, - strip_jsonp, int_or_none, - float_or_none, - determine_ext, + orderedSet, remove_end, + strip_jsonp, unescapeHTML, + unified_strdate, ) @@ -144,77 +143,25 @@ class ORFTVthekIE(InfoExtractor): } -class ORFOE1IE(InfoExtractor): - IE_NAME = 'orf:oe1' - IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' - - # Audios on ORF radio are only available for 7 days, so we can't add tests. 
- _TESTS = [{ - 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', - 'only_matching': True, - }, { - 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - data = self._download_json( - 'http://oe1.orf.at/programm/%s/konsole' % show_id, - show_id - ) - - timestamp = datetime.datetime.strptime('%s %s' % ( - data['item']['day_label'], - data['item']['time'] - ), '%d.%m.%Y %H:%M') - unix_timestamp = calendar.timegm(timestamp.utctimetuple()) - - return { - 'id': show_id, - 'title': data['item']['title'], - 'url': data['item']['url_stream'], - 'ext': 'mp3', - 'description': data['item'].get('info'), - 'timestamp': unix_timestamp - } - - -class ORFFM4IE(InfoExtractor): - IE_NAME = 'orf:fm4' - IE_DESC = 'radio FM4' - _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' - - _TEST = { - 'url': 'http://fm4.orf.at/player/20160110/IS/', - 'md5': '01e736e8f1cef7e13246e880a59ad298', - 'info_dict': { - 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244', - 'ext': 'mp3', - 'title': 'Im Sumpf', - 'description': 'md5:384c543f866c4e422a55f66a62d669cd', - 'duration': 7173, - 'timestamp': 1452456073, - 'upload_date': '20160110', - }, - 'skip': 'Live streams on FM4 got deleted soon', - } - +class ORFRadioIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + station = mobj.group('station') show_date = mobj.group('date') show_id = mobj.group('show') + if station == 'fm4': + show_id = '4%s' % show_id + data = self._download_json( - 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id), + 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date), show_id ) def extract_entry_dict(info, title, subtitle): return { 'id': info['loopStreamId'].replace('.mp3', ''), - 'url': 
'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'], + 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']), 'title': title, 'description': subtitle, 'duration': (info['end'] - info['start']) / 1000, @@ -233,6 +180,47 @@ class ORFFM4IE(InfoExtractor): } +class ORFFM4IE(ORFRadioIE): + IE_NAME = 'orf:fm4' + IE_DESC = 'radio FM4' + _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://fm4.orf.at/player/20170107/CC', + 'md5': '2b0be47375432a7ef104453432a19212', + 'info_dict': { + 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', + 'ext': 'mp3', + 'title': 'Solid Steel Radioshow', + 'description': 'Die Mixshow von Coldcut und Ninja Tune.', + 'duration': 3599, + 'timestamp': 1483819257, + 'upload_date': '20170107', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' + } + + +class ORFOE1IE(ORFRadioIE): + IE_NAME = 'orf:oe1' + IE_DESC = 'Radio Österreich 1' + _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://oe1.orf.at/player/20170108/456544', + 'md5': '34d8a6e67ea888293741c86a099b745b', + 'info_dict': { + 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141', + 'ext': 'mp3', + 'title': 'Morgenjournal', + 'duration': 609, + 'timestamp': 1483858796, + 'upload_date': '20170108', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' 
+ } + + class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' @@ -320,3 +308,108 @@ class ORFIPTVIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, } + + +class ORFFM4StoryIE(InfoExtractor): + IE_NAME = 'orf:fm4:story' + IE_DESC = 'fm4.orf.at stories' + _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' + + _TEST = { + 'url': 'http://fm4.orf.at/stories/2865738/', + 'playlist': [{ + 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', + 'info_dict': { + 'id': '547792', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + 'duration': 1748.52, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + }, + }, { + 'md5': 'c6dd2179731f86f4f55a7b49899d515f', + 'info_dict': { + 'id': '547798', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live (2)', + 'duration': 1504.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. 
Hier gibt es Fotos und die gesamte Session als Video.', + }, + }], + } + + def _real_extract(self, url): + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + + entries = [] + all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) + for idx, video_id in enumerate(all_ids): + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['q8c'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') + if idx >= 1: + # Titles are duplicates, make them unique + title += ' (' + str(idx + 1) + ')' + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + entries.append({ + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, 
+ 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + + return self.playlist_result(entries) diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 881f3bcc7..8ed3c6347 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -1,9 +1,13 @@ from __future__ import unicode_literals +import json import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( clean_html, ExtractorError, @@ -34,6 +38,25 @@ class PacktPubIE(PacktPubBaseIE): 'upload_date': '20170331', }, } + _NETRC_MACHINE = 'packtpub' + _TOKEN = None + + def _real_initialize(self): + (username, password) = self._get_login_info() + if username is None: + return + try: + self._TOKEN = self._download_json( + self._MAPT_REST + '/users/tokens', None, + 'Downloading Authorization Token', data=json.dumps({ + 'email': username, + 'password': password, + }).encode())['data']['access'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404): + message = self._parse_json(e.cause.read().decode(), None)['message'] + raise ExtractorError(message, expected=True) + raise def _handle_error(self, response): if response.get('status') != 'success': @@ -51,14 +74,17 @@ class PacktPubIE(PacktPubBaseIE): course_id, chapter_id, video_id = mobj.group( 'course_id', 'chapter_id', 'id') + headers = {} + if self._TOKEN: + headers['Authorization'] = 'Bearer ' + self._TOKEN video = self._download_json( '%s/users/me/products/%s/chapters/%s/sections/%s' % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, - 'Downloading JSON video')['data'] + 'Downloading JSON video', headers=headers)['data'] content = video.get('content') if not content: - raise ExtractorError('This video is locked', expected=True) + self.raise_login_required('This video is locked') video_url = content['file'] diff --git 
a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py index 133cc9b88..c86d70771 100644 --- a/youtube_dl/extractor/pandatv.py +++ b/youtube_dl/extractor/pandatv.py @@ -10,13 +10,13 @@ from ..utils import ( class PandaTVIE(InfoExtractor): IE_DESC = '熊猫TV' - _VALID_URL = r'http://(?:www\.)?panda\.tv/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.panda.tv/10091', + _VALID_URL = r'https?://(?:www\.)?panda\.tv/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.panda.tv/66666', 'info_dict': { - 'id': '10091', + 'id': '66666', 'title': 're:.+', - 'uploader': '囚徒', + 'uploader': '刘杀鸡', 'ext': 'flv', 'is_live': True, }, @@ -24,13 +24,16 @@ class PandaTVIE(InfoExtractor): 'skip_download': True, }, 'skip': 'Live stream is offline', - } + }, { + 'url': 'https://www.panda.tv/66666', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) config = self._download_json( - 'http://www.panda.tv/api_room?roomid=%s' % video_id, video_id) + 'https://www.panda.tv/api_room?roomid=%s' % video_id, video_id) error_code = config.get('errno', 0) if error_code is not 0: @@ -74,7 +77,7 @@ class PandaTVIE(InfoExtractor): continue for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))): formats.append({ - 'url': 'http://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s' + 'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s' % (pl, plflag1, room_key, live_panda, suffix[quality], ext), 'format_id': '%s-%s' % (k, ext), 'quality': quality, diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index 89c95fffb..fc7bd3411 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -19,7 +19,7 @@ class PandoraTVIE(InfoExtractor): IE_NAME = 'pandora.tv' IE_DESC = '판도라TV' _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?' 
- _TEST = { + _TESTS = [{ 'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2', 'info_dict': { 'id': '53294230', @@ -34,7 +34,26 @@ class PandoraTVIE(InfoExtractor): 'view_count': int, 'like_count': int, } - } + }, { + 'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744', + 'info_dict': { + 'id': '54721744', + 'ext': 'flv', + 'title': '[HD] JAPAN COUNTDOWN 170423', + 'description': '[HD] JAPAN COUNTDOWN 170423', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1704.9, + 'upload_date': '20170423', + 'uploader': 'GOGO_UCC', + 'uploader_id': 'gogoucc', + 'view_count': int, + 'like_count': int, + }, + 'params': { + # Test metadata only + 'skip_download': True, + }, + }] def _real_extract(self, url): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) @@ -86,7 +105,7 @@ class PandoraTVIE(InfoExtractor): 'description': info.get('body'), 'thumbnail': info.get('thumbnail') or info.get('poster'), 'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')), - 'upload_date': info['fid'][:8] if isinstance(info.get('fid'), compat_str) else None, + 'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None, 'uploader': info.get('nickname'), 'uploader_id': info.get('upload_userid'), 'view_count': str_to_int(info.get('hit')), diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 0727e381b..8889e4a1a 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, float_or_none, js_to_json, + orderedSet, strip_jsonp, strip_or_none, unified_strdate, @@ -188,7 +189,7 @@ class PBSIE(InfoExtractor): # Direct video URL (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? 
| # Article with embedded player (or direct video) - (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | + (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) @@ -265,6 +266,13 @@ class PBSIE(InfoExtractor): 'playlist_count': 2, }, { + 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', + 'info_dict': { + 'id': 'great-war', + }, + 'playlist_count': 3, + }, + { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { 'id': '2276541483', @@ -338,6 +346,21 @@ class PBSIE(InfoExtractor): }, }, { + # https://github.com/rg3/youtube-dl/issues/13801 + 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/', + 'info_dict': { + 'id': '3003333873', + 'ext': 'mp4', + 'title': 'PBS NewsHour - full episode July 31, 2017', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 3265, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }, + { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, }, @@ -382,10 +405,10 @@ class PBSIE(InfoExtractor): # tabbed frontline videos MULTI_PART_REGEXES = ( r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', - r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', + r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)', ) for p in MULTI_PART_REGEXES: - tabbed_videos = re.findall(p, webpage) + tabbed_videos = orderedSet(re.findall(p, webpage)) if tabbed_videos: return tabbed_videos, presumptive_id, upload_date, description @@ -425,6 +448,9 @@ class PBSIE(InfoExtractor): if url: break + if not url: + url = self._og_search_url(webpage) + mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') diff --git 
a/youtube_dl/extractor/pearvideo.py b/youtube_dl/extractor/pearvideo.py new file mode 100644 index 000000000..1d777221c --- /dev/null +++ b/youtube_dl/extractor/pearvideo.py @@ -0,0 +1,63 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + qualities, + unified_timestamp, +) + + +class PearVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)' + _TEST = { + 'url': 'http://www.pearvideo.com/video_1076290', + 'info_dict': { + 'id': '1076290', + 'ext': 'mp4', + 'title': '小浣熊在主人家玻璃上滚石头:没砸', + 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d', + 'timestamp': 1494275280, + 'upload_date': '20170508', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + quality = qualities( + ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src')) + + formats = [{ + 'url': mobj.group('url'), + 'format_id': mobj.group('id'), + 'quality': quality(mobj.group('id')), + } for mobj in re.finditer( + r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2', + webpage)] + self._sort_formats(formats) + + title = self._search_regex( + (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)', + r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'), + webpage, 'title', group='value') + description = self._search_regex( + (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)', + r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'), + webpage, 'description', default=None, + group='value') or self._html_search_meta('Description', webpage) + timestamp = unified_timestamp(self._search_regex( + r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)', + webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/periscope.py 
b/youtube_dl/extractor/periscope.py index 1add6b840..e5e08538c 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -49,7 +49,7 @@ class PeriscopeIE(PeriscopeBaseIE): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?periscope\.tv/(?:(?!\1).)+)\1', webpage) + r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage) if mobj: return mobj.group('url') @@ -80,18 +80,24 @@ class PeriscopeIE(PeriscopeBaseIE): stream = self._call_api( 'getAccessPublic', {'broadcast_id': token}, token) + video_urls = set() formats = [] - for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): + for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): video_url = stream.get(format_id + '_url') - if not video_url: + if not video_url or video_url in video_urls: continue - f = { + video_urls.add(video_url) + if format_id != 'rtmp': + formats.extend(self._extract_m3u8_formats( + video_url, token, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=False)) + continue + formats.append({ 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', - } - if format_id != 'rtmp': - f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8' - formats.append(f) + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index e45d9fe55..f6a9131b1 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -18,6 +18,7 @@ from ..utils import ( parse_duration, qualities, srt_subtitles_timecode, + try_get, update_url_query, urlencode_postdata, ) @@ -26,6 +27,39 @@ from ..utils import ( class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' + def _download_course(self, course_id, url, display_id): + try: + return 
self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + '%s/player/functions/rpc' % self._API_BASE, display_id, + 'Downloading course JSON', + data=json.dumps({ + 'fn': 'bootstrapPlayer', + 'payload': { + 'courseId': course_id, + }, + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + + course = try_get(response, lambda x: x['payload']['course'], dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) + class PluralsightIE(PluralsightBaseIE): IE_NAME = 'pluralsight' @@ -162,10 +196,7 @@ class PluralsightIE(PluralsightBaseIE): display_id = '%s-%s' % (name, clip_id) - course = self._download_json( - 'https://app.pluralsight.com/player/user/api/v1/player/payload', - display_id, data=urlencode_postdata({'courseId': course_name}), - headers={'Referer': url}) + course = self._download_course(course_name, url, display_id) collection = course['modules'] @@ -224,6 +255,7 @@ class PluralsightIE(PluralsightBaseIE): req_format_split = req_format.split('-', 1) if len(req_format_split) > 1: req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) for allowed_quality in ALLOWED_QUALITIES: if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: return (AllowedQuality(req_ext, (req_quality, )), ) @@ -330,18 +362,7 @@ class PluralsightCourseIE(PluralsightBaseIE): # TODO: PSM cookie - course = self._download_json( - '%s/player/functions/rpc' % self._API_BASE, course_id, - 'Downloading course JSON', - data=json.dumps({ - 'fn': 'bootstrapPlayer', - 'payload': 
{ - 'courseId': course_id, - } - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8' - })['payload']['course'] + course = self._download_course(course_id, url, course_id) title = course['title'] course_name = course['name'] diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py index f20946a2b..25fcebf9f 100644 --- a/youtube_dl/extractor/podomatic.py +++ b/youtube_dl/extractor/podomatic.py @@ -9,39 +9,46 @@ from ..utils import int_or_none class PodomaticIE(InfoExtractor): IE_NAME = 'podomatic' - _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' + _VALID_URL = r'''(?x) + (?P<proto>https?):// + (?: + (?P<channel>[^.]+)\.podomatic\.com/entry| + (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes + )/ + (?P<id>[^/?#&]+) + ''' - _TESTS = [ - { - 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', - 'md5': '84bb855fcf3429e6bf72460e1eed782d', - 'info_dict': { - 'id': '2009-01-02T16_03_35-08_00', - 'ext': 'mp3', - 'uploader': 'Science Teaching Tips', - 'uploader_id': 'scienceteachingtips', - 'title': '64. When the Moon Hits Your Eye', - 'duration': 446, - } - }, - { - 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', - 'md5': 'd2cf443931b6148e27638650e2638297', - 'info_dict': { - 'id': '2013-11-15T16_31_21-08_00', - 'ext': 'mp3', - 'uploader': 'Ostbahnhof / Techno Mix', - 'uploader_id': 'ostbahnhof', - 'title': 'Einunddreizig', - 'duration': 3799, - } - }, - ] + _TESTS = [{ + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. 
When the Moon Hits Your Eye', + 'duration': 446, + } + }, { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, { + 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - channel = mobj.group('channel') + channel = mobj.group('channel') or mobj.group('channel_2') json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + '?permalink=true&rtmp=0') % diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index 2ac1fcb0b..978d6f813 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -65,7 +65,7 @@ class PolskieRadioIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) content = self._search_regex( - r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>', + r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', webpage, 'content') timestamp = unified_timestamp(self._html_search_regex( diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 842317e6c..b52879c7a 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor): r'<title>(.+?) 
- .*?[Pp]ornHD.*?</title>'], webpage, 'title') sources = self._parse_json(js_to_json(self._search_regex( - r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]", + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) if not sources: @@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'(\d+) views\s*<', webpage, 'view count', fatal=False)) thumbnail = self._search_regex( - r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) + r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage, + 'thumbnail', fatal=False, group='url') return { 'id': video_id, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b25f1f193..3428458af 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| + (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor): }, { 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, }] @staticmethod @@ -183,7 +186,7 @@ class PornHubIE(InfoExtractor): title, thumbnail, duration = [None] * 3 video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) view_count = self._extract_count( @@ -224,13 +227,20 @@ class PornHubIE(InfoExtractor): class PornHubPlaylistBaseIE(InfoExtractor): def _extract_entries(self, webpage): + # Only process container div with main playlist content skipping + # drop-down 
menu that uses similar pattern for videos (see + # https://github.com/rg3/youtube-dl/issues/11594). + container = self._search_regex( + r'(?s)(<div[^>]+class=["\']container.+)', webpage, + 'container', default=webpage) + return [ self.url_result( 'http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key(), video_title=title) for video_url, title in orderedSet(re.findall( r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', - webpage)) + container)) ] def _real_extract(self, url): @@ -238,22 +248,18 @@ class PornHubPlaylistBaseIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - # Only process container div with main playlist content skipping - # drop-down menu that uses similar pattern for videos (see - # https://github.com/rg3/youtube-dl/issues/11594). - container = self._search_regex( - r'(?s)(<div[^>]+class=["\']container.+)', webpage, - 'container', default=webpage) - - entries = self._extract_entries(container) + entries = self._extract_entries(webpage) playlist = self._parse_json( self._search_regex( - r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'), - playlist_id) + r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, + 'playlist', default='{}'), + playlist_id, fatal=False) + title = playlist.get('title') or self._search_regex( + r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) return self.playlist_result( - entries, playlist_id, playlist.get('title'), playlist.get('description')) + entries, playlist_id, title, playlist.get('description')) class PornHubPlaylistIE(PornHubPlaylistBaseIE): @@ -293,6 +299,7 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: break + raise page_entries = self._extract_entries(webpage) if not page_entries: break diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 17c27da46..084308aeb 100644 --- 
a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -2,38 +2,37 @@ from __future__ import unicode_literals import random -import time import re +import time from .common import InfoExtractor from ..utils import ( - sanitized_Request, - strip_jsonp, - unescapeHTML, clean_html, ExtractorError, + strip_jsonp, + unescapeHTML, ) class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', - 'md5': '9ce1c1c8445f561506d2e3cfb0255705', + 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', + 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', 'info_dict': { 'id': '004295Et37taLD', 'ext': 'mp3', 'title': '可惜没如果', 'release_date': '20141227', 'creator': '林俊杰', - 'description': 'md5:d327722d0361576fde558f1ac68a7065', + 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', 'thumbnail': r're:^https?://.*\.jpg$', } }, { 'note': 'There is no mp3-320 version of this song.', - 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', + 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', 'info_dict': { 'id': '004MsGEo3DdNxV', @@ -46,14 +45,14 @@ class QQMusicIE(InfoExtractor): } }, { 'note': 'lyrics not in .lrc format', - 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6', + 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', 'info_dict': { 'id': '001JyApY11tIp6', 'ext': 'mp3', 'title': 'Shadows Over Transylvania', 'release_date': '19970225', 'creator': 'Dark Funeral', - 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11', + 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { @@ -105,7 +104,7 @@ class QQMusicIE(InfoExtractor): [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', 
default=None) if albummid: - thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ + thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \ % (albummid[-2:-1], albummid[-1], albummid) guid = self.m_r_get_ruin() @@ -156,15 +155,39 @@ class QQPlaylistBaseIE(InfoExtractor): def qq_static_url(category, mid): return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) - @classmethod - def get_entries_from_page(cls, page): + def get_singer_all_songs(self, singmid, num): + return self._download_webpage( + r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, + query={ + 'format': 'json', + 'inCharset': 'utf8', + 'outCharset': 'utf-8', + 'platform': 'yqq', + 'needNewCode': 0, + 'singermid': singmid, + 'order': 'listen', + 'begin': 0, + 'num': num, + 'songstatus': 1, + }) + + def get_entries_from_page(self, singmid): entries = [] - for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page): - song_mid = unescapeHTML(item).split('|')[-5] - entries.append(cls.url_result( - 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', - song_mid)) + default_num = 1 + json_text = self.get_singer_all_songs(singmid, default_num) + json_obj_all_songs = self._parse_json(json_text, singmid) + + if json_obj_all_songs['code'] == 0: + total = json_obj_all_songs['data']['total'] + json_text = self.get_singer_all_songs(singmid, total) + json_obj_all_songs = self._parse_json(json_text, singmid) + + for item in json_obj_all_songs['data']['list']: + if item['musicData'].get('songmid') is not None: + songmid = item['musicData']['songmid'] + entries.append(self.url_result( + r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) return entries @@ -172,42 +195,32 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'https?://y\.qq\.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = 
r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html' _TEST = { - 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', + 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html', 'info_dict': { 'id': '001BLpXF2DyJe2', 'title': '林俊杰', 'description': 'md5:870ec08f7d8547c29c93010899103751', }, - 'playlist_count': 12, + 'playlist_mincount': 12, } def _real_extract(self, url): mid = self._match_id(url) - singer_page = self._download_webpage( - self.qq_static_url('singer', mid), mid, 'Download singer page') - - entries = self.get_entries_from_page(singer_page) - + entries = self.get_entries_from_page(mid) + singer_page = self._download_webpage(url, mid, 'Download singer page') singer_name = self._html_search_regex( - r"singername\s*:\s*'([^']+)'", singer_page, 'singer name', - default=None) - - singer_id = self._html_search_regex( - r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id', - default=None) - + r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) singer_desc = None - if singer_id: - req = sanitized_Request( - 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id) - req.add_header( - 'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html') + if mid: singer_desc_page = self._download_xml( - req, mid, 'Donwload singer description XML') + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, + 'Donwload singer description XML', + query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, + headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) singer_desc = singer_desc_page.find('./data/info/desc').text @@ -217,10 +230,10 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html' _TESTS = [{ - 
'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', + 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', 'info_dict': { 'id': '000gXCTb2AhRR1', 'title': '我们都是这样长大的', @@ -228,7 +241,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): }, 'playlist_count': 4, }, { - 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3', + 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', 'info_dict': { 'id': '002Y5a3b3AlCu3', 'title': '그리고...', @@ -246,7 +259,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): entries = [ self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] ) for song in album['list'] ] album_name = album.get('name') @@ -260,31 +273,30 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=toplist&p=global_123', + 'url': 'https://y.qq.com/n/yqq/toplist/123.html', 'info_dict': { - 'id': 'global_123', + 'id': '123', 'title': '美国iTunes榜', + 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', }, - 'playlist_count': 10, + 'playlist_count': 100, }, { - 'url': 'http://y.qq.com/#type=toplist&p=top_3', + 'url': 'https://y.qq.com/n/yqq/toplist/3.html', 'info_dict': { - 'id': 'top_3', + 'id': '3', 'title': '巅峰榜·欧美', - 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' - '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' - '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' - '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放' + 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', }, 'playlist_count': 100, }, { - 'url': 'http://y.qq.com/#type=toplist&p=global_106', + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', 'info_dict': { - 'id': 
'global_106', + 'id': '106', 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', }, 'playlist_count': 50, }] @@ -292,18 +304,15 @@ class QQMusicToplistIE(QQPlaylistBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - list_type, num_id = list_id.split("_") - toplist_json = self._download_json( - 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' - % (list_type, num_id), - list_id, 'Download toplist page') + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) - entries = [ - self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] - ) for song in toplist_json['songlist'] - ] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] topinfo = toplist_json.get('topinfo', {}) list_name = topinfo.get('ListName') @@ -314,10 +323,10 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=taoge&id=3462654915', + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', 'info_dict': { 'id': '3462654915', 'title': '韩国5月新歌精选下旬', @@ -326,7 +335,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): 'playlist_count': 40, 'skip': 'playlist gone', }, { - 'url': 'http://y.qq.com/#type=taoge&id=1374105607', + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', 'info_dict': { 'id': '1374105607', 'title': '易入人心的华语民谣', @@ -339,8 +348,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): list_id = self._match_id(url) list_json = self._download_json( - 
'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' - % list_id, list_id, 'Download list page', + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, transform_source=strip_jsonp) if not len(list_json.get('cdlist', [])): if list_json.get('code'): @@ -350,11 +360,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): raise ExtractorError('Unable to get playlist info') cdlist = list_json['cdlist'][0] - entries = [ - self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] - ) for song in cdlist['songlist'] - ] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] list_name = cdlist.get('dissname') list_description = clean_html(unescapeHTML(cdlist.get('desc'))) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 3b40002a8..b952e59b4 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -20,20 +20,37 @@ from ..utils import ( class RadioCanadaIE(InfoExtractor): IE_NAME = 'radiocanada' _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' - _TEST = { - 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', - 'info_dict': { - 'id': '7184272', - 'ext': 'mp4', - 'title': 'Le parcours du tireur capté sur vidéo', - 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', - 'upload_date': '20141023', - }, - 'params': { - # m3u8 download - 'skip_download': True, + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du 
tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, - } + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + ] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -59,6 +76,7 @@ class RadioCanadaIE(InfoExtractor): device_types.append('android') formats = [] + error = None # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file for device_type in device_types: @@ -84,8 +102,8 @@ class RadioCanadaIE(InfoExtractor): if not v_url: continue if v_url == 'null': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, xpath_text(v_data, 'message')), expected=True) + error = xpath_text(v_data, 'message') + continue ext = determine_ext(v_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -129,6 +147,9 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_f4m_formats( base_url + '/manifest.f4m', video_id, f4m_id='hds', fatal=False)) + if not formats and error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) subtitles = {} @@ -141,7 +162,7 @@ class RadioCanadaIE(InfoExtractor): return { 'id': video_id, - 'title': get_meta('Title'), + 'title': get_meta('Title') or get_meta('AV-nomEmission'), 'description': get_meta('Description') or get_meta('ShortDescription'), 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), 'duration': int_or_none(get_meta('length')), diff --git a/youtube_dl/extractor/rai.py 
b/youtube_dl/extractor/rai.py index 81eb9db85..5bf64a56b 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -191,11 +191,12 @@ class RaiPlayIE(RaiBaseIE): info = { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, 'alt_title': media.get('subtitle'), 'description': media.get('description'), - 'uploader': media.get('channel'), - 'creator': media.get('editor'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), 'duration': parse_duration(video.get('duration')), 'timestamp': timestamp, 'thumbnails': thumbnails, @@ -208,10 +209,46 @@ class RaiPlayIE(RaiBaseIE): } info.update(relinker_info) - return info +class RaiPlayLiveIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://www.raiplay.it/dirette/rainews24', + 'info_dict': { + 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', + 'display_id': 'rainews24', + 'ext': 'mp4', + 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:6eca31500550f9376819f174e5644754', + 'uploader': 'Rai News 24', + 'creator': 'Rai News 24', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, + webpage, 'content id') + + return { + '_type': 'url_transparent', + 'ie_key': RaiPlayIE.ie_key(), + 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + 'id': video_id, + 'display_id': display_id, + } + + class RaiIE(RaiBaseIE): _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ @@ -308,11 +345,11 @@ class RaiIE(RaiBaseIE): media_type = media['type'] if 'Audio' in 
media_type: relinker_info = { - 'formats': { + 'formats': [{ 'format_id': media.get('formatoAudio'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), - } + }] } elif 'Video' in media_type: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index afab62426..5d6cc3610 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -13,7 +13,7 @@ from ..utils import ( class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film|live)/(?:AP-\w+/segment/)?(?P<id>AP-\w+)' _TESTS = [{ # film 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', @@ -42,6 +42,22 @@ class RedBullTVIE(InfoExtractor): 'season_number': 2, 'episode_number': 4, }, + 'params': { + 'skip_download': True, + }, + }, { + # segment + 'url': 'https://www.redbull.tv/live/AP-1R5DX49XS1W11/segment/AP-1QSAQJ6V52111/semi-finals', + 'info_dict': { + 'id': 'AP-1QSAQJ6V52111', + 'ext': 'mp4', + 'title': 'Semi Finals - Vans Park Series Pro Tour', + 'description': 'md5:306a2783cdafa9e65e39aa62f514fd97', + 'duration': 11791.991, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', 'only_matching': True, @@ -82,7 +98,8 @@ class RedBullTVIE(InfoExtractor): title = info['title'].strip() formats = self._extract_m3u8_formats( - video['url'], video_id, 'mp4', 'm3u8_native') + video['url'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py new file mode 100644 index 000000000..01c85ee01 --- /dev/null +++ b/youtube_dl/extractor/reddit.py @@ -0,0 +1,114 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + 
ExtractorError, + int_or_none, + float_or_none, +) + + +class RedditIE(InfoExtractor): + _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' + _TEST = { + # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '655d06ace653ea3b87bccfb1b27ec99d', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'zv89llsvexdz', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + formats.extend(self._extract_mpd_formats( + 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, + mpd_id='dash', fatal=False)) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class RedditRIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'That small heart attack.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1501941939, + 'upload_date': '20170805', + 'uploader': 'Antw87', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', + 'only_matching': True, + }, { + # imgur + 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # streamable + 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', + 'only_matching': True, + }, { + # youtube + 'url': 
'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + url + '.json', video_id)[0]['data']['children'][0]['data'] + + video_url = data['url'] + + # Avoid recursing into the same reddit URL + if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: + raise ExtractorError('No media found', expected=True) + + over_18 = data.get('over_18') + if over_18 is True: + age_limit = 18 + elif over_18 is False: + age_limit = 0 + else: + age_limit = None + + return { + '_type': 'url_transparent', + 'url': video_url, + 'title': data.get('title'), + 'thumbnail': data.get('thumbnail'), + 'timestamp': float_or_none(data.get('created_utc')), + 'uploader': data.get('author'), + 'like_count': int_or_none(data.get('ups')), + 'dislike_count': int_or_none(data.get('downs')), + 'comment_count': int_or_none(data.get('num_comments')), + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index c367a6ae7..f70a75256 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -62,7 +63,23 @@ class RedTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) - else: + medias = self._parse_json( + self._search_regex( + r'mediaDefinition\s*:\s*(\[.+?\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = media.get('videoUrl') + if not format_url or not isinstance(format_url, compat_str): + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), 
+ }) + if not formats: video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') formats.append({'url': video_url}) @@ -73,7 +90,7 @@ class RedTubeIE(InfoExtractor): r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', webpage, 'upload date', fatal=False)) duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', webpage, 'view count', fatal=False)) diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index 2340dae53..e921ca3e6 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor): _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)' _TEST = { - 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', + 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET', 'info_dict': { - 'id': '5111223049001', + 'id': '5419055995001', 'ext': 'mp4', - 'title': ': LES HEROS DU 88e ETAGE', - 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.', + 'title': 'UN DELICIEUX PROJET', + 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5', 'uploader_id': '1969646226001', - 'upload_date': '20160904', - 'timestamp': 1472951103, + 'upload_date': '20170502', + 'timestamp': 1493745308, }, 'params': { - # rtmp download 'skip_download': True, }, - 'skip': 'Only works from France', + 'skip': 'only available for a week', } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' @@ 
-35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', + brightcove_id) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 54076de28..3e22998c6 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -15,7 +15,7 @@ class RtlNlIE(InfoExtractor): https?://(?:www\.)? 
(?: rtlxl\.nl/[^\#]*\#!/[^/]+/| - rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= + rtl\.nl/(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=|video/) ) (?P<id>[0-9a-f-]+)''' @@ -70,6 +70,9 @@ class RtlNlIE(InfoExtractor): }, { 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', 'only_matching': True, + }, { + 'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 889fa7628..89d89b65a 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -7,43 +7,84 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, - unified_strdate, + bool_or_none, + int_or_none, + try_get, + unified_timestamp, ) -class RutubeIE(InfoExtractor): +class RutubeBaseIE(InfoExtractor): + def _extract_video(self, video, video_id=None, require_title=True): + title = video['title'] if require_title else video.get('title') + + age_limit = video.get('is_adult') + if age_limit is not None: + age_limit = 18 if age_limit is True else 0 + + uploader_id = try_get(video, lambda x: x['author']['id']) + category = try_get(video, lambda x: x['category']['name']) + + return { + 'id': video.get('id') or video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video.get('duration')), + 'uploader': try_get(video, lambda x: x['author']['name']), + 'uploader_id': compat_str(uploader_id) if uploader_id else None, + 'timestamp': unified_timestamp(video.get('created_ts')), + 'category': [category] if category else None, + 'age_limit': age_limit, + 'view_count': int_or_none(video.get('hits')), + 'comment_count': 
int_or_none(video.get('comments_count')), + 'is_live': bool_or_none(video.get('is_livestream')), + } + + +class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '79938ade01294ef7e27574890d0d3769', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', 'duration': 80, 'uploader': 'NTDRussian', 'uploader_id': '29790', + 'timestamp': 1381943602, 'upload_date': '20131016', 'age_limit': 0, }, - 'params': { - # It requires ffmpeg (m3u8 download) - 'skip_download': True, - }, }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, }, { 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) + @staticmethod def _extract_urls(webpage): return [mobj.group('url') for mobj in re.finditer( @@ -52,12 +93,12 @@ class RutubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + video = self._download_json( 'http://rutube.ru/api/video/%s/?format=json' % video_id, video_id, 'Downloading video JSON') - # Some videos don't have the author field - author = video.get('author') or {} + info = self._extract_video(video, video_id) options = self._download_json( 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, @@ -79,19 +120,8 @@ class RutubeIE(InfoExtractor): }) self._sort_formats(formats) - return { - 'id': 
video['id'], - 'title': video['title'], - 'description': video['description'], - 'duration': video['duration'], - 'view_count': video['hits'], - 'formats': formats, - 'thumbnail': video['thumbnail_url'], - 'uploader': author.get('name'), - 'uploader_id': compat_str(author['id']) if author else None, - 'upload_date': unified_strdate(video['created_ts']), - 'age_limit': 18 if video['is_adult'] else 0, - } + info['formats'] = formats + return info class RutubeEmbedIE(InfoExtractor): @@ -103,7 +133,8 @@ class RutubeEmbedIE(InfoExtractor): 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', - 'ext': 'mp4', + 'ext': 'flv', + 'timestamp': 1387830582, 'upload_date': '20131223', 'uploader_id': '297833', 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', @@ -111,7 +142,7 @@ class RutubeEmbedIE(InfoExtractor): 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', }, 'params': { - 'skip_download': 'Requires ffmpeg', + 'skip_download': True, }, }, { 'url': 'http://rutube.ru/play/embed/8083783', @@ -125,10 +156,51 @@ class RutubeEmbedIE(InfoExtractor): canonical_url = self._html_search_regex( r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, 'Canonical URL') - return self.url_result(canonical_url, 'Rutube') + return self.url_result(canonical_url, RutubeIE.ie_key()) + + +class RutubePlaylistBaseIE(RutubeBaseIE): + def _next_page_url(self, page_num, playlist_id, *args, **kwargs): + return self._PAGE_TEMPLATE % (playlist_id, page_num) + def _entries(self, playlist_id, *args, **kwargs): + next_page_url = None + for pagenum in itertools.count(1): + page = self._download_json( + next_page_url or self._next_page_url( + pagenum, playlist_id, *args, **kwargs), + playlist_id, 'Downloading page %s' % pagenum) + + results = page.get('results') + if not results or not 
isinstance(results, list): + break + + for result in results: + video_url = result.get('video_url') + if not video_url or not isinstance(video_url, compat_str): + continue + entry = self._extract_video(result, require_title=False) + entry.update({ + '_type': 'url', + 'url': video_url, + 'ie_key': RutubeIE.ie_key(), + }) + yield entry -class RutubeChannelIE(InfoExtractor): + next_page_url = page.get('next') + if not next_page_url or not page.get('has_next'): + break + + def _extract_playlist(self, playlist_id, *args, **kwargs): + return self.playlist_result( + self._entries(playlist_id, *args, **kwargs), + playlist_id, kwargs.get('playlist_name')) + + def _real_extract(self, url): + return self._extract_playlist(self._match_id(url)) + + +class RutubeChannelIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' @@ -142,27 +214,8 @@ class RutubeChannelIE(InfoExtractor): _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' - def _extract_videos(self, channel_id, channel_title=None): - entries = [] - for pagenum in itertools.count(1): - page = self._download_json( - self._PAGE_TEMPLATE % (channel_id, pagenum), - channel_id, 'Downloading page %s' % pagenum) - results = page['results'] - if not results: - break - entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) - if not page['has_next']: - break - return self.playlist_result(entries, channel_id, channel_title) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id) - -class RutubeMovieIE(RutubeChannelIE): +class RutubeMovieIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' @@ -176,11 +229,11 @@ class RutubeMovieIE(RutubeChannelIE): movie = self._download_json( self._MOVIE_TEMPLATE % movie_id, movie_id, 
'Downloading movie JSON') - movie_name = movie['name'] - return self._extract_videos(movie_id, movie_name) + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) -class RutubePersonIE(RutubeChannelIE): +class RutubePersonIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' @@ -193,3 +246,37 @@ class RutubePersonIE(RutubeChannelIE): }] _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(RutubePlaylistBaseIE): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', + 'info_dict': { + 'id': '3097', + }, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, + }] + + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' + + @classmethod + def suitable(cls, url): + if not super(RutubePlaylistIE, cls).suitable(url): + return False + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) + + def _real_extract(self, url): + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index a5e672c0a..d2713c19a 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -13,11 +13,15 @@ from ..utils import ( class 
RUTVIE(InfoExtractor): IE_DESC = 'RUTV.RU' _VALID_URL = r'''(?x) - https?://player\.(?:rutv\.ru|vgtrk\.com)/ - (?P<path>flash\d+v/container\.swf\?id= - |iframe/(?P<type>swf|video|live)/id/ - |index/iframe/cast_id/) - (?P<id>\d+)''' + https?:// + (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/ + (?P<path> + flash\d+v/container\.swf\?id=| + iframe/(?P<type>swf|video|live)/id/| + index/iframe/cast_id/ + ) + (?P<id>\d+) + ''' _TESTS = [ { @@ -99,17 +103,21 @@ class RUTVIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/', + 'only_matching': True, + }, ] @classmethod def _extract_url(cls, webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) if mobj: return mobj.group('url') mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/ruv.py b/youtube_dl/extractor/ruv.py new file mode 100644 index 000000000..8f3cc4095 --- /dev/null +++ b/youtube_dl/extractor/ruv.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + unified_timestamp, +) + + +class RuvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P<id>[^/]+(?:/\d+)?)' + _TESTS = [{ + # m3u8 + 'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516', + 'md5': 
'66347652f4e13e71936817102acc1724', + 'info_dict': { + 'id': '1144499', + 'display_id': 'fh-valur/20170516', + 'ext': 'mp4', + 'title': 'FH - Valur', + 'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.', + 'timestamp': 1494963600, + 'upload_date': '20170516', + }, + }, { + # mp3 + 'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619', + 'md5': '395ea250c8a13e5fdb39d4670ef85378', + 'info_dict': { + 'id': '1153630', + 'display_id': 'morgunutvarpid/20170619', + 'ext': 'mp3', + 'title': 'Morgunútvarpið', + 'description': 'md5:a4cf1202c0a1645ca096b06525915418', + 'timestamp': 1497855000, + 'upload_date': '20170619', + }, + }, { + 'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614', + 'only_matching': True, + }, { + 'url': 'http://www.ruv.is/node/1151854', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun', + 'only_matching': True, + }, { + 'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + + FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1' + + media_url = self._html_search_regex( + FIELD_RE % 'src', webpage, 'video URL', group='url') + + video_id = self._search_regex( + r'<link\b[^>]+\bhref=["\']https?://www\.ruv\.is/node/(\d+)', + webpage, 'video id', default=display_id) + + ext = determine_ext(media_url) + + if ext == 'm3u8': + formats = self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + elif ext == 'mp3': + formats = [{ + 'format_id': 'mp3', + 'url': media_url, + 'vcodec': 'none', + }] + else: + formats = [{ + 'url': media_url, + }] + + description = self._og_search_description(webpage, default=None) + thumbnail = self._og_search_thumbnail( + webpage, default=None) or self._search_regex( 
+ FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False) + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage, 'timestamp', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index c3aec1edd..909a6ba97 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -16,7 +16,6 @@ from ..utils import ( class SafariBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' - _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' _NETRC_MACHINE = 'safari' _API_BASE = 'https://www.safaribooksonline.com/api/v1' @@ -28,10 +27,6 @@ class SafariBaseIE(InfoExtractor): self._login() def _login(self): - # We only need to log in once for courses or individual videos - if self.LOGGED_IN: - return - (username, password) = self._get_login_info() if username is None: return @@ -39,11 +34,17 @@ class SafariBaseIE(InfoExtractor): headers = std_headers.copy() if 'Referer' not in headers: headers['Referer'] = self._LOGIN_URL - login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers) login_page = self._download_webpage( - login_page_request, None, - 'Downloading login form') + self._LOGIN_URL, None, 'Downloading login form', headers=headers) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'href=["\']/accounts/logout/', r'>Sign Out<')) + + if is_logged(login_page): + self.LOGGED_IN = True + return csrf = self._html_search_regex( r"name='csrfmiddlewaretoken'\s+value='([^']+)'", @@ -62,14 +63,12 @@ class SafariBaseIE(InfoExtractor): login_page = self._download_webpage( request, None, 'Logging in as %s' % username) - if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + if not is_logged(login_page): raise 
ExtractorError( 'Login failed; make sure your credentials are correct and try again.', expected=True) - SafariBaseIE.LOGGED_IN = True - - self.to_screen('Login successful') + self.LOGGED_IN = True class SafariIE(SafariBaseIE): diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py index 5e22ea730..3df51520b 100644 --- a/youtube_dl/extractor/sexu.py +++ b/youtube_dl/extractor/sexu.py @@ -32,8 +32,9 @@ class SexuIE(InfoExtractor): formats = [{ 'url': source['file'].replace('\\', ''), 'format_id': source.get('label'), - 'height': self._search_regex( - r'^(\d+)[pP]', source.get('label', ''), 'height', default=None), + 'height': int(self._search_regex( + r'^(\d+)[pP]', source.get('label', ''), 'height', + default=None)), } for source in sources if source.get('file')] self._sort_formats(formats) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 74a1dc672..e89ebebe7 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -31,7 +31,7 @@ class SlideshareIE(InfoExtractor): page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( - r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', + r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);', webpage, 'slideshare object') info = json.loads(slideshare_obj) if info['slideshow']['type'] != 'video': diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 7da12cef8..a62ed84f1 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,7 +8,11 @@ from ..compat import ( compat_str, compat_urllib_parse_urlencode, ) -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) class SohuIE(InfoExtractor): @@ -169,10 +173,11 @@ class SohuIE(InfoExtractor): formats.append({ 'url': video_url, 'format_id': format_id, - 'filesize': data['clipsBytes'][i], - 'width': data['width'], - 'height': 
data['height'], - 'fps': data['fps'], + 'filesize': int_or_none( + try_get(data, lambda x: x['clipsBytes'][i])), + 'width': int_or_none(data.get('width')), + 'height': int_or_none(data.get('height')), + 'fps': int_or_none(data.get('fps')), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 0ee4a8ff8..1c6799d57 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,8 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re from .common import ( InfoExtractor, @@ -17,6 +17,7 @@ from ..utils import ( ExtractorError, int_or_none, unified_strdate, + update_url_query, ) @@ -31,6 +32,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ + (?!stations/track) (?P<uploader>[\w\d-]+)/ (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? @@ -119,9 +121,24 @@ class SoundcloudIE(InfoExtractor): 'license': 'cc-by-sa', }, }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. 
Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'upload_date': '20170831', + 'duration': 7449, + 'license': 'all-rights-reserved', + }, + }, ] - _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z' + _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod @@ -136,7 +153,7 @@ class SoundcloudIE(InfoExtractor): @classmethod def _resolv_url(cls, url): - return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID + return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None): track_id = compat_str(info['id']) @@ -159,11 +176,13 @@ class SoundcloudIE(InfoExtractor): 'license': info.get('license'), } formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token is not None: + query['secret_token'] = secret_token if info.get('downloadable', False): # We can build a direct link to the song - format_url = ( - 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( - track_id, self._CLIENT_ID)) + format_url = update_url_query( + 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), @@ -174,11 +193,8 @@ class SoundcloudIE(InfoExtractor): # We have to retrieve the url format_dict = self._download_json( - 'http://api.soundcloud.com/i1/tracks/%s/streams' % track_id, - track_id, 'Downloading track url', query={ - 'client_id': self._CLIENT_ID, - 'secret_token': secret_token, - }) + 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, + track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): abr = int_or_none(self._search_regex( @@ -215,7 +231,7 @@ class SoundcloudIE(InfoExtractor): # cannot be always used, sometimes it can give 
an HTTP 404 error formats.append({ 'format_id': 'fallback', - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'url': update_url_query(info['stream_url'], query), 'ext': ext, }) @@ -236,7 +252,7 @@ class SoundcloudIE(InfoExtractor): track_id = mobj.group('track_id') if track_id is not None: - info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID + info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id token = mobj.group('secret_token') if token: @@ -261,7 +277,7 @@ class SoundcloudIE(InfoExtractor): self.report_resolve(full_title) - url = 'http://soundcloud.com/%s' % resolve_title + url = 'https://soundcloud.com/%s' % resolve_title info_json_url = self._resolv_url(url) info = self._download_json(info_json_url, full_title, 'Downloading info JSON') @@ -290,7 +306,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): 'id': '2284613', 'title': 'The Royal Concept EP', }, - 'playlist_mincount': 6, + 'playlist_mincount': 5, }, { 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', 'only_matching': True, @@ -304,7 +320,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): # extract simple title (uploader + slug of song title) slug_title = mobj.group('slug_title') full_title = '%s/sets/%s' % (uploader, slug_title) - url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) + url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title) token = mobj.group('token') if token: @@ -330,7 +346,63 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): } -class SoundcloudUserIE(SoundcloudPlaylistBaseIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): + _API_BASE = 'https://api.soundcloud.com' + _API_V2_BASE = 'https://api-v2.soundcloud.com' + + def _extract_playlist(self, base_url, playlist_id, playlist_title): + COMMON_QUERY = { + 'limit': 50, + 'client_id': self._CLIENT_ID, + 
'linked_partitioning': '1', + } + + query = COMMON_QUERY.copy() + query['offset'] = 0 + + next_href = base_url + '?' + compat_urllib_parse_urlencode(query) + + entries = [] + for i in itertools.count(): + response = self._download_json( + next_href, playlist_id, 'Downloading track page %s' % (i + 1)) + + collection = response['collection'] + if not collection: + break + + def resolve_permalink_url(candidates): + for cand in candidates: + if isinstance(cand, dict): + permalink_url = cand.get('permalink_url') + entry_id = self._extract_id(cand) + if permalink_url and permalink_url.startswith('http'): + return permalink_url, entry_id + + for e in collection: + permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) + if permalink_url: + entries.append(self.url_result(permalink_url, video_id=entry_id)) + + next_href = response.get('next_href') + if not next_href: + break + + parsed_next_href = compat_urlparse.urlparse(response['next_href']) + qs = compat_urlparse.parse_qs(parsed_next_href.query) + qs.update(COMMON_QUERY) + next_href = compat_urlparse.urlunparse( + parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'entries': entries, + } + + +class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): _VALID_URL = r'''(?x) https?:// (?:(?:www|m)\.)?soundcloud\.com/ @@ -380,21 +452,18 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE): 'url': 'https://soundcloud.com/grynpyret/spotlight', 'info_dict': { 'id': '7098329', - 'title': 'GRYNPYRET (Spotlight)', + 'title': 'Grynpyret (Spotlight)', }, 'playlist_mincount': 1, }] - _API_BASE = 'https://api.soundcloud.com' - _API_V2_BASE = 'https://api-v2.soundcloud.com' - _BASE_URL_MAP = { - 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % _API_BASE, - 'sets': '%s/users/%%s/playlists' % _API_V2_BASE, - 'reposts': 
'%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE, - 'likes': '%s/users/%%s/likes' % _API_V2_BASE, - 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE, + 'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE, + 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, } _TITLE_MAP = { @@ -410,70 +479,49 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE): mobj = re.match(self._VALID_URL, url) uploader = mobj.group('user') - url = 'http://soundcloud.com/%s/' % uploader + url = 'https://soundcloud.com/%s/' % uploader resolv_url = self._resolv_url(url) user = self._download_json( resolv_url, uploader, 'Downloading user info') resource = mobj.group('rsrc') or 'all' - base_url = self._BASE_URL_MAP[resource] % user['id'] - COMMON_QUERY = { - 'limit': 50, - 'client_id': self._CLIENT_ID, - 'linked_partitioning': '1', - } + return self._extract_playlist( + self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']), + '%s (%s)' % (user['username'], self._TITLE_MAP[resource])) - query = COMMON_QUERY.copy() - query['offset'] = 0 - - next_href = base_url + '?' 
+ compat_urllib_parse_urlencode(query) - entries = [] - for i in itertools.count(): - response = self._download_json( - next_href, uploader, 'Downloading track page %s' % (i + 1)) - - collection = response['collection'] - if not collection: - break - - def resolve_permalink_url(candidates): - for cand in candidates: - if isinstance(cand, dict): - permalink_url = cand.get('permalink_url') - entry_id = self._extract_id(cand) - if permalink_url and permalink_url.startswith('http'): - return permalink_url, entry_id +class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' + IE_NAME = 'soundcloud:trackstation' + _TESTS = [{ + 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', + 'info_dict': { + 'id': '286017854', + 'title': 'Track station: your-text', + }, + 'playlist_mincount': 47, + }] - for e in collection: - permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) - if permalink_url: - entries.append(self.url_result(permalink_url, video_id=entry_id)) + def _real_extract(self, url): + track_name = self._match_id(url) - next_href = response.get('next_href') - if not next_href: - break + webpage = self._download_webpage(url, track_name) - parsed_next_href = compat_urlparse.urlparse(response['next_href']) - qs = compat_urlparse.parse_qs(parsed_next_href.query) - qs.update(COMMON_QUERY) - next_href = compat_urlparse.urlunparse( - parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) + track_id = self._search_regex( + r'soundcloud:track-stations:(\d+)', webpage, 'track id') - return { - '_type': 'playlist', - 'id': compat_str(user['id']), - 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]), - 'entries': entries, - } + return self._extract_playlist( + '%s/stations/soundcloud:track-stations:%s/tracks' + % (self._API_V2_BASE, track_id), + track_id, 'Track station: %s' % track_name) 
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' _TESTS = [{ - 'url': 'http://api.soundcloud.com/playlists/4110309', + 'url': 'https://api.soundcloud.com/playlists/4110309', 'info_dict': { 'id': '4110309', 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index ec1b60388..84298fee4 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .nexx import NexxEmbedIE from .spiegeltv import SpiegeltvIE from ..compat import compat_urlparse from ..utils import ( @@ -121,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor): }, 'playlist_count': 6, + }, { + # Nexx iFrame embed + 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', + 'info_dict': { + 'id': '161464', + 'ext': 'mp4', + 'title': 'Nervenkitzel Achterbahn', + 'alt_title': 'Karussellbauer in Deutschland', + 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', + 'release_year': 2005, + 'creator': 'SPIEGEL TV', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2761, + 'timestamp': 1394021479, + 'upload_date': '20140305', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -143,6 +164,9 @@ class SpiegelArticleIE(InfoExtractor): entries = [ self.url_result(compat_urlparse.urljoin( self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds - ] - return self.playlist_result(entries) + for embed_path in embeds] + if embeds: + return self.playlist_result(entries) + + return self.playlist_from_matches( + NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) diff --git 
a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index e1cfb8698..6ccf4c342 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -1,114 +1,17 @@ -# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse -from ..utils import ( - determine_ext, - float_or_none, -) +from .nexx import NexxIE class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/(?:#/)?filme/(?P<id>[\-a-z0-9]+)' - _TESTS = [{ - 'url': 'http://www.spiegel.tv/filme/flug-mh370/', - 'info_dict': { - 'id': 'flug-mh370', - 'ext': 'm4v', - 'title': 'Flug MH370', - 'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines', - 'thumbnail': r're:http://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/', + _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', 'only_matching': True, - }] + } def _real_extract(self, url): - if '/#/' in url: - url = url.replace('/#/', '/') - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title') - - apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com' - version_json = self._download_json( - '%s/version.json' % apihost, video_id, - note='Downloading version information') - version_name = version_json['version_name'] - - slug_json = self._download_json( - '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id), - video_id, - note='Downloading object information') - oid = slug_json['object_id'] - - media_json = self._download_json( - '%s/%s/restapi/media/%s.json' % (apihost, version_name, oid), - video_id, note='Downloading media information') - uuid = media_json['uuid'] - is_wide = 
media_json['is_wide'] - - server_json = self._download_json( - 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json', - video_id, note='Downloading server information') - - format = '16x9' if is_wide else '4x3' - - formats = [] - for streamingserver in server_json['streamingserver']: - endpoint = streamingserver.get('endpoint') - if not endpoint: - continue - play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format) - if endpoint.startswith('rtmp'): - formats.append({ - 'url': endpoint, - 'format_id': 'rtmp', - 'app': compat_urllib_parse_urlparse(endpoint).path[1:], - 'play_path': play_path, - 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf', - 'ext': 'flv', - 'rtmp_live': True, - }) - elif determine_ext(endpoint) == 'm3u8': - formats.append({ - 'url': endpoint.replace('[video]', play_path), - 'ext': 'm4v', - 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction - 'protocol': 'm3u8', - 'preference': 1, - 'http_headers': { - 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side - }, - }) - else: - formats.append({ - 'url': endpoint, - }) - self._check_formats(formats, video_id) - - thumbnails = [] - for image in media_json['images']: - thumbnails.append({ - 'url': image['url'], - 'width': image['width'], - 'height': image['height'], - }) - - description = media_json['subtitle'] - duration = float_or_none(media_json.get('duration_in_ms'), scale=1000) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - } + return self.url_result( + 'https://api.nexx.cloud/v3/748/videos/byid/%s' + % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index e7bd5bf91..54497c880 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from 
.common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, +) class SportBoxEmbedIE(InfoExtractor): @@ -14,8 +18,10 @@ class SportBoxEmbedIE(InfoExtractor): 'info_dict': { 'id': '211355', 'ext': 'mp4', - 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'title': '211355', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 292, + 'view_count': int, }, 'params': { # m3u8 download @@ -24,6 +30,9 @@ class SportBoxEmbedIE(InfoExtractor): }, { 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', 'only_matching': True, + }, { + 'url': 'https://news.sportbox.ru/vdl/player/media/193095', + 'only_matching': True, }] @staticmethod @@ -37,36 +46,34 @@ class SportBoxEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - formats = [] - - def cleanup_js(code): - # desktop_advert_config contains complex Javascripts and we don't need it - return js_to_json(re.sub(r'desktop_advert_config.*', '', code)) - - jwplayer_data = self._parse_json(self._search_regex( - r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id, - transform_source=cleanup_js) - - hls_url = jwplayer_data.get('hls_url') - if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, ext='mp4', m3u8_id='hls')) - - rtsp_url = jwplayer_data.get('rtsp_url') - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', - }) + wjplayer_data = self._parse_json( + self._search_regex( + r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'), + video_id, transform_source=js_to_json) + formats = [] + for source in wjplayer_data['sources']: + src = source.get('src') + if not src: + continue + if determine_ext(src) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + }) 
self._sort_formats(formats) - title = jwplayer_data['node_title'] - thumbnail = jwplayer_data.get('image_url') + view_count = int_or_none(self._search_regex( + r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None)) return { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, + 'title': video_id, + 'thumbnail': wjplayer_data.get('poster'), + 'duration': int_or_none(wjplayer_data.get('duration')), + 'view_count': view_count, 'formats': formats, } diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index aa4fad162..a9e34c027 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -22,6 +22,17 @@ class StreamangoIE(InfoExtractor): 'title': '20170315_150006.mp4', } }, { + # no og:title + 'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4', + 'info_dict': { + 'id': 'foqebrpftarclpob', + 'ext': 'mp4', + 'title': 'foqebrpftarclpob', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', 'only_matching': True, }] @@ -31,7 +42,7 @@ class StreamangoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) + title = self._og_search_title(webpage, default=video_id) formats = [] for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 9e533103c..58e0b4c80 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -26,7 +26,7 @@ class StreamCZIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '6d3ca61a8d0633c9c542b92fcb936b0c', + 'md5': '934bb6a6d220d99c010783c9719960d5', 'info_dict': { 'id': '765767', 'ext': 'mp4', @@ -37,7 +37,7 @@ class StreamCZIE(InfoExtractor): }, }, { 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': 
'e54a254fb8b871968fd8403255f28589', + 'md5': '849a88c1e1ca47d41403c2ba5e59e261', 'info_dict': { 'id': '10002447', 'ext': 'mp4', @@ -85,6 +85,14 @@ class StreamCZIE(InfoExtractor): else: title = data['name'] + subtitles = {} + srt_url = data.get('subtitles_srt') + if srt_url: + subtitles['cs'] = [{ + 'ext': 'srt', + 'url': srt_url, + }] + return { 'id': video_id, 'title': title, @@ -93,4 +101,5 @@ class StreamCZIE(InfoExtractor): 'description': data.get('web_site_text'), 'duration': int_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 1b5afb73e..48bc4529e 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -181,7 +181,8 @@ class SVTPlayIE(SVTBaseIE): if video_id: data = self._download_json( - 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id) + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) info_dict = self._extract_video(data, video_id) if not info_dict.get('title'): info_dict['title'] = re.sub( diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py new file mode 100644 index 000000000..7fe96bd5f --- /dev/null +++ b/youtube_dl/extractor/tastytrade.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TastyTradeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', + 'info_dict': { + 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', + 'ext': 'mp4', + 'title': 'A History of Teaming', + 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', + 'duration': 422.255, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { 
+ 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + ooyala_code = self._search_regex( + r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', + webpage, 'ooyala code', group='code') + + info = self._search_json_ld(webpage, display_id, fatal=False) + info.update({ + '_type': 'url_transparent', + 'ie_key': OoyalaIE.ie_key(), + 'url': 'ooyala:%s' % ooyala_code, + 'display_id': display_id, + }) + return info diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index bf93eb868..e9474533f 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -8,6 +8,9 @@ from ..utils import extract_attributes class TBSIE(TurnerBaseIE): + # https://github.com/rg3/youtube-dl/issues/13658 + _WORKING = False + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html' _TESTS = [{ 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', @@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE): 'ext': 'mp4', 'title': 'Theatrical Trailer', 'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', - } + }, + 'skip': 'TBS videos are deleted after a while', }, { 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', @@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE): 'ext': 'mp4', 'title': 'You Better Run', 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. 
Good Behavior premieres November 15 at 9/8c.', - } + }, + 'skip': 'TBS videos are deleted after a while', }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/teamfourstar.py b/youtube_dl/extractor/teamfourstar.py deleted file mode 100644 index a8c6ed7be..000000000 --- a/youtube_dl/extractor/teamfourstar.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from ..utils import unified_strdate - - -class TeamFourStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P<id>[a-z0-9\-]+)' - _TEST = { - 'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/', - 'info_dict': { - 'id': '0WdZO31W', - 'title': 'TFS Abridged Parody Episode 1', - 'description': 'md5:d60bc389588ebab2ee7ad432bda953ae', - 'ext': 'mp4', - 'timestamp': 1394168400, - 'upload_date': '20080508', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - jwplatform_url = JWPlatformIE._extract_url(webpage) - - video_title = self._html_search_regex( - r'<h1[^>]+class="entry-title"[^>]*>(?P<title>.+?)</h1>', - webpage, 'title') - video_date = unified_strdate(self._html_search_regex( - r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>', - webpage, 'date', fatal=False)) - video_description = self._html_search_regex( - r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>', - webpage, 'description', fatal=False) - video_thumbnail = self._og_search_thumbnail(webpage) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - 'url': jwplatform_url, - } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 3f3c681ae..06a27fd04 100644 --- a/youtube_dl/extractor/ted.py +++ 
b/youtube_dl/extractor/ted.py @@ -6,7 +6,10 @@ import re from .common import InfoExtractor from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + try_get, +) class TEDIE(InfoExtractor): @@ -113,8 +116,9 @@ class TEDIE(InfoExtractor): } def _extract_info(self, webpage): - info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', - webpage, 'info json') + info_json = self._search_regex( + r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>', + webpage, 'info json') return json.loads(info_json) def _real_extract(self, url): @@ -136,11 +140,16 @@ class TEDIE(InfoExtractor): webpage = self._download_webpage(url, name, 'Downloading playlist webpage') info = self._extract_info(webpage) - playlist_info = info['playlist'] + + playlist_info = try_get( + info, lambda x: x['__INITIAL_DATA__']['playlist'], + dict) or info['playlist'] playlist_entries = [ self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key()) - for talk in info['talks'] + for talk in try_get( + info, lambda x: x['__INITIAL_DATA__']['talks'], + dict) or info['talks'] ] return self.playlist_result( playlist_entries, @@ -149,9 +158,14 @@ class TEDIE(InfoExtractor): def _talk_info(self, url, video_name): webpage = self._download_webpage(url, video_name) - self.report_extraction(video_name) - talk_info = self._extract_info(webpage)['talks'][0] + info = self._extract_info(webpage) + + talk_info = try_get( + info, lambda x: x['__INITIAL_DATA__']['talks'][0], + dict) or info['talks'][0] + + title = talk_info['title'].strip() external = talk_info.get('external') if external: @@ -165,19 +179,27 @@ class TEDIE(InfoExtractor): 'url': ext_url or external['uri'], } + native_downloads = try_get( + talk_info, lambda x: x['downloads']['nativeDownloads'], + dict) or talk_info['nativeDownloads'] + formats = [{ 'url': format_url, 'format_id': format_id, 'format': format_id, - } for (format_id, format_url) in talk_info['nativeDownloads'].items() if 
format_url is not None] + } for (format_id, format_url) in native_downloads.items() if format_url is not None] if formats: for f in formats: finfo = self._NATIVE_FORMATS.get(f['format_id']) if finfo: f.update(finfo) + player_talk = talk_info['player_talks'][0] + + resources_ = player_talk.get('resources') or talk_info.get('resources') + http_url = None - for format_id, resources in talk_info['resources'].items(): + for format_id, resources in resources_.items(): if format_id == 'h264': for resource in resources: h264_url = resource.get('file') @@ -237,14 +259,11 @@ class TEDIE(InfoExtractor): video_id = compat_str(talk_info['id']) - thumbnail = talk_info['thumb'] - if not thumbnail.startswith('http'): - thumbnail = 'http://' + thumbnail return { 'id': video_id, - 'title': talk_info['title'].strip(), - 'uploader': talk_info['speaker'], - 'thumbnail': thumbnail, + 'title': title, + 'uploader': player_talk.get('speaker') or talk_info.get('speaker'), + 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'), 'description': self._og_search_description(webpage), 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, @@ -252,20 +271,22 @@ class TEDIE(InfoExtractor): } def _get_subtitles(self, video_id, talk_info): - languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] - if languages: - sub_lang_list = {} - for l in languages: - sub_lang_list[l] = [ - { - 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), - 'ext': ext, - } - for ext in ['ted', 'srt'] - ] - return sub_lang_list - else: - return {} + sub_lang_list = {} + for language in try_get( + talk_info, + (lambda x: x['downloads']['languages'], + lambda x: x['languages']), list): + lang_code = language.get('languageCode') or language.get('ianaCode') + if not lang_code: + continue + sub_lang_list[lang_code] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext), + 'ext': ext, + } 
+ for ext in ['ted', 'srt'] + ] + return sub_lang_list def _watch_info(self, url, name): webpage = self._download_webpage(url, name) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a424b1c6..de236bbba 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE): 'url': src, }) + duration = info.get('duration') + tp_chapters = info.get('chapters', []) + chapters = [] + if tp_chapters: + def _add_chapter(start_time, end_time): + start_time = float_or_none(start_time, 1000) + end_time = float_or_none(end_time, 1000) + if start_time is None or end_time is None: + return + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + }) + + for chapter in tp_chapters[:-1]: + _add_chapter(chapter.get('startTime'), chapter.get('endTime')) + _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + return { 'title': info['title'], 'subtitles': subtitles, 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], - 'duration': int_or_none(info.get('duration'), 1000), + 'duration': float_or_none(duration, 1000), 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'uploader': info.get('billingCode'), + 'chapters': chapters, } def _extract_theplatform_metadata(self, path, video_id): diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py index b8504f0eb..cd642355c 100644 --- a/youtube_dl/extractor/thescene.py +++ b/youtube_dl/extractor/thescene.py @@ -3,10 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - qualities, -) class TheSceneIE(InfoExtractor): @@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor): 'season': 'Ready To Wear Spring 2013', 'tags': list, 'categories': list, + 'upload_date': '20120913', + 'timestamp': 1347512400, + 'uploader': 
'vogue', }, } @@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor): self._html_search_regex( r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) - player = self._download_webpage(player_url, display_id) - info = self._parse_json( - self._search_regex( - r'(?m)video\s*:\s*({.+?}),$', player, 'info json'), - display_id) - - video_id = info['id'] - title = info['title'] - - qualities_order = qualities(('low', 'high')) - formats = [{ - 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), - 'url': f['src'], - 'quality': qualities_order(f['quality']), - } for f in info['sources']] - self._sort_formats(formats) - return { - 'id': video_id, + '_type': 'url_transparent', 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('poster_frame'), - 'duration': int_or_none(info.get('duration')), - 'series': info.get('series_title'), - 'season': info.get('season_title'), - 'tags': info.get('tags'), - 'categories': info.get('categories'), + 'url': player_url, + 'ie_key': 'CondeNast', } diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py index 197258df1..6ab147ad7 100644 --- a/youtube_dl/extractor/thisoldhouse.py +++ b/youtube_dl/extractor/thisoldhouse.py @@ -2,13 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str +from ..utils import try_get class ThisOldHouseIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench', - 'md5': '946f05bbaa12a33f9ae35580d2dfcfe3', + 'md5': '568acf9ca25a639f0c4ff905826b662f', 'info_dict': { 'id': '2REGtUDQ', 'ext': 'mp4', @@ -28,8 +30,15 @@ class ThisOldHouseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json(self._search_regex( - 
r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - video_id = drupal_settings['jwplatform']['video_id'] + video_id = self._search_regex( + (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', + r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'), + webpage, 'video id', default=None, group='id') + if not video_id: + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + video_id = try_get( + drupal_settings, lambda x: x['jwplatform']['video_id'], + compat_str) or list(drupal_settings['comScore'])[0] return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id) diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index c54b876d3..348d6ecdf 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -17,7 +17,7 @@ from ..utils import ( class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -73,6 +73,12 @@ class ToggleIE(InfoExtractor): }, { 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', 'only_matching': True, + }, { + 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', + 'only_matching': True, }] _FORMAT_PREFERENCES = { diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 26d770992..e59ed2661 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( int_or_none, js_to_json, - 
ExtractorError, urlencode_postdata, extract_attributes, smuggle_url, @@ -78,8 +77,10 @@ class TouTvIE(InfoExtractor): def _real_extract(self, url): path = self._match_id(url) metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) + # IsDrm does not necessarily mean the video is DRM protected (see + # https://github.com/rg3/youtube-dl/issues/13994). if metadata.get('IsDrm'): - raise ExtractorError('This video is DRM protected.', expected=True) + self.report_warning('This video is probably DRM protected.', path) video_id = metadata['IdMedia'] details = metadata['Details'] title = details['OriginalTitle'] diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 938e05076..f705a06c9 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -6,42 +6,48 @@ import re class ToypicsIE(InfoExtractor): - IE_DESC = 'Toypics user profile' - _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' + IE_DESC = 'Toypics video' + _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)' _TEST = { 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/', 'md5': '16e806ad6d6f58079d210fe30985e08b', 'info_dict': { 'id': '514', 'ext': 'mp4', - 'title': 'Chance-Bulge\'d, 2', + 'title': "Chance-Bulge'd, 2", 'age_limit': 18, 'uploader': 'kidsune', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - page = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL') - title = self._html_search_regex( - r'<title>Toypics - ([^<]+)</title>', page, 'title') - username = self._html_search_regex( - r'toypics.net/([^/"]+)" class="user-name">', page, 'username') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = self._parse_html5_media_entries( + url, webpage, video_id)[0]['formats'] + title = 
self._html_search_regex([ + r'<h1[^>]+class=["\']view-video-title[^>]+>([^<]+)</h', + r'<title>([^<]+) - Toypics</title>', + ], webpage, 'title') + + uploader = self._html_search_regex( + r'More videos from <strong>([^<]+)</strong>', webpage, 'uploader', + fatal=False) + return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': title, - 'uploader': username, + 'uploader': uploader, 'age_limit': 18, } class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P<id>[^/?#&]+)' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { @@ -51,8 +57,7 @@ class ToypicsUserIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group('username') + username = self._match_id(url) profile_page = self._download_webpage( url, username, note='Retrieving profile page') @@ -71,7 +76,7 @@ class ToypicsUserIE(InfoExtractor): note='Downloading page %d/%d' % (n, page_count)) urls.extend( re.findall( - r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">', + r'<div[^>]+class=["\']preview[^>]+>\s*<a[^>]+href="(https?://videos\.toypics\.net/view/[^"]+)"', lpage)) return { diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 2aae55e7e..7421378a8 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -3,138 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - InAdvancePagedList, - float_or_none, - unescapeHTML, -) - - -class TudouIE(InfoExtractor): - IE_NAME = 'tudou' - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})' - _TESTS = [{ - 'url': 
'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', - 'md5': '140a49ed444bd22f93330985d8475fcb', - 'info_dict': { - 'id': '159448201', - 'ext': 'f4v', - 'title': '卡马乔国足开大脚长传冲吊集锦', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1372113489000, - 'description': '卡马乔卡家军,开大脚先进战术不完全集锦!', - 'duration': 289.04, - 'view_count': int, - 'filesize': int, - } - }, { - 'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/', - 'info_dict': { - 'id': '117049447', - 'ext': 'f4v', - 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1349207518000, - 'description': 'md5:294612423894260f2dcd5c6c04fe248b', - 'duration': 5478.33, - 'view_count': int, - 'filesize': int, - } - }] - - _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - - # Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf - # 0001, 0002 and 4001 are not included as they indicate temporary issues - TVC_ERRORS = { - '0003': 'The video is deleted or does not exist', - '1001': 'This video is unavailable due to licensing issues', - '1002': 'This video is unavailable as it\'s under review', - '1003': 'This video is unavailable as it\'s under review', - '3001': 'Password required', - '5001': 'This video is available in Mainland China only due to licensing issues', - '7001': 'This video is unavailable', - '8001': 'This video is unavailable due to licensing issues', - } - - def _url_for_id(self, video_id, quality=None): - info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) - if quality: - info_url += '&hd' + quality - xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page') - error = xml_data.attrib.get('error') - if error is not None: - raise ExtractorError('Tudou said: %s' % error, expected=True) - final_url = xml_data.text - return final_url - - def _real_extract(self, url): - video_id = self._match_id(url) - item_data = self._download_json( - 
'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id) - - youku_vcode = item_data.get('vcode') - if youku_vcode: - return self.url_result('youku:' + youku_vcode, ie='Youku') - - if not item_data.get('itemSegs'): - tvc_code = item_data.get('tvcCode') - if tvc_code: - err_msg = self.TVC_ERRORS.get(tvc_code) - if err_msg: - raise ExtractorError('Tudou said: %s' % err_msg, expected=True) - raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code) - raise ExtractorError('Unxpected error returned from Tudou') - - title = unescapeHTML(item_data['kw']) - description = item_data.get('desc') - thumbnail_url = item_data.get('pic') - view_count = int_or_none(item_data.get('playTimes')) - timestamp = int_or_none(item_data.get('pt')) - - segments = self._parse_json(item_data['itemSegs'], video_id) - # It looks like the keys are the arguments that have to be passed as - # the hd field in the request url, we pick the higher - # Also, filter non-number qualities (see issue #3643). 
- quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), - key=lambda k: int(k))[-1] - parts = segments[quality] - len_parts = len(parts) - if len_parts > 1: - self.to_screen('%s: found %s parts' % (video_id, len_parts)) - - def part_func(partnum): - part = parts[partnum] - part_id = part['k'] - final_url = self._url_for_id(part_id, quality) - ext = (final_url.split('?')[0]).split('.')[-1] - return [{ - 'id': '%s' % part_id, - 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, - 'view_count': view_count, - 'timestamp': timestamp, - 'duration': float_or_none(part.get('seconds'), 1000), - 'filesize': int_or_none(part.get('size')), - 'http_headers': { - 'Referer': self._PLAYER_URL, - }, - }] - - entries = InAdvancePagedList(part_func, len_parts, 1) - - return { - '_type': 'multi_video', - 'entries': entries, - 'id': video_id, - 'title': title, - } class TudouPlaylistIE(InfoExtractor): diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py index 25aa9c58e..be3eaa5c2 100644 --- a/youtube_dl/extractor/turbo.py +++ b/youtube_dl/extractor/turbo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -49,7 +50,7 @@ class TurboIE(InfoExtractor): for child in item: m = re.search(r'url_video_(?P<quality>.+)', child.tag) if m: - quality = m.group('quality') + quality = compat_str(m.group('quality')) formats.append({ 'format_id': quality, 'url': child.text, diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 1c0be9fc6..efeb677ee 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -13,6 +13,7 @@ from ..utils import ( xpath_attr, update_url_query, ExtractorError, + strip_or_none, ) @@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE): 'height': int_or_none(image.get('height')), } for image in 
video_data.findall('images/image')] + is_live = xpath_text(video_data, 'isLive') == 'true' + return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'description': xpath_text(video_data, 'description'), + 'thumbnail': xpath_text(video_data, 'poster'), + 'description': strip_or_none(xpath_text(video_data, 'description')), 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), 'timestamp': self._extract_timestamp(video_data), 'upload_date': xpath_attr(video_data, 'metas', 'version'), 'series': xpath_text(video_data, 'showTitle'), 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'is_live': is_live, } diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 7aeb2c620..cfcce020a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -18,7 +18,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^\?]+)\?video_id=| + (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -63,6 +63,10 @@ class TV4IE(InfoExtractor): 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', 'only_matching': True, }, + { + 'url': 'http://www.tv4play.se/program/farang/3922081', + 'only_matching': True, + } ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py index ebde6053f..8f8686a65 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/youtube_dl/extractor/tvplayer.py @@ -48,7 +48,7 @@ class TVPlayerIE(InfoExtractor): 'https://tvplayer.com/watch/context', display_id, 'Downloading JSON context', query={ 'resource': resource_id, - 'nonce': token, + 'gen': token, }) validate = context['validate'] diff --git a/youtube_dl/extractor/twentymin.py 
b/youtube_dl/extractor/twentymin.py index 4fd1aa4bf..a42977f39 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -50,7 +50,7 @@ class TwentyMinutenIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', + r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', webpage)] def _real_extract(self, url): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 2daf9dfac..c926c99a9 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -28,7 +28,7 @@ from ..utils import ( class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' + _VALID_URL_BASE = r'https?://(?:(?:www|go)\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' @@ -217,7 +217,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go)\.)?twitch\.tv/(?:[^/]+/v|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -458,7 +458,7 @@ class TwitchStreamIE(TwitchBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/| + (?:(?:www|go)\.)?twitch\.tv/| player\.twitch\.tv/\?.*?\bchannel= ) (?P<id>[^/#?]+) @@ -489,6 +489,9 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'https://player.twitch.tv/?channel=lotsofs', 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/food', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 37e3bc412..6eaf360a6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -7,20 +7,38 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( 
determine_ext, + dict_get, + ExtractorError, float_or_none, - xpath_text, - remove_end, int_or_none, - ExtractorError, + remove_end, + try_get, + xpath_text, ) from .periscope import PeriscopeIE class TwitterBaseIE(InfoExtractor): - def _get_vmap_video_url(self, vmap_url, video_id): + def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) - return xpath_text(vmap_data, './/MediaFile').strip() + video_url = xpath_text(vmap_data, './/MediaFile').strip() + if determine_ext(video_url) == 'm3u8': + return self._extract_m3u8_formats( + video_url, video_id, ext='mp4', m3u8_id='hls', + entry_protocol='m3u8_native') + return [{ + 'url': video_url, + }] + + @staticmethod + def _search_dimensions_in_video_url(a_format, video_url): + m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) + if m: + a_format.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) class TwitterCardIE(TwitterBaseIE): @@ -36,7 +54,8 @@ class TwitterCardIE(TwitterBaseIE): 'title': 'Twitter Card', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 30.033, - } + }, + 'skip': 'Video gone', }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', @@ -48,6 +67,7 @@ class TwitterCardIE(TwitterBaseIE): 'thumbnail': r're:^https?://.*\.jpg', 'duration': 80.155, }, + 'skip': 'Video gone', }, { 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', @@ -65,7 +85,7 @@ class TwitterCardIE(TwitterBaseIE): }, { 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', - 'md5': 'ab2745d0b0ce53319a534fccaa986439', + 'md5': '6dabeaca9e68cbb71c99c322a4b42a11', 'info_dict': { 'id': 'iBb2x00UVlv', 'ext': 'mp4', @@ -73,16 +93,17 @@ class TwitterCardIE(TwitterBaseIE): 'uploader_id': '1189339351084113920', 'uploader': 'ArsenalTerje', 'title': 'Vine by ArsenalTerje', + 'timestamp': 1447451307, }, 'add_ie': ['Vine'], }, { 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', - 'md5': 
'3846d0a07109b5ab622425449b59049d', + 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88', 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*', }, }, { 'url': 'https://twitter.com/i/videos/752274308186120192', @@ -90,6 +111,59 @@ class TwitterCardIE(TwitterBaseIE): }, ] + def _parse_media_info(self, media_info, video_id): + formats = [] + for media_variant in media_info.get('variants', []): + media_url = media_variant['url'] + if media_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) + elif media_url.endswith('.mpd'): + formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) + else: + vbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) + a_format = { + 'url': media_url, + 'format_id': 'http-%d' % vbr if vbr else 'http', + 'vbr': vbr, + } + # Reported bitRate may be zero + if not a_format['vbr']: + del a_format['vbr'] + + self._search_dimensions_in_video_url(a_format, media_url) + + formats.append(a_format) + return formats + + def _extract_mobile_formats(self, username, video_id): + webpage = self._download_webpage( + 'https://mobile.twitter.com/%s/status/%s' % (username, video_id), + video_id, 'Downloading mobile webpage', + headers={ + # A recent mobile UA is necessary for `gt` cookie + 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0', + }) + main_script_url = self._html_search_regex( + r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL') + main_script = self._download_webpage( + main_script_url, video_id, 'Downloading main script') + bearer_token = self._search_regex( + r'BEARER_TOKEN\s*:\s*"([^"]+)"', + main_script, 'bearer token') + guest_token = self._search_regex( + r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)', + webpage, 'guest token') + api_data = self._download_json( + 
'https://api.twitter.com/2/timeline/conversation/%s.json' % video_id, + video_id, 'Downloading mobile API data', + headers={ + 'Authorization': 'Bearer ' + bearer_token, + 'x-guest-token': guest_token, + }) + media_info = try_get(api_data, lambda o: o['globalObjects']['tweets'][video_id] + ['extended_entities']['media'][0]['video_info']) or {} + return self._parse_media_info(media_info, video_id) + def _real_extract(self, url): video_id = self._match_id(url) @@ -117,14 +191,6 @@ class TwitterCardIE(TwitterBaseIE): if periscope_url: return self.url_result(periscope_url, PeriscopeIE.ie_key()) - def _search_dimensions_in_video_url(a_format, video_url): - m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) - if m: - a_format.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') if video_url: @@ -135,15 +201,14 @@ class TwitterCardIE(TwitterBaseIE): 'url': video_url, } - _search_dimensions_in_video_url(f, video_url) + self._search_dimensions_in_video_url(f, video_url) formats.append(f) vmap_url = config.get('vmapUrl') or config.get('vmap_url') if vmap_url: - formats.append({ - 'url': self._get_vmap_video_url(vmap_url, video_id), - }) + formats.extend( + self._extract_formats_from_vmap_url(vmap_url, video_id)) media_info = None @@ -152,29 +217,14 @@ class TwitterCardIE(TwitterBaseIE): media_info = entity['mediaInfo'] if media_info: - for media_variant in media_info['variants']: - media_url = media_variant['url'] - if media_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) - elif media_url.endswith('.mpd'): - formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) - else: - vbr = int_or_none(media_variant.get('bitRate'), scale=1000) - a_format = { - 'url': media_url, - 'format_id': 'http-%d' % vbr if vbr else 'http', - 'vbr': vbr, - } - # Reported bitRate may be 
zero - if not a_format['vbr']: - del a_format['vbr'] - - _search_dimensions_in_video_url(a_format, media_url) - - formats.append(a_format) - + formats.extend(self._parse_media_info(media_info, video_id)) duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) + username = config.get('user', {}).get('screen_name') + if username: + formats.extend(self._extract_mobile_formats(username, video_id)) + + self._remove_duplicate_formats(formats) self._sort_formats(formats) title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') @@ -255,10 +305,10 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'JG', + 'uploader': 'Donte', 'uploader_id': 'jaydingeer', }, 'params': { @@ -270,9 +320,11 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', - 'uploader': 'TAKUMA', - 'uploader_id': '1004126642786242560', + 'title': 'FilmDrunk - Vine of the day', + 'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', + 'uploader': 'FilmDrunk', + 'uploader_id': 'Filmdrunk', + 'timestamp': 1402826626, 'upload_date': '20140615', }, 'add_ie': ['Vine'], @@ -294,13 +346,28 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '1zqKVVlkqLaKB', 'ext': 'mp4', - 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', + 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence', + 'description': 
'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"', 'upload_date': '20160923', 'uploader_id': 'OPP_HSD', - 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', + 'uploader': 'Sgt Kerry Schmidt', 'timestamp': 1474613214, }, 'add_ie': ['Periscope'], + }, { + # has mp4 formats via mobile API + 'url': 'https://twitter.com/news_al3alm/status/852138619213144067', + 'info_dict': { + 'id': '852138619213144067', + 'ext': 'mp4', + 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', + 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', + 'uploader': 'عالم الأخبار', + 'uploader_id': 'news_al3alm', + }, + 'params': { + 'format': 'best[format_id^=http-]', + }, }] def _real_extract(self, url): @@ -393,7 +460,7 @@ class TwitterAmplifyIE(TwitterBaseIE): vmap_url = self._html_search_meta( 'twitter:amplify:vmap', webpage, 'vmap url') - video_url = self._get_vmap_video_url(vmap_url, video_id) + formats = self._extract_formats_from_vmap_url(vmap_url, video_id) thumbnails = [] thumbnail = self._html_search_meta( @@ -415,11 +482,10 @@ class TwitterAmplifyIE(TwitterBaseIE): }) video_w, video_h = _find_dimension('player') - formats = [{ - 'url': video_url, + formats[0].update({ 'width': video_w, 'height': video_h, - }] + }) return { 'id': video_id, diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index dae1aa3c6..207c4a6a7 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -15,6 +15,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + js_to_json, sanitized_Request, unescapeHTML, urlencode_postdata, @@ -52,6 +53,10 @@ class UdemyIE(InfoExtractor): # new URL schema 'url': 
'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', 'only_matching': True, + }, { + # no url in outputs format entry + 'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', + 'only_matching': True, }] def _extract_course_info(self, webpage, video_id): @@ -69,7 +74,7 @@ class UdemyIE(InfoExtractor): return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( @@ -219,7 +224,7 @@ class UdemyIE(InfoExtractor): def extract_output_format(src, f_id): return { - 'url': src['url'], + 'url': src.get('url'), 'format_id': '%sp' % (src.get('height') or f_id), 'width': int_or_none(src.get('width')), 'height': int_or_none(src.get('height')), @@ -264,6 +269,25 @@ class UdemyIE(InfoExtractor): f = add_output_format_meta(f, format_id) formats.append(f) + def extract_subtitles(track_list): + if not isinstance(track_list, list): + return + for track in track_list: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + src = track.get('src') + if not src or not isinstance(src, compat_str): + continue + lang = track.get('language') or track.get( + 'srclang') or track.get('label') + sub_dict = automatic_captions if track.get( + 'autogenerated') is True else subtitles + sub_dict.setdefault(lang, []).append({ + 'url': src, + }) + download_urls = asset.get('download_urls') if isinstance(download_urls, dict): extract_formats(download_urls.get('Video')) @@ -311,23 +335,16 @@ class UdemyIE(InfoExtractor): extract_formats(data.get('sources')) if not duration: duration = int_or_none(data.get('duration')) - tracks = 
data.get('tracks') - if isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - if track.get('kind') != 'captions': - continue - src = track.get('src') - if not src or not isinstance(src, compat_str): - continue - lang = track.get('language') or track.get( - 'srclang') or track.get('label') - sub_dict = automatic_captions if track.get( - 'autogenerated') is True else subtitles - sub_dict.setdefault(lang, []).append({ - 'url': src, - }) + extract_subtitles(data.get('tracks')) + + if not subtitles and not automatic_captions: + text_tracks = self._parse_json( + self._search_regex( + r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html, + 'text tracks', default='{}', group='data'), video_id, + transform_source=lambda s: js_to_json(unescapeHTML(s)), + fatal=False) + extract_subtitles(text_tracks) self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py new file mode 100644 index 000000000..30297b4dd --- /dev/null +++ b/youtube_dl/extractor/upskill.py @@ -0,0 +1,176 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .wistia import WistiaIE +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + urlencode_postdata, + urljoin, +) + + +class UpskillBaseIE(InfoExtractor): + _LOGIN_URL = 'http://upskillcourses.com/sign_in' + _NETRC_MACHINE = 'upskill' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login page') + + login_url = compat_str(urlh.geturl()) + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'user[email]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + 
r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page, + 'post url', default=login_url, group='url') + + if not post_url.startswith('http'): + post_url = urljoin(login_url, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': login_url, + }) + + # Successful login + if any(re.search(p, response) for p in ( + r'class=["\']user-signout', + r'<a[^>]+\bhref=["\']/sign_out', + r'>\s*Log out\s*<')): + return + + message = get_element_by_class('alert', response) + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % clean_html(message), expected=True) + + raise ExtractorError('Unable to log in') + + +class UpskillIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'info_dict': { + 'id': 'uzw6zw58or', + 'ext': 'mp4', + 'title': 'Welcome to the Course!', + 'description': 'md5:8d66c13403783370af62ca97a7357bdd', + 'duration': 138.763, + 'timestamp': 1479846621, + 'upload_date': '20161122', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + wistia_url = WistiaIE._extract_url(webpage) + if not wistia_url: + if any(re.search(p, webpage) for p in ( + r'class=["\']lecture-contents-locked', + r'>\s*Lecture contents locked', + r'id=["\']lecture-locked')): + self.raise_login_required('Lecture contents locked') + + title = self._og_search_title(webpage, default=None) + + return { + '_type': 'url_transparent', + 'url': wistia_url, + 'ie_key': WistiaIE.ie_key(), + 'title': title, + } + + +class 
UpskillCourseIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'info_dict': { + 'id': '119763', + 'title': 'The Essential Web Developer Course (Free)', + }, + 'playlist_count': 192, + }, { + 'url': 'http://upskillcourses.com/courses/119763/', + 'only_matching': True, + }, { + 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if UpskillIE.suitable(url) else super( + UpskillCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + course_id = self._search_regex( + r'data-course-id=["\'](\d+)', webpage, 'course id', + default=course_id) + + entries = [] + + for mobj in re.finditer( + r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', + webpage): + li = mobj.group('li') + if 'fa-youtube-play' not in li: + continue + lecture_url = self._search_regex( + r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, + 'lecture url', default=None, group='url') + if not lecture_url: + continue + lecture_id = self._search_regex( + r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) + title = self._html_search_regex( + r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, + 'title', default=None) + entries.append( + self.url_result( + urljoin('http://upskillcourses.com/', lecture_url), + ie=UpskillIE.ie_key(), video_id=lecture_id, + video_title=clean_html(title))) + + course_title = self._html_search_regex( + (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h', + r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'), + webpage, 'course title', fatal=False) + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py 
index 0f5d68738..b20dddc5c 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,47 +12,46 @@ from ..utils import ( class VeohIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)' - _TESTS = [ - { - 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', - 'md5': '620e68e6a3cff80086df3348426c9ca3', - 'info_dict': { - 'id': '56314296', - 'ext': 'mp4', - 'title': 'Straight Backs Are Stronger', - 'uploader': 'LUMOback', - 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', - }, + _TESTS = [{ + 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', + 'md5': '620e68e6a3cff80086df3348426c9ca3', + 'info_dict': { + 'id': '56314296', + 'ext': 'mp4', + 'title': 'Straight Backs Are Stronger', + 'uploader': 'LUMOback', + 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. 
', }, - { - 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', - 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', - 'info_dict': { - 'id': '27701988', - 'ext': 'mp4', - 'title': 'Chile workers cover up to avoid skin damage', - 'description': 'md5:2bd151625a60a32822873efc246ba20d', - 'uploader': 'afp-news', - 'duration': 123, - }, - 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', + 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', + 'info_dict': { + 'id': '27701988', + 'ext': 'mp4', + 'title': 'Chile workers cover up to avoid skin damage', + 'description': 'md5:2bd151625a60a32822873efc246ba20d', + 'uploader': 'afp-news', + 'duration': 123, }, - { - 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', - 'md5': '4fde7b9e33577bab2f2f8f260e30e979', - 'note': 'Embedded ooyala video', - 'info_dict': { - 'id': '69525809', - 'ext': 'mp4', - 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', - 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', - 'uploader': 'newsy-videos', - }, - 'skip': 'This video has been deleted.', + 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', + 'md5': '4fde7b9e33577bab2f2f8f260e30e979', + 'note': 'Embedded ooyala video', + 'info_dict': { + 'id': '69525809', + 'ext': 'mp4', + 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery', + 'description': 'md5:f5a11c51f8fb51d2315bca0937526891', + 'uploader': 'newsy-videos', }, - ] + 'skip': 'This video has been deleted.', + }, { + 'url': 'http://www.veoh.com/watch/e152215AJxZktGS', + 'only_matching': True, + }] def _extract_formats(self, source): formats = [] diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 0f8c156a7..c21a09c01 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -42,7 +42,7 @@ class VGTVIE(XstreamIE): ) /? 
(?: - \#!/(?:video|live)/| + (?:\#!/)?(?:video|live)/| embed?.*id=| articles/ )| @@ -146,7 +146,11 @@ class VGTVIE(XstreamIE): { 'url': 'abtv:140026', 'only_matching': True, - } + }, + { + 'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', + 'only_matching': True, + }, ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py index 6be3774b7..570fa45ea 100644 --- a/youtube_dl/extractor/vh1.py +++ b/youtube_dl/extractor/vh1.py @@ -121,7 +121,11 @@ class VH1IE(MTVIE): idoc = self._download_xml( doc_url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) - return self.playlist_result( - [self._get_video_info(item) for item in idoc.findall('.//item')], - playlist_id=video_id, - ) + + entries = [] + for item in idoc.findall('.//item'): + info = self._get_video_info(item) + if info: + entries.append(info) + + return self.playlist_result(entries, playlist_id=video_id) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index f0a7fd739..b8b8bf979 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -7,6 +7,7 @@ import hashlib import json from .adobepass import AdobePassIE +from .youtube import YoutubeIE from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -20,7 +21,7 @@ from ..utils import ( class ViceBaseIE(AdobePassIE): - def _extract_preplay_video(self, url, webpage): + def _extract_preplay_video(self, url, locale, webpage): watch_hub_data = extract_attributes(self._search_regex( r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub')) video_id = watch_hub_data['vms-id'] @@ -32,7 +33,8 @@ class ViceBaseIE(AdobePassIE): resource = self._get_mvpd_resource( 'VICELAND', title, video_id, watch_hub_data.get('video-rating')) - query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + query['tvetoken'] = self._extract_mvpd_auth( + url, video_id, 'VICELAND', resource) # signature 
generation algorithm is reverse engineered from signatureGenerator in # webpack:///../shared/~/vice-player/dist/js/vice-player.js in @@ -45,11 +47,14 @@ class ViceBaseIE(AdobePassIE): try: host = 'www.viceland' if is_locked else self._PREPLAY_HOST - preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query) + preplay = self._download_json( + 'https://%s.com/%s/preplay/%s' % (host, locale, video_id), + video_id, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: error = json.loads(e.cause.read().decode()) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error['details']), expected=True) raise video_data = preplay['video'] @@ -88,41 +93,30 @@ class ViceBaseIE(AdobePassIE): class ViceIE(ViceBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)' + IE_NAME = 'vice' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', - 'md5': 'e9d77741f9e42ba583e683cd170660f7', + 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', + 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2', 'info_dict': { - 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', 'ext': 'flv', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - 'duration': 725.983, + 'title': 'Monkey Labs of Holland', + 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149', }, 'add_ie': ['Ooyala'], }, { - 'url': 'http://www.vice.com/video/how-to-hack-a-car', - 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', - 'info_dict': { - 'id': '3jstaBeXgAs', - 'ext': 'mp4', - 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', - 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', - 'uploader_id': 'MotherboardTV', - 
'uploader': 'Motherboard', - 'upload_date': '20140529', - }, - 'add_ie': ['Youtube'], - }, { 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', - 'md5': '', 'info_dict': { 'id': '5816510690b70e6c5fd39a56', 'ext': 'mp4', 'uploader': 'Waypoint', 'title': 'The Signal From Tölva', + 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', 'uploader_id': '57f7d621e05ca860fa9ccaf9', - 'timestamp': 1477941983938, + 'timestamp': 1477941983, + 'upload_date': '20161031', }, 'params': { # m3u8 download @@ -130,19 +124,31 @@ class ViceIE(ViceBaseIE): }, 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'only_matching': True, - }, { - 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', - 'only_matching': True, + 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', + 'info_dict': { + 'id': '581b12b60a0e1f4c0fb6ea2f', + 'ext': 'mp4', + 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', + 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1485368119, + 'upload_date': '20170125', + 'age_limit': 14, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', + 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'only_matching': True, }] _PREPLAY_HOST = 'video.vice' def _real_extract(self, url): - video_id = self._match_id(url) + locale, video_id = re.match(self._VALID_URL, url).groups() webpage, urlh = self._download_webpage_handle(url, video_id) embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, @@ 
-153,10 +159,11 @@ class ViceIE(ViceBaseIE): r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None) if youtube_id: return self.url_result(youtube_id, 'Youtube') - return self._extract_preplay_video(urlh.geturl(), webpage) + return self._extract_preplay_video(urlh.geturl(), locale, webpage) class ViceShowIE(InfoExtractor): + IE_NAME = 'vice:show' _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' _TEST = { @@ -183,6 +190,84 @@ class ViceShowIE(InfoExtractor): r'<title>(.+?)</title>', webpage, 'title', default=None) if title: title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() - description = self._html_search_meta('description', webpage, 'description') + description = self._html_search_meta( + 'description', webpage, 'description') return self.playlist_result(entries, show_id, title, description) + + +class ViceArticleIE(InfoExtractor): + IE_NAME = 'vice:article' + _VALID_URL = r'https://www.vice.com/[^/]+/article/(?P<id>[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', + 'info_dict': { + 'id': '58dc0a3dee202d2a0ccfcbd8', + 'ext': 'mp4', + 'title': 'Mormon War on Porn ', + 'description': 'md5:ad396a2481e7f8afb5ed486878421090', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c693', + 'timestamp': 1489160690, + 'upload_date': '20170310', + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader_id': 'MotherboardTV', + 'uploader': 'Motherboard', + 'upload_date': '20140529', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', + 'only_matching': True, 
+ }, { + 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + prefetch_data = self._parse_json(self._search_regex( + r'window\.__PREFETCH_DATA\s*=\s*({.*});', + webpage, 'prefetch data'), display_id) + body = prefetch_data['body'] + + def _url_res(video_url, ie_key): + return { + '_type': 'url_transparent', + 'url': video_url, + 'display_id': display_id, + 'ie_key': ie_key, + } + + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', body, + 'ooyala embed code', default=None) + if embed_code: + return _url_res('ooyala:%s' % embed_code, 'Ooyala') + + youtube_url = YoutubeIE._extract_url(body) + if youtube_url: + return _url_res(youtube_url, YoutubeIE.ie_key()) + + video_url = self._html_search_regex( + r'data-video-url="([^"]+)"', + prefetch_data['embed_code'], 'video URL') + + return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 87f9216b5..bd60235c8 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .vice import ViceBaseIE class VicelandIE(ViceBaseIE): - _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)' _TEST = { 'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316', 'info_dict': { @@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE): 'skip_download': True, }, 'add_ie': ['UplynkPreplay'], + 'skip': '404', } _PREPLAY_HOST = 'www.viceland' def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + locale = mobj.group('locale') webpage = 
self._download_webpage(url, video_id) - return self._extract_preplay_video(url, webpage) + return self._extract_preplay_video(url, locale, webpage) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index 701bb1d01..01da32f1c 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -56,7 +56,8 @@ class VidioIE(InfoExtractor): self._sort_formats(formats) duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) + r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, + 'duration', fatal=False, group='duration')) thumbnail = thumbnail or self._og_search_thumbnail(webpage) like_count = int_or_none(self._search_regex( diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index e9ff336c4..59adb2377 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import itertools from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, int_or_none, @@ -161,13 +164,28 @@ class VidmeIE(InfoExtractor): 'or for violating the terms of use.', expected=True) - formats = [{ - 'format_id': f.get('type'), - 'url': f['uri'], - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'preference': 0 if f.get('type', '').endswith('clip') else 1, - } for f in video.get('formats', []) if f.get('uri')] + formats = [] + for f in video.get('formats', []): + format_url = f.get('uri') + if not format_url or not isinstance(format_url, compat_str): + continue + format_type = f.get('type') + if format_type == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif format_type == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', 
entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': f.get('type'), + 'url': format_url, + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'preference': 0 if f.get('type', '').endswith( + 'clip') else 1, + }) if not formats and video.get('complete_url'): formats.append({ @@ -245,29 +263,35 @@ class VidmeListBaseIE(InfoExtractor): class VidmeUserIE(VidmeListBaseIE): IE_NAME = 'vidme:user' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)' _API_ITEM = 'list' _TITLE = 'Videos' - _TEST = { - 'url': 'https://vid.me/EFARCHIVE', + _TESTS = [{ + 'url': 'https://vid.me/MasakoX', 'info_dict': { - 'id': '3834632', - 'title': 'EFARCHIVE - %s' % _TITLE, + 'id': '16112341', + 'title': 'MasakoX - %s' % _TITLE, }, - 'playlist_mincount': 238, - } + 'playlist_mincount': 191, + }, { + 'url': 'https://vid.me/unsQuare_netWork', + 'only_matching': True, + }] class VidmeUserLikesIE(VidmeListBaseIE): IE_NAME = 'vidme:user:likes' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes' _API_ITEM = 'likes' _TITLE = 'Likes' - _TEST = { + _TESTS = [{ 'url': 'https://vid.me/ErinAlexis/likes', 'info_dict': { 'id': '6483530', 'title': 'ErinAlexis - %s' % _TITLE, }, 'playlist_mincount': 415, - } + }, { + 'url': 'https://vid.me/Kaleidoscope-Ish/likes', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 5ef7635b6..dbd5ba9ba 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -5,24 +5,44 @@ import re import itertools from .common import InfoExtractor +from ..utils import ( + urlencode_postdata, + int_or_none, + unified_strdate, +) class VierIE(InfoExtractor): IE_NAME = 'vier' IE_DESC = 'vier.be and vijf.be' - 
_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?P<site>vier|vijf)\.be/ + (?: + (?: + [^/]+/videos| + video(?:/[^/]+)* + )/ + (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| + (?: + video/v3/embed| + embed/video/public + )/(?P<embed_id>\d+) + ) + ''' + _NETRC_MACHINE = 'vier' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', + 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', 'info_dict': { 'id': '16129', 'display_id': 'het-wordt-warm-de-moestuin', 'ext': 'mp4', 'title': 'Het wordt warm in De Moestuin', 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'upload_date': '20121025', + 'series': 'Plan B', + 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], }, }, { 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', @@ -30,46 +50,145 @@ class VierIE(InfoExtractor): 'id': '2561614', 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', 'ext': 'mp4', - 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', - 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', + 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', + 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', + 'upload_date': '20170228', + 'series': 'Temptation Island', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'Jani gaat naar Tokio - Aflevering 
4', + 'description': 'md5:aa8d611541db6ae9e863125704511f88', + 'upload_date': '20170501', + 'series': 'Jani gaat', + 'episode_number': 4, + 'tags': ['Jani Gaat', 'Volledige Aflevering'], + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # Requires account credentials but bypassed extraction via v3/embed page + # without metadata + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'jani-gaat-naar-tokio-aflevering-4', }, 'params': { - # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Log in to extract metadata'], }, { - 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', + # Without video id in URL + 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', 'only_matching': True, }, { 'url': 'http://www.vier.be/video/v3/embed/16129', 'only_matching': True, + }, { + 'url': 'https://www.vijf.be/embed/video/public/4093', + 'only_matching': True, + }, { + 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', + 'only_matching': True, + }, { + 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', + 'only_matching': True, }] + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + username, password = self._get_login_info() + if username is None or password is None: + return + + login_page = self._download_webpage( + 'http://www.%s.be/user/login' % site, + None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'form_id': 'user_login', + 'name': username, + 'pass': password, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + login_error = self._html_search_regex( + r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', + login_page, 'login error', default=None) + if 
login_error: + self.report_warning('Unable to log in: %s' % login_error) + else: + self._logged_in = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id + video_id = mobj.group('id') or embed_id site = mobj.group('site') + if not self._logged_in: + self._login(site) + webpage = self._download_webpage(url, display_id) + if r'id="user-login"' in webpage: + self.report_warning( + 'Log in to extract metadata', video_id=display_id) + webpage = self._download_webpage( + 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), + display_id) + video_id = self._search_regex( [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id') - application = self._search_regex( - [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], - webpage, 'application', default=site + '_vod') - filename = self._search_regex( - [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], - webpage, 'filename') - - playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) - formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash']) + webpage, 'video id', default=video_id or display_id) + + playlist_url = self._search_regex( + r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not playlist_url: + application = self._search_regex( + [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], + webpage, 'application', default=site + '_vod') + filename = self._search_regex( + [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], + webpage, 'filename') + playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) + + formats = self._extract_wowza_formats( + playlist_url, display_id, skip_protocols=['dash']) self._sort_formats(formats) title = 
self._og_search_title(webpage, default=display_id) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', + webpage, 'description', default=None, group='value') thumbnail = self._og_search_thumbnail(webpage, default=None) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', + webpage, 'upload date', default=None, group='value')) + + series = self._search_regex( + r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'series', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?i)aflevering (\d+)', title, 'episode number', default=None)) + tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) return { 'id': video_id, @@ -77,6 +196,10 @@ class VierIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'series': series, + 'episode_number': episode_number, + 'tags': tags, 'formats': formats, } diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 4adcd1830..a0abbae60 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,12 +4,14 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, + compat_HTTPError, compat_str, + compat_urlparse, ) from ..utils import ( - parse_duration, + ExtractorError, js_to_json, + parse_duration, parse_iso8601, ) @@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor): base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json( - '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), - lecture_id)['lecture'][0] + try: + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, 
lecture_id), + lecture_id)['lecture'][0] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json( + e.cause.read().decode('utf-8'), lecture_id) + raise ExtractorError(msg['detail'], expected=True) + raise lecture_info = { 'id': lecture_id, diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 61cc469bf..c3f71b45e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -151,10 +151,16 @@ class VimeoBaseInfoExtractor(InfoExtractor): else: mpd_manifest_urls = [(format_id, manifest_url)] for f_id, m_url in mpd_manifest_urls: - formats.extend(self._extract_mpd_formats( + mpd_formats = self._extract_mpd_formats( m_url.replace('/master.json', '/master.mpd'), video_id, f_id, 'Downloading %s MPD information' % cdn_name, - fatal=False)) + fatal=False) + for f in mpd_formats: + if f.get('vcodec') == 'none': + f['preference'] = -50 + elif f.get('acodec') == 'none': + f['preference'] = -40 + formats.extend(mpd_formats) subtitles = {} text_tracks = config['request'].get('text_tracks') @@ -609,7 +615,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): source_name = source_file.get('public_name', 'Original') if self._is_valid_url(download_url, video_id, '%s video' % source_name): - ext = source_file.get('extension', determine_ext(download_url)).lower() + ext = (try_get( + source_file, lambda x: x['extension'], + compat_str) or determine_ext( + download_url, None) or 'mp4').lower() formats.append({ 'url': download_url, 'ext': ext, diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 4957a07f7..46950d3a1 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -92,10 +92,12 @@ class VineIE(InfoExtractor): username = data.get('username') + alt_title = 'Vine by %s' % username if username else None + return { 'id': video_id, - 'title': 
data.get('description'), - 'alt_title': 'Vine by %s' % username if username else None, + 'title': data.get('description') or alt_title or 'Vine video', + 'alt_title': alt_title, 'thumbnail': data.get('thumbnailUrl'), 'timestamp': unified_timestamp(data.get('created')), 'uploader': username, diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index db6a65d2e..5cf93591c 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_kwargs, + compat_str, +) from ..utils import ( ExtractorError, int_or_none, @@ -36,7 +39,8 @@ class ViuBaseIE(InfoExtractor): headers.update(kwargs.get('headers', {})) kwargs['headers'] = headers response = self._download_json( - 'https://www.viu.com/api/' + path, *args, **kwargs)['response'] + 'https://www.viu.com/api/' + path, *args, + **compat_kwargs(kwargs))['response'] if response.get('status') != 'success': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index dc2719cf9..105e172d5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -25,6 +25,7 @@ from ..utils import ( from .dailymotion import DailymotionIE from .pladform import PladformIE from .vimeo import VimeoIE +from .youtube import YoutubeIE class VKBaseIE(InfoExtractor): @@ -345,11 +346,9 @@ class VKIE(VKBaseIE): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) - youtube_url = self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', - info_page, 'youtube iframe', default=None) + youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) vimeo_url = 
VimeoIE._extract_url(url, info_page) if vimeo_url is not None: diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index e58940607..64d0224e6 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -49,6 +49,10 @@ class VLiveIE(InfoExtractor): }, }] + @classmethod + def suitable(cls, url): + return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) + def _real_extract(self, url): video_id = self._match_id(url) @@ -232,7 +236,12 @@ class VLiveChannelIE(InfoExtractor): query={ 'app_id': app_id, 'channelSeq': channel_seq, - 'maxNumOfRows': 1000, + # Large values of maxNumOfRows (~300 or above) may cause + # empty responses (see [1]), e.g. this happens for [2] that + # has more than 300 videos. + # 1. https://github.com/rg3/youtube-dl/issues/13830 + # 2. http://channels.vlive.tv/EDBF. + 'maxNumOfRows': 100, '_': int(time.time()), 'pageNo': page_num } @@ -261,3 +270,54 @@ class VLiveChannelIE(InfoExtractor): return self.playlist_result( entries, channel_code, channel_name) + + +class VLivePlaylistIE(InfoExtractor): + IE_NAME = 'vlive:playlist' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.vlive.tv/video/22867/playlist/22912', + 'info_dict': { + 'id': '22912', + 'title': 'Valentine Day Message from TWICE' + }, + 'playlist_mincount': 9 + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, playlist_id = mobj.group('video_id', 'id') + + VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' + if self._downloader.params.get('noplaylist'): + self.to_screen( + 'Downloading just video %s because of --no-playlist' % video_id) + return self.url_result( + VIDEO_URL_TEMPLATE % video_id, + ie=VLiveIE.ie_key(), video_id=video_id) + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to just download video' + % playlist_id) + + webpage = self._download_webpage( + 
'http://www.vlive.tv/video/%s/playlist/%s' + % (video_id, playlist_id), playlist_id) + + item_ids = self._parse_json( + self._search_regex( + r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, + 'playlist video seqs'), + playlist_id) + + entries = [ + self.url_result( + VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), + video_id=compat_str(item_id)) + for item_id in item_ids] + + playlist_name = self._html_search_regex( + r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', + webpage, 'playlist title', fatal=False) + + return self.playlist_result(entries, playlist_id, playlist_name) diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py new file mode 100644 index 000000000..5de3deb8c --- /dev/null +++ b/youtube_dl/extractor/voot.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + unified_timestamp, +) + + +class VootIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)' + _GEO_COUNTRIES = ['IN'] + _TESTS = [{ + 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', + 'info_dict': { + 'id': '0_8ledb18o', + 'ext': 'mp4', + 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', + 'uploader_id': 'batchUser', + 'timestamp': 1472162937, + 'upload_date': '20160825', + 'duration': 1146, + 'series': 'Ishq Ka Rang Safed', + 'season_number': 1, + 'episode': 'Is this the end of Kamini?', + 'episode_number': 340, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', + 'only_matching': True, + }, { + 'url': 
'https://www.voot.com/movies/pandavas-5/424627', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + media_info = self._download_json( + 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, + query={ + 'platform': 'Web', + 'pId': 2, + 'mediaId': video_id, + }) + + status_code = try_get(media_info, lambda x: x['status']['code'], int) + if status_code != 0: + raise ExtractorError(media_info['status']['message'], expected=True) + + media = media_info['assets'] + + entry_id = media['EntryId'] + title = media['MediaName'] + + description, series, season_number, episode, episode_number = [None] * 5 + + for meta in try_get(media, lambda x: x['Metas'], list) or []: + key, value = meta.get('Key'), meta.get('Value') + if not key or not value: + continue + if key == 'ContentSynopsis': + description = value + elif key == 'RefSeriesTitle': + series = value + elif key == 'RefSeriesSeason': + season_number = int_or_none(value) + elif key == 'EpisodeMainTitle': + episode = value + elif key == 'EpisodeNo': + episode_number = int_or_none(value) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:1982551:%s' % entry_id, + 'ie_key': KalturaIE.ie_key(), + 'title': title, + 'description': description, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'timestamp': unified_timestamp(media.get('CreationDate')), + 'duration': int_or_none(media.get('Duration')), + 'view_count': int_or_none(media.get('ViewCounter')), + 'like_count': int_or_none(media.get('like_counter')), + } diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 487047fd7..9959627c0 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -112,21 +112,41 @@ class VRVIE(VRVBaseIE): audio_locale = streams_json.get('audio_locale') formats = [] - for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items(): - stream_url = 
stream.get('url') - if not stream_url: - continue - stream_id = stream_id or audio_locale - m3u8_formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id=stream_id, - note='Downloading %s m3u8 information' % stream_id, - fatal=False) - if audio_locale: - for f in m3u8_formats: - f['language'] = audio_locale - formats.extend(m3u8_formats) + for stream_type, streams in streams_json.get('streams', {}).items(): + if stream_type in ('adaptive_hls', 'adaptive_dash'): + for stream in streams.values(): + stream_url = stream.get('url') + if not stream_url: + continue + stream_id = stream.get('hardsub_locale') or audio_locale + format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) + if stream_type == 'adaptive_hls': + adaptive_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + else: + adaptive_formats = self._extract_mpd_formats( + stream_url, video_id, mpd_id=format_id, + note='Downloading %s MPD information' % stream_id, + fatal=False) + if audio_locale: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_locale + formats.extend(adaptive_formats) self._sort_formats(formats) + subtitles = {} + for subtitle in streams_json.get('subtitles', {}).values(): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + thumbnails = [] for thumbnail in video_data.get('images', {}).get('thumbnails', []): thumbnail_url = thumbnail.get('source') @@ -142,6 +162,7 @@ class VRVIE(VRVBaseIE): 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'description': video_data.get('description'), 'duration': float_or_none(video_data.get('duration_ms'), 1000), diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py 
index b270f08d1..02fcd52c7 100644 --- a/youtube_dl/extractor/vzaar.py +++ b/youtube_dl/extractor/vzaar.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -28,6 +30,12 @@ class VzaarIE(InfoExtractor): }, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py new file mode 100644 index 000000000..b382338fa --- /dev/null +++ b/youtube_dl/extractor/watchbox.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + js_to_json, + strip_or_none, + try_get, + unified_timestamp, +) + + +class WatchBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)' + _TESTS = [{ + # film + 'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html', + 'info_dict': { + 'id': '341368', + 'ext': 'mp4', + 'title': 'Free Jimmy', + 'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4890, + 'age_limit': 16, + 'release_year': 2009, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + # episode + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html', + 'info_dict': { + 'id': '328286', + 'ext': 'mp4', + 'title': 'S01 E01 - Date in der Hölle', + 'description': 'md5:2f31c74a8186899f33cb5114491dae2b', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1291, + 'age_limit': 12, + 'release_year': 2010, + 'series': 
'Ugly Americans', + 'season_number': 1, + 'episode': 'Date in der Hölle', + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id = mobj.group('kind', 'id') + + webpage = self._download_webpage(url, video_id) + + source = self._parse_json( + self._search_regex( + r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) or {} + + video_id = compat_str(source.get('videoId') or video_id) + + devapi = self._download_json( + 'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={ + 'format': 'json', + 'apikey': 'hbbtv', + }, fatal=False) + + item = try_get(devapi, lambda x: x['items'][0], dict) or {} + + title = item.get('title') or try_get( + item, lambda x: x['movie']['headline_movie'], + compat_str) or source['title'] + + formats = [] + hls_url = item.get('media_videourl_hls') or source.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + dash_url = item.get('media_videourl_wv') or source.get('dash') + if dash_url: + formats.extend(self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', fatal=False)) + mp4_url = item.get('media_videourl') + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + 'tbr': int_or_none(item.get('bitrate')), + }) + self._sort_formats(formats) + + description = strip_or_none(item.get('descr')) + thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail') + duration = 
int_or_none(item.get('media_length') or source.get('length')) + timestamp = unified_timestamp(item.get('pubDate')) + view_count = int_or_none(item.get('media_views')) + age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk'])) + release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year'])) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'age_limit': age_limit, + 'release_year': release_year, + 'formats': formats, + } + + if kind.lower() == 'serien': + series = try_get( + item, lambda x: x['special']['title'], + compat_str) or source.get('format') + season_number = int_or_none(self._search_regex( + r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number', + default=None) or self._search_regex( + r'/staffel-(\d+)/', url, 'season number', default=None)) + episode = source.get('title') + episode_number = int_or_none(self._search_regex( + r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number', + default=None)) + info.update({ + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info diff --git a/youtube_dl/extractor/watchindianporn.py b/youtube_dl/extractor/watchindianporn.py index ed099beea..fadc539ee 100644 --- a/youtube_dl/extractor/watchindianporn.py +++ b/youtube_dl/extractor/watchindianporn.py @@ -4,11 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - unified_strdate, - parse_duration, - int_or_none, -) +from ..utils import parse_duration class WatchIndianPornIE(InfoExtractor): @@ -23,11 +19,8 @@ class WatchIndianPornIE(InfoExtractor): 'ext': 'mp4', 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'LoveJay', - 'upload_date': '20160428', 'duration': 226, 'view_count': int, - 'comment_count': int, 
'categories': list, 'age_limit': 18, } @@ -40,51 +33,36 @@ class WatchIndianPornIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - video_url = self._html_search_regex( - r"url: escape\('([^']+)'\)", webpage, 'url') + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - title = self._html_search_regex( - r'<h2 class="he2"><span>(.*?)</span>', - webpage, 'title') - thumbnail = self._html_search_regex( - r'<span id="container"><img\s+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'class="aupa">\s*(.*?)</a>', - webpage, 'uploader') - upload_date = unified_strdate(self._html_search_regex( - r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) + title = self._html_search_regex(( + r'<title>(.+?)\s*-\s*Indian\s+Porn</title>', + r'<h4>(.+?)</h4>' + ), webpage, 'title') duration = parse_duration(self._search_regex( - r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', + r'Time:\s*<strong>\s*(.+?)\s*</strong>', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + view_count = int(self._search_regex( + r'(?s)Time:\s*<strong>.*?</strong>.*?<strong>\s*(\d+)\s*</strong>', webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'comment count', fatal=False)) categories = re.findall( - r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', + r'<a[^>]+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*</a>', webpage) - return { + info_dict.update({ 'id': video_id, 'display_id': display_id, - 'url': video_url, 'http_headers': { 'Referer': url, }, 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, 'duration': duration, 'view_count': view_count, - 'comment_count': comment_count, 'categories': categories, 
'age_limit': 18, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index c634b8dec..2182d6fd4 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,10 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, float_or_none, + unescapeHTML, ) @@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + match = re.search( + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + if match: + return unescapeHTML(match.group('url')) + + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) + if match: + return 'wistia:%s' % match.group('id') + + match = re.search( + r'''(?sx) + <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? 
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return 'wistia:%s' % match.group('id') + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 45cfca7c5..9b5487710 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -13,7 +13,7 @@ class WSJIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| - https?://(?:www\.)?wsj\.com/video/[^/]+/| + https?://(?:www\.)?(?:wsj|barrons)\.com/video/[^/]+/| wsj: ) (?P<id>[a-fA-F0-9-]{36}) @@ -35,6 +35,9 @@ class WSJIE(InfoExtractor): }, { 'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html', 'only_matching': True, + }, { + 'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 13f8be6cb..ad747978d 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -10,7 +10,6 @@ from ..utils import ( ExtractorError, int_or_none, NO_DEFAULT, - sanitized_Request, urlencode_postdata, ) @@ -30,6 +29,8 @@ class XFileShareIE(InfoExtractor): (r'vidabc\.com', 'Vid ABC'), (r'vidbom\.com', 'VidBom'), (r'vidlo\.us', 'vidlo'), + (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'), + (r'fastvideo\.me', 'FastVideo.me'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) @@ -109,6 +110,12 @@ class XFileShareIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://www.rapidvideo.cool/b667kprndr8w', + 'only_matching': True, + }, { + 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', + 'only_matching': True }] def _real_extract(self, url): 
@@ -130,12 +137,12 @@ class XFileShareIE(InfoExtractor): if countdown: self._sleep(countdown, video_id) - post = urlencode_postdata(fields) - - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - webpage = self._download_webpage(req, video_id, 'Downloading video page') + webpage = self._download_webpage( + url, video_id, 'Downloading video page', + data=urlencode_postdata(fields), headers={ + 'Referer': url, + 'Content-type': 'application/x-www-form-urlencoded', + }) title = (self._search_regex( (r'style="z-index: [0-9]+;">([^<]+)</span>', @@ -150,7 +157,7 @@ class XFileShareIE(InfoExtractor): def extract_formats(default=NO_DEFAULT): urls = [] for regex in ( - r'file\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1', r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)', r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 7b6703714..c42b59e51 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + clean_html, dict_get, ExtractorError, int_or_none, @@ -13,29 +15,41 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?' 
+ _VALID_URL = r'''(?x) + https?:// + (?:.+?\.)?xhamster\.com/ + (?: + movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html| + videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+) + ) + ''' + _TESTS = [{ 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', 'md5': '8281348b8d3c53d39fffb377d24eac4e', 'info_dict': { 'id': '1509445', + 'display_id': 'femaleagent_shy_beauty_takes_the_bait', 'ext': 'mp4', 'title': 'FemaleAgent Shy beauty takes the bait', 'upload_date': '20121014', 'uploader': 'Ruseful2011', 'duration': 893, 'age_limit': 18, + 'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy'], }, }, { 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', 'info_dict': { 'id': '2221348', + 'display_id': 'britney_spears_sexy_booty', 'ext': 'mp4', 'title': 'Britney Spears Sexy Booty', 'upload_date': '20130914', 'uploader': 'jojo747400', 'duration': 200, 'age_limit': 18, + 'categories': ['Britney Spears', 'Celebrities', 'HD Videos', 'Sexy', 'Sexy Booty'], }, 'params': { 'skip_download': True, @@ -51,6 +65,7 @@ class XHamsterIE(InfoExtractor): 'uploader': 'parejafree', 'duration': 72, 'age_limit': 18, + 'categories': ['Amateur', 'Blowjobs'], }, 'params': { 'skip_download': True, @@ -62,26 +77,18 @@ class XHamsterIE(InfoExtractor): # This video is visible for marcoalfa123456's friends only 'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html', 'only_matching': True, + }, { + # new URL schema + 'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821', + 'only_matching': True, }] def _real_extract(self, url): - def extract_video_url(webpage, name): - return self._search_regex( - [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', - r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', - r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], - webpage, name, group='mp4') - - def is_hd(webpage): - return '<div class=\'icon iconHD\'' in webpage - 
mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') + display_id = mobj.group('display_id') or mobj.group('display_id_2') - video_id = mobj.group('id') - seo = mobj.group('seo') - proto = mobj.group('proto') - mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo) - webpage = self._download_webpage(mrss_url, video_id) + webpage = self._download_webpage(url, video_id) error = self._html_search_regex( r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>', @@ -95,6 +102,39 @@ class XHamsterIE(InfoExtractor): r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'], webpage, 'title') + formats = [] + format_urls = set() + + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources', + default='{}'), + video_id, fatal=False) + for format_id, format_url in sources.items(): + if not isinstance(format_url, compat_str): + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + }) + + video_url = self._search_regex( + [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', + r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', + r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], + webpage, 'video url', group='mp4', default=None) + if video_url and video_url not in format_urls: + formats.append({ + 'url': video_url, + }) + + self._sort_formats(formats) + # Only a few videos have an description mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) description = mobj.group(1) if mobj else None @@ -104,7 +144,7 @@ class XHamsterIE(InfoExtractor): webpage, 'upload date', fatal=False)) uploader = self._html_search_regex( - 
r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+href=["\'].+?xhamster\.com/user/[^>]+>(?P<uploader>.+?)</a>', + r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+><span[^>]+>([^<]+)', webpage, 'uploader', default='anonymous') thumbnail = self._search_regex( @@ -113,14 +153,15 @@ class XHamsterIE(InfoExtractor): webpage, 'thumbnail', fatal=False, group='thumbnail') duration = parse_duration(self._search_regex( - r'Runtime:\s*</span>\s*([\d:]+)', webpage, + [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']', + r'Runtime:\s*</span>\s*([\d:]+)'], webpage, 'duration', fatal=False)) view_count = int_or_none(self._search_regex( r'content=["\']User(?:View|Play)s:(\d+)', webpage, 'view count', fatal=False)) - mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage) + mobj = re.search(r'hint=[\'"](?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes', webpage) (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None) mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage) @@ -128,32 +169,15 @@ class XHamsterIE(InfoExtractor): age_limit = self._rta_search(webpage) - hd = is_hd(webpage) - - format_id = 'hd' if hd else 'sd' - - video_url = extract_video_url(webpage, format_id) - formats = [{ - 'url': video_url, - 'format_id': 'hd' if hd else 'sd', - 'preference': 1, - }] - - if not hd: - mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url') - webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') - if is_hd(webpage): - video_url = extract_video_url(webpage, 'hd') - formats.append({ - 'url': video_url, - 'format_id': 'hd', - 'preference': 2, - }) - - self._sort_formats(formats) + categories_html = self._search_regex( + r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage, + 'categories', default=None) + categories = [clean_html(category) for category in re.findall( + 
r'<a[^>]+>(.+?)</a>', categories_html)] if categories_html else None return { 'id': video_id, + 'display_id': display_id, 'title': title, 'description': description, 'upload_date': upload_date, @@ -165,6 +189,7 @@ class XHamsterIE(InfoExtractor): 'dislike_count': int_or_none(dislike_count), 'comment_count': int_or_none(comment_count), 'age_limit': age_limit, + 'categories': categories, 'formats': formats, } diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index e0818201a..0276c0dbb 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -1,14 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 - from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, + float_or_none, + get_element_by_attribute, parse_iso8601, - parse_duration, + remove_end, ) @@ -24,6 +23,7 @@ class XuiteIE(InfoExtractor): 'id': '3860914', 'ext': 'mp3', 'title': '孤單南半球-歐德陽', + 'description': '孤單南半球-歐德陽', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 247.246, 'timestamp': 1314932940, @@ -44,7 +44,7 @@ class XuiteIE(InfoExtractor): 'duration': 596.458, 'timestamp': 1454242500, 'upload_date': '20160131', - 'uploader': 'yan12125', + 'uploader': '屁姥', 'uploader_id': '12158353', 'categories': ['個人短片'], 'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4', @@ -72,10 +72,10 @@ class XuiteIE(InfoExtractor): # from http://forgetfulbc.blogspot.com/2016/06/date.html 'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0', 'info_dict': { - 'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==', + 'id': '27447336', 'ext': 'mp4', 'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)', - 'description': 'md5:f0abdcb69df300f522a5442ef3146f2a', + 'description': 'md5:1223810fa123b179083a3aed53574706', 'timestamp': 1466160960, 'upload_date': '20160617', 'uploader': 'B.C. 
& Lowy', @@ -86,29 +86,9 @@ class XuiteIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def base64_decode_utf8(data): - return base64.b64decode(data.encode('utf-8')).decode('utf-8') - - @staticmethod - def base64_encode_utf8(data): - return base64.b64encode(data.encode('utf-8')).decode('utf-8') - - def _extract_flv_config(self, encoded_media_id): - flv_config = self._download_xml( - 'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id, - 'flv config') - prop_dict = {} - for prop in flv_config.findall('./property'): - prop_id = self.base64_decode_utf8(prop.attrib['id']) - # CDATA may be empty in flv config - if not prop.text: - continue - encoded_content = self.base64_decode_utf8(prop.text) - prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content) - return prop_dict - def _real_extract(self, url): + # /play/ URLs provide embedded video URL and more metadata + url = url.replace('/embed/', '/play/') video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -121,51 +101,53 @@ class XuiteIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error_msg), expected=True) - encoded_media_id = self._search_regex( - r'attributes\.name\s*=\s*"([^"]+)"', webpage, - 'encoded media id', default=None) - if encoded_media_id is None: - video_id = self._html_search_regex( - r'data-mediaid="(\d+)"', webpage, 'media id') - encoded_media_id = self.base64_encode_utf8(video_id) - flv_config = self._extract_flv_config(encoded_media_id) + media_info = self._parse_json(self._search_regex( + r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id) - FORMATS = { - 'audio': 'mp3', - 'video': 'mp4', - } + video_id = media_info['MEDIA_ID'] formats = [] - for format_tag in ('src', 'hq_src'): - video_url = flv_config.get(format_tag) + for key in ('html5Url', 'html5HQUrl'): + video_url = media_info.get(key) if not video_url: continue format_id = self._search_regex( - r'\bq=(.+?)\b', video_url, 'format id', 
default=format_tag) + r'\bq=(.+?)\b', video_url, 'format id', default=None) formats.append({ 'url': video_url, - 'ext': FORMATS.get(flv_config['type'], 'mp4'), + 'ext': 'mp4' if format_id.isnumeric() else format_id, 'format_id': format_id, 'height': int(format_id) if format_id.isnumeric() else None, }) self._sort_formats(formats) - timestamp = flv_config.get('publish_datetime') + timestamp = media_info.get('PUBLISH_DATETIME') if timestamp: timestamp = parse_iso8601(timestamp + ' +0800', ' ') - category = flv_config.get('category') + category = media_info.get('catName') categories = [category] if category else [] + uploader = media_info.get('NICKNAME') + uploader_url = None + + author_div = get_element_by_attribute('itemprop', 'author', webpage) + if author_div: + uploader = uploader or self._html_search_meta('name', author_div) + uploader_url = self._html_search_regex( + r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div, + 'uploader URL', fatal=False) + return { 'id': video_id, - 'title': flv_config['title'], - 'description': flv_config.get('description'), - 'thumbnail': flv_config.get('thumb'), + 'title': media_info['TITLE'], + 'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'), + 'thumbnail': media_info.get('ogImageUrl'), 'timestamp': timestamp, - 'uploader': flv_config.get('author_name'), - 'uploader_id': flv_config.get('author_id'), - 'duration': parse_duration(flv_config.get('duration')), + 'uploader': uploader, + 'uploader_id': media_info.get('MEMBER_ID'), + 'uploader_url': uploader_url, + 'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000), 'categories': categories, 'formats': formats, } diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py index 5c8f17eb2..e34ebe3a6 100644 --- a/youtube_dl/extractor/xxxymovies.py +++ b/youtube_dl/extractor/xxxymovies.py @@ -39,8 +39,8 @@ class XXXYMoviesIE(InfoExtractor): r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') title = 
self._html_search_regex( - [r'<div class="block_header">\s*<h1>([^<]+)</h1>', - r'<title>(.*?)\s*-\s*XXXYMovies\.com</title>'], + [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<', + r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'], webpage, 'title') thumbnail = self._search_regex( diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py deleted file mode 100644 index ef5535547..000000000 --- a/youtube_dl/extractor/yam.py +++ /dev/null @@ -1,123 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - float_or_none, - month_by_abbreviation, - ExtractorError, - get_element_by_attribute, -) - - -class YamIE(InfoExtractor): - IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P<id>\d+)' - - _TESTS = [{ - # An audio hosted on Yam - 'url': 'http://mymedia.yam.com/m/2283921', - 'md5': 'c011b8e262a52d5473d9c2e3c9963b9c', - 'info_dict': { - 'id': '2283921', - 'ext': 'mp3', - 'title': '發現 - 趙薇 京華煙雲主題曲', - 'description': '發現 - 趙薇 京華煙雲主題曲', - 'uploader_id': 'princekt', - 'upload_date': '20080807', - 'duration': 313.0, - } - }, { - # An external video hosted on YouTube - 'url': 'http://mymedia.yam.com/m/3599430', - 'md5': '03127cf10d8f35d120a9e8e52e3b17c6', - 'info_dict': { - 'id': 'CNpEoQlrIgA', - 'ext': 'mp4', - 'upload_date': '20150306', - 'uploader': '新莊社大瑜伽社', - 'description': 'md5:11e2e405311633ace874f2e6226c8b17', - 'uploader_id': '2323agoy', - 'title': '20090412陽明山二子坪-1', - }, - 'skip': 'Video does not exist', - }, { - 'url': 'http://mymedia.yam.com/m/3598173', - 'info_dict': { - 'id': '3598173', - 'ext': 'mp4', - }, - 'skip': 'cause Yam system error', - }, { - 'url': 'http://mymedia.yam.com/m/3599437', - 'info_dict': { - 'id': '3599437', - 'ext': 'mp4', - }, - 'skip': 'invalid YouTube URL', - }, { - 'url': 'http://mymedia.yam.com/m/2373534', - 'md5': '7ff74b91b7a817269d83796f8c5890b1', - 
'info_dict': { - 'id': '2373534', - 'ext': 'mp3', - 'title': '林俊傑&蔡卓妍-小酒窩', - 'description': 'md5:904003395a0fcce6cfb25028ff468420', - 'upload_date': '20080928', - 'uploader_id': 'onliner2', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - page = self._download_webpage(url, video_id) - - # Check for errors - system_msg = self._html_search_regex( - r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message', - default=None) - if system_msg: - raise ExtractorError(system_msg, expected=True) - - # Is it hosted externally on YouTube? - youtube_url = self._html_search_regex( - r'<embed src="(http://www.youtube.com/[^"]+)"', - page, 'YouTube url', default=None) - if youtube_url: - return self.url_result(youtube_url, 'Youtube') - - title = self._html_search_regex( - r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title') - - api_page = self._download_webpage( - 'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id, - note='Downloading API page') - api_result_obj = compat_urlparse.parse_qs(api_page) - - info_table = get_element_by_attribute('class', 'info', page) - uploader_id = self._html_search_regex( - r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"', - info_table, 'uploader id', fatal=False) - mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' + - r'(?P<day>\d{1,2}), (?P<year>\d{4})', page) - if mobj: - upload_date = '%s%02d%02d' % ( - mobj.group('year'), - month_by_abbreviation(mobj.group('mon')), - int(mobj.group('day'))) - else: - upload_date = None - duration = float_or_none(api_result_obj['totaltime'][0], scale=1000) - - return { - 'id': video_id, - 'url': api_result_obj['mp3file'][0], - 'title': title, - 'description': self._html_search_meta('description', page), - 'duration': duration, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - } diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py new file mode 100644 index 000000000..e8f6ae10f --- /dev/null +++ 
b/youtube_dl/extractor/yandexdisk.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + urlencode_postdata, +) + + +class YandexDiskIE(InfoExtractor): + _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', + 'md5': '33955d7ae052f15853dc41f35f17581c', + 'info_dict': { + 'id': 'VdOeDou8eZs6Y', + 'ext': 'mp4', + 'title': '4.mp4', + 'duration': 168.6, + 'uploader': 'y.botova', + 'uploader_id': '300043621', + 'view_count': int, + }, + }, { + 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + status = self._download_webpage( + 'https://disk.yandex.com/auth/status', video_id, query={ + 'urlOrigin': url, + 'source': 'public', + 'md5': 'false', + }) + + sk = self._search_regex( + r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P<value>(?:(?!\2).)+)\2', + status, 'sk', group='value') + + webpage = self._download_webpage(url, video_id) + + models = self._parse_json( + self._search_regex( + r'<script[^>]+id=["\']models-client[^>]+>\s*(\[.+?\])\s*</script', + webpage, 'video JSON'), + video_id) + + data = next( + model['data'] for model in models + if model.get('model') == 'resource') + + video_hash = data['id'] + title = data['name'] + + models = self._download_json( + 'https://disk.yandex.com/models/', video_id, + data=urlencode_postdata({ + '_model.0': 'videoInfo', + 'id.0': video_hash, + '_model.1': 'do-get-resource-url', + 'id.1': video_hash, + 'version': '13.6', + 'sk': sk, + }), query={'_m': 'videoInfo'})['models'] + + videos = try_get(models, lambda x: x[0]['data']['videos'], list) or [] + source_url = try_get( + models, lambda x: x[1]['data']['file'], compat_str) + + formats = [] + if source_url: + formats.append({ + 'url': source_url, + 
'format_id': 'source', + 'ext': determine_ext(title, 'mp4'), + 'quality': 1, + }) + for video in videos: + format_url = video.get('url') + if not format_url: + continue + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': format_url, + }) + self._sort_formats(formats) + + duration = float_or_none(try_get( + models, lambda x: x[0]['data']['duration']), 1000) + uploader = try_get( + data, lambda x: x['user']['display_name'], compat_str) + uploader_id = try_get( + data, lambda x: x['user']['uid'], compat_str) + view_count = int_or_none(try_get( + data, lambda x: x['meta']['views_counter'])) + + return { + 'id': video_id, + 'title': title, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index b50f34e9b..f33fabe19 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,39 +1,95 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, +) class YouJizzIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])' + _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))' _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': '78fc1901148284c69af12640e01c6310', + 'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4', 'info_dict': { 'id': '2189178', 'ext': 'mp4', 'title': 'Zeichentrick 1', 'age_limit': 18, + 'duration': 2874, } }, { 'url': 'http://www.youjizz.com/videos/-2189178.html', 'only_matching': True, + }, { + 'url': 
'https://www.youjizz.com/videos/embed/31991001', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('embed_id') + webpage = self._download_webpage(url, video_id) - # YouJizz's HTML5 player has invalid HTML - webpage = webpage.replace('"controls', '" controls') - age_limit = self._rta_search(webpage) - video_title = self._html_search_regex( - r'<title>\s*(.*)\s*</title>', webpage, 'title') - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] + title = self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'title') + + formats = [] + + encodings = self._parse_json( + self._search_regex( + r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', + default='[]'), + video_id, fatal=False) + for encoding in encodings: + if not isinstance(encoding, dict): + continue + format_url = encoding.get('filename') + if not isinstance(format_url, compat_str): + continue + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + format_id = encoding.get('name') or encoding.get('quality') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': height, + }) + + if formats: + info_dict = { + 'formats': formats, + } + else: + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') + info_dict = self._parse_html5_media_entries( + url, webpage, video_id)[0] + + duration = parse_duration(self._search_regex( + r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration', + default=None)) + uploader = self._search_regex( + r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader', + default=None) info_dict.update({ 'id': video_id, - 'title': video_title, - 
'age_limit': age_limit, + 'title': title, + 'age_limit': self._rta_search(webpage), + 'duration': duration, + 'uploader': uploader, }) return info_dict diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 73ebe5759..0c4bc2eda 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,23 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals -import base64 -import itertools import random import re import string import time from .common import InfoExtractor -from ..compat import ( - compat_ord, - compat_str, - compat_urllib_parse_urlencode, -) from ..utils import ( ExtractorError, - get_element_by_attribute, - try_get, + get_element_by_class, + js_to_json, + str_or_none, + strip_jsonp, ) @@ -26,7 +21,9 @@ class YoukuIE(InfoExtractor): IE_DESC = '优酷' _VALID_URL = r'''(?x) (?: - http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + https?://( + (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + video\.tudou\.com/v/)| youku:) (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) ''' @@ -35,9 +32,15 @@ class YoukuIE(InfoExtractor): # MD5 is unstable 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', 'info_dict': { - 'id': 'XMTc1ODE5Njcy_part1', + 'id': 'XMTc1ODE5Njcy', 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', - 'ext': 'flv' + 'ext': 'mp4', + 'duration': 74.73, + 'thumbnail': r're:^https?://.*', + 'uploader': '。躲猫猫、', + 'uploader_id': '36017967', + 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', + 'tags': list, } }, { 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', @@ -46,25 +49,42 @@ class YoukuIE(InfoExtractor): 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', 'info_dict': { 'id': 'XODgxNjg1Mzk2', + 'ext': 'mp4', 'title': '武媚娘传奇 85', + 'duration': 1999.61, + 'thumbnail': r're:^https?://.*', + 'uploader': '疯狂豆花', + 'uploader_id': '62583473', + 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', + 'tags': list, }, - 'playlist_count': 11, - 
'skip': 'Available in China only', }, { 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', 'info_dict': { 'id': 'XMTI1OTczNDM5Mg', + 'ext': 'mp4', 'title': '花千骨 04', + 'duration': 2363, + 'thumbnail': r're:^https?://.*', + 'uploader': '放剧场-花千骨', + 'uploader_id': '772849359', + 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', + 'tags': list, }, - 'playlist_count': 13, }, { 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', 'note': 'Video protected with password', 'info_dict': { 'id': 'XNjA1NzA2Njgw', + 'ext': 'mp4', 'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', + 'duration': 7264.5, + 'thumbnail': r're:^https?://.*', + 'uploader': 'FoxJin1006', + 'uploader_id': '322014285', + 'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', + 'tags': list, }, - 'playlist_count': 19, 'params': { 'videopassword': '100600', }, @@ -73,130 +93,38 @@ class YoukuIE(InfoExtractor): 'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html', 'info_dict': { 'id': 'XOTUxMzg4NDMy', + 'ext': 'mp4', 'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', + 'duration': 702.08, + 'thumbnail': r're:^https?://.*', + 'uploader': '明月庄主moon', + 'uploader_id': '38465621', + 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', + 'tags': list, + }, + }, { + 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', + 'info_dict': { + 'id': 'XMjIyNzAzMTQ4NA', + 'ext': 'mp4', + 'title': '卡马乔国足开大脚长传冲吊集锦', + 'duration': 289, + 'thumbnail': r're:^https?://.*', + 'uploader': '阿卜杜拉之星', + 'uploader_id': '2382249', + 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', + 'tags': list, }, - 'playlist_count': 6, + }, { + 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', + 'only_matching': True, }] - def construct_video_urls(self, data): - # get sid, token - def yk_t(s1, s2): - ls = list(range(256)) - t = 0 - for i in range(256): - t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256 - ls[i], ls[t] = ls[t], ls[i] - s = bytearray() - x, y = 0, 0 - for i in range(len(s2)): - y = (y + 1) % 
256 - x = (x + ls[y]) % 256 - ls[x], ls[y] = ls[y], ls[x] - s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256]) - return bytes(s) - - sid, token = yk_t( - b'becaf9be', base64.b64decode(data['security']['encrypt_string'].encode('ascii')) - ).decode('ascii').split('_') - - # get oip - oip = data['security']['ip'] - - fileid_dict = {} - for stream in data['stream']: - if stream.get('channel_type') == 'tail': - continue - format = stream.get('stream_type') - fileid = try_get( - stream, lambda x: x['segs'][0]['fileid'], - compat_str) or stream['stream_fileid'] - fileid_dict[format] = fileid - - def get_fileid(format, n): - number = hex(int(str(n), 10))[2:].upper() - if len(number) == 1: - number = '0' + number - streamfileids = fileid_dict[format] - fileid = streamfileids[0:8] + number + streamfileids[10:] - return fileid - - # get ep - def generate_ep(format, n): - fileid = get_fileid(format, n) - ep_t = yk_t( - b'bf7e5f01', - ('%s_%s_%s' % (sid, fileid, token)).encode('ascii') - ) - ep = base64.b64encode(ep_t).decode('ascii') - return ep - - # generate video_urls - video_urls_dict = {} - for stream in data['stream']: - if stream.get('channel_type') == 'tail': - continue - format = stream.get('stream_type') - video_urls = [] - for dt in stream['segs']: - n = str(stream['segs'].index(dt)) - param = { - 'K': dt['key'], - 'hd': self.get_hd(format), - 'myp': 0, - 'ypp': 0, - 'ctype': 12, - 'ev': 1, - 'token': token, - 'oip': oip, - 'ep': generate_ep(format, n) - } - video_url = \ - 'http://k.youku.com/player/getFlvPath/' + \ - 'sid/' + sid + \ - '_00' + \ - '/st/' + self.parse_ext_l(format) + \ - '/fileid/' + get_fileid(format, n) + '?' 
+ \ - compat_urllib_parse_urlencode(param) - video_urls.append(video_url) - video_urls_dict[format] = video_urls - - return video_urls_dict - @staticmethod def get_ysuid(): return '%d%s' % (int(time.time()), ''.join([ random.choice(string.ascii_letters) for i in range(3)])) - def get_hd(self, fm): - hd_id_dict = { - '3gp': '0', - '3gphd': '1', - 'flv': '0', - 'flvhd': '0', - 'mp4': '1', - 'mp4hd': '1', - 'mp4hd2': '1', - 'mp4hd3': '1', - 'hd2': '2', - 'hd3': '3', - } - return hd_id_dict[fm] - - def parse_ext_l(self, fm): - ext_dict = { - '3gp': 'flv', - '3gphd': 'mp4', - 'flv': 'flv', - 'flvhd': 'flv', - 'mp4': 'mp4', - 'mp4hd': 'mp4', - 'mp4hd2': 'flv', - 'mp4hd3': 'flv', - 'hd2': 'flv', - 'hd3': 'flv', - } - return ext_dict[fm] - def get_format_name(self, fm): _dict = { '3gp': 'h6', @@ -210,32 +138,40 @@ class YoukuIE(InfoExtractor): 'hd2': 'h2', 'hd3': 'h1', } - return _dict[fm] + return _dict.get(fm) def _real_extract(self, url): video_id = self._match_id(url) self._set_cookie('youku.com', '__ysuid', self.get_ysuid()) + self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') - def retrieve_data(req_url, note): - headers = { - 'Referer': req_url, - } - headers.update(self.geo_verification_headers()) - self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') + _, urlh = self._download_webpage_handle( + 'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info') + # The etag header is '"foobar"'; let's remove the double quotes + cna = urlh.headers['etag'][1:-1] - raw_data = self._download_json(req_url, video_id, note=note, headers=headers) - - return raw_data['data'] + # request basic data + basic_data_params = { + 'vid': video_id, + 'ccode': '0402' if 'tudou.com' in url else '0401', + 'client_ip': '192.168.1.1', + 'utid': cna, + 'client_ts': time.time() / 1000, + } video_password = self._downloader.params.get('videopassword') - - # request basic data - basic_data_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % video_id if 
video_password: - basic_data_url += '&pwd=%s' % video_password + basic_data_params['password'] = video_password - data = retrieve_data(basic_data_url, 'Downloading JSON metadata') + headers = { + 'Referer': url, + } + headers.update(self.geo_verification_headers()) + data = self._download_json( + 'https://ups.youku.com/ups/get.json', video_id, + 'Downloading JSON metadata', + query=basic_data_params, headers=headers)['data'] error = data.get('error') if error: @@ -253,86 +189,111 @@ class YoukuIE(InfoExtractor): raise ExtractorError(msg) # get video title - title = data['video']['title'] - - # generate video_urls_dict - video_urls_dict = self.construct_video_urls(data) - - # construct info - entries = [{ - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - 'formats': [], - # some formats are not available for all parts, we have to detect - # which one has all - } for i in range(max(len(v.get('segs')) for v in data['stream']))] - for stream in data['stream']: - if stream.get('channel_type') == 'tail': - continue - fm = stream.get('stream_type') - video_urls = video_urls_dict[fm] - for video_url, seg, entry in zip(video_urls, stream['segs'], entries): - entry['formats'].append({ - 'url': video_url, - 'format_id': self.get_format_name(fm), - 'ext': self.parse_ext_l(fm), - 'filesize': int(seg['size']), - 'width': stream.get('width'), - 'height': stream.get('height'), - }) + video_data = data['video'] + title = video_data['title'] + + formats = [{ + 'url': stream['m3u8_url'], + 'format_id': self.get_format_name(stream.get('stream_type')), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'filesize': int(stream.get('size')), + 'width': stream.get('width'), + 'height': stream.get('height'), + } for stream in data['stream'] if stream.get('channel_type') != 'tail'] + self._sort_formats(formats) return { - '_type': 'multi_video', 'id': video_id, 'title': title, - 'entries': entries, + 'formats': formats, + 'duration': video_data.get('seconds'), + 'thumbnail': 
video_data.get('logo'), + 'uploader': video_data.get('username'), + 'uploader_id': str_or_none(video_data.get('userid')), + 'uploader_url': data.get('uploader', {}).get('homepage'), + 'tags': video_data.get('tags'), } class YoukuShowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html' + _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' IE_NAME = 'youku:show' - _TEST = { - 'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html', + _TESTS = [{ + 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', 'info_dict': { 'id': 'zc7c670be07ff11e48b3f', - 'title': '花千骨 未删减版', - 'description': 'md5:578d4f2145ae3f9128d9d4d863312910', + 'title': '花千骨 DVD版', + 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', }, 'playlist_count': 50, - } - - _PAGE_SIZE = 40 + }, { + # Episode number not starting from 1 + 'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html', + 'info_dict': { + 'id': 'zefbfbd70efbfbd780bef', + 'title': '超级飞侠3', + 'description': 'md5:275715156abebe5ccc2a1992e9d56b98', + }, + 'playlist_count': 24, + }, { + # Ongoing playlist. 
The initial page is the last one + 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', + 'only_matchine': True, + }] - def _find_videos_in_page(self, webpage): - videos = re.findall( - r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage) - return [ - self.url_result(video_url, YoukuIE.ie_key(), title) - for video_url, title in videos] + def _extract_entries(self, playlist_data_url, show_id, note, query): + query['callback'] = 'cb' + playlist_data = self._download_json( + playlist_data_url, show_id, query=query, note=note, + transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] + drama_list = (get_element_by_class('p-drama-grid', playlist_data) or + get_element_by_class('p-drama-half-row', playlist_data)) + if drama_list is None: + raise ExtractorError('No episodes found') + video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list) + return playlist_data, [ + self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key()) + for video_url in video_urls] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(url, show_id) - entries = self._find_videos_in_page(webpage) - - playlist_title = self._html_search_regex( - r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False) - detail_div = get_element_by_attribute('class', 'detail', webpage) or '' - playlist_description = self._html_search_regex( - r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>', - detail_div, 'playlist description', fatal=False) - - for idx in itertools.count(1): - episodes_page = self._download_webpage( - 'http://www.youku.com/show_episode/id_%s.html' % show_id, - show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)}, - note='Downloading episodes page %d' % idx) - new_entries = self._find_videos_in_page(episodes_page) + entries = [] + page_config = self._parse_json(self._search_regex( + r'var\s+PageConfig\s*=\s*({.+});', webpage, 
'page config'), + show_id, transform_source=js_to_json) + first_page, initial_entries = self._extract_entries( + 'http://list.youku.com/show/module', show_id, + note='Downloading initial playlist data page', + query={ + 'id': page_config['showid'], + 'tab': 'showInfo', + }) + first_page_reload_id = self._html_search_regex( + r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id') + # The first reload_id has the same items as first_page + reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page) + for idx, reload_id in enumerate(reload_ids): + if reload_id == first_page_reload_id: + entries.extend(initial_entries) + continue + _, new_entries = self._extract_entries( + 'http://list.youku.com/show/episode', show_id, + note='Downloading playlist data page %d' % (idx + 1), + query={ + 'id': page_config['showid'], + 'stage': reload_id, + }) entries.extend(new_entries) - if len(new_entries) < self._PAGE_SIZE: - break - return self.playlist_result(entries, show_id, playlist_title, playlist_description) + desc = self._html_search_meta('description', webpage, fatal=False) + playlist_title = desc.split(',')[0] if desc else None + detail_li = get_element_by_class('p-intro', webpage) + playlist_description = get_element_by_class( + 'intro-more', detail_li) if detail_li else None + + return self.playlist_result( + entries, show_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 34ab878a4..547adefeb 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, sanitized_Request, @@ -26,7 +27,7 @@ class YouPornIE(InfoExtractor): 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ask Dan And Jennifer', 
- 'upload_date': '20101221', + 'upload_date': '20101217', 'average_rating': int, 'view_count': int, 'comment_count': int, @@ -45,7 +46,7 @@ class YouPornIE(InfoExtractor): 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Unknown', - 'upload_date': '20111125', + 'upload_date': '20110418', 'average_rating': int, 'view_count': int, 'comment_count': int, @@ -68,28 +69,46 @@ class YouPornIE(InfoExtractor): webpage = self._download_webpage(request, display_id) title = self._search_regex( - [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1', - r'<h1[^>]+class=["\']heading\d?["\'][^>]*>([^<])<'], - webpage, 'title', group='title') + [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'], + webpage, 'title', group='title', + default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, fatal=True) links = [] + # Main source + definitions = self._parse_json( + self._search_regex( + r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, + 'media definitions', default='[]'), + video_id, fatal=False) + if definitions: + for definition in definitions: + if not isinstance(definition, dict): + continue + video_url = definition.get('videoUrl') + if isinstance(video_url, compat_str) and video_url: + links.append(video_url) + + # Fallback #1, this also contains extra low quality 180p format + for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + links.append(link) + + # Fallback #2 (unavailable as at 22.06.2017) sources = self._search_regex( r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None) if sources: for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): links.append(link) - # Fallback #1 + # Fallback #3 (unavailable as at 22.06.2017) for _, link in re.findall( - 
r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): - links.append(link) - - # Fallback #2, this also contains extra low quality 180p format - for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): links.append(link) - # Fallback #3, encrypted links + # Fallback #4, encrypted links (unavailable as at 22.06.2017) for _, encrypted_link in re.findall( r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) @@ -124,7 +143,8 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>', + [r'Date\s+[Aa]dded:\s*<span>([^<]+)', + r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], webpage, 'upload date', fatal=False)) age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 480f403da..ad2e933ee 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -38,7 +39,6 @@ from ..utils import ( parse_duration, remove_quotes, remove_start, - sanitized_Request, smuggle_url, str_to_int, try_get, @@ -54,7 +54,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - _PASSWORD_CHALLENGE_URL = 
'https://accounts.google.com/signin/challenge/sl/password' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -96,74 +100,157 @@ class YoutubeBaseInfoExtractor(InfoExtractor): login_form = self._hidden_inputs(login_page) - login_form.update({ - 'checkConnection': 'youtube', - 'Email': username, - 'Passwd': password, - }) + def req(url, f_req, note, errnote): + data = login_form.copy() + data.update({ + 'pstMsg': 1, + 'checkConnection': 'youtube', + 'checkedDomains': 'youtube', + 'hl': 'en', + 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', + 'f.req': json.dumps(f_req), + 'flowName': 'GlifWebSignIn', + 'flowEntry': 'ServiceLogin', + }) + return self._download_json( + url, None, note=note, errnote=errnote, + transform_source=lambda s: re.sub(r'^[^[]*', '', s), + fatal=False, + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', + 'Google-Accounts-XSRF': 1, + }) - login_results = self._download_webpage( - self._PASSWORD_CHALLENGE_URL, None, - note='Logging in', errnote='unable to log in', fatal=False, - data=urlencode_postdata(login_form)) - if login_results is False: - return False + def warn(message): + self._downloader.report_warning(message) + + lookup_req = [ + username, + None, [], None, 'US', None, None, 2, False, True, + [ + None, None, + [2, 1, None, 1, + 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', + None, [], 4], + 1, [None, None, []], 
None, None, None, True + ], + username, + ] - error_msg = self._html_search_regex( - r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', - login_results, 'error message', default=None) - if error_msg: - raise ExtractorError('Unable to login: %s' % error_msg, expected=True) + lookup_results = req( + self._LOOKUP_URL, lookup_req, + 'Looking up account info', 'Unable to look up account info') - if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + if lookup_results is False: + return False - # Two-Factor - # TODO add SMS and phone call support - these require making a request and then prompting the user + user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) + if not user_hash: + warn('Unable to extract user hash') + return False - if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None: - tfa_code = self._get_tfa_info('2-step verification code') + challenge_req = [ + user_hash, + None, 1, None, [1, None, None, None, [password, None, True]], + [ + None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], + 1, [None, None, []], None, None, None, True + ]] - if not tfa_code: - self._downloader.report_warning( - 'Two-factor authentication required. 
Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') - tfa_code = remove_start(tfa_code, 'G-') + if challenge_results is False: + return - tfa_form_strs = self._form_hidden_inputs('challenge', login_results) + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False - tfa_form_strs.update({ - 'Pin': tfa_code, - 'TrustDevice': 'on', - }) + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False - tfa_data = urlencode_postdata(tfa_form_strs) + tfa = try_get(res, lambda x: x[0][0], list) + if tfa: + tfa_str = try_get(tfa, lambda x: x[2], compat_str) + if tfa_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(tfa, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + tfa_code = self._get_tfa_info('2-step verification code') + + if not tfa_code: + warn( + 'Two-factor authentication required. 
Provide it either interactively or with --twofactor <code>' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False + + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) - tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) - tfa_results = self._download_webpage( - tfa_req, None, - note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False - if tfa_results is False: - return False + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) - if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') - return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning('unable to log in - did the page structure change?') - return False - if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning('Your Google account has a security notice. 
Please log in on your web browser, resolve the notice, and try again.') - return False + if check_cookie_results is False: + return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning('unable to log in: bad username or password') + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') return False + return True + def _download_webpage(self, *args, **kwargs): + kwargs.setdefault('query', {})['disable_polymer'] = 'true' + return super(YoutubeBaseInfoExtractor, self)._download_webpage( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -592,6 +679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator { 'url': '__2ABJjxzNo', 'info_dict': { @@ -922,6 +1010,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ], }, { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. 
+ 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + 'license': 'Standard YouTube License', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, + { # itag 212 'url': '1t24XAntNCY', 'only_matching': True, @@ -1188,37 +1297,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sub_lang_list[sub_lang] = sub_formats return sub_lang_list + def make_captions(sub_url, sub_langs): + parsed_sub_url = compat_urllib_parse_urlparse(sub_url) + caption_qs = compat_parse_qs(parsed_sub_url.query) + captions = {} + for sub_lang in sub_langs: + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], + }) + sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( + query=compat_urllib_parse_urlencode(caption_qs, True))) + sub_formats.append({ + 'url': sub_url, + 'ext': ext, + }) + captions[sub_lang] = sub_formats + return captions + + # New captions format as of 22.06.2017 + player_response = args.get('player_response') + if player_response and isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + if player_response: + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + base_url = renderer['captionTracks'][0]['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + # Some videos don't provide ttsurl but rather caption_tracks and # caption_translation_languages (e.g. 
20LmZk1hakA) + # Does not used anymore as of 22.06.2017 caption_tracks = args['caption_tracks'] caption_translation_languages = args['caption_translation_languages'] caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - parsed_caption_url = compat_urllib_parse_urlparse(caption_url) - caption_qs = compat_parse_qs(parsed_caption_url.query) - - sub_lang_list = {} + sub_lang_list = [] for lang in caption_translation_languages.split(','): lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) sub_lang = lang_qs.get('lc', [None])[0] - if not sub_lang: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles - except (KeyError, ExtractorError): + except (KeyError, IndexError, ExtractorError): self._downloader.report_warning(err_msg) return {} @@ -1245,6 +1374,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/.+?) 
+ \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1257,6 +1423,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + @staticmethod + def _extract_chapters(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + if start_time > duration: + break + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + if end_time > duration: + end_time = duration + if start_time > end_time: + break + chapter_title = re.sub( + r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters + def _real_extract(self, url): url, 
smuggled_data = unsmuggle_url(url, {}) @@ -1300,9 +1501,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + # Get video info embed_webpage = None - is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -1325,6 +1531,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: age_gate = False video_info = None + sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: @@ -1341,6 +1548,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True + sts = ytplayer_config.get('sts') if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1349,17 +1557,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # The general idea is to take a union of itags of both DASH manifests (for example # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) self.report_video_info_webpage_download(video_id) - for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) + for el in ('info', 'embedded', 'detailpage', 'vevo', ''): + query = { + 'video_id': video_id, + 'ps': 'default', + 'eurl': '', + 'gl': 'US', + 'hl': 'en', + } + if el: + query['el'] = el + if sts: + 
query['sts'] = sts video_info_webpage = self._download_webpage( - video_info_url, + '%s://www.youtube.com/get_video_info' % proto, video_id, note=False, - errnote='unable to download video info webpage') + errnote='unable to download video info webpage', + fatal=False, query=query) + if not video_info_webpage: + continue get_video_info = compat_parse_qs(video_info_webpage) - if get_video_info.get('use_cipher_signature') != ['True']: - add_dash_mpd(get_video_info) + add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) if not video_info: video_info = get_video_info if 'token' in get_video_info: @@ -1399,9 +1619,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_title = '_' # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + description_original = video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? 
(?:title|href)="([^"]+)"\s+ @@ -1443,10 +1663,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self.playlist_result(entries, video_id, video_title, video_description) self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if 'view_count' in video_info: - view_count = int(video_info['view_count'][0]) - else: - view_count = None + if view_count is None: + view_count = extract_view_count(video_info) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1490,10 +1708,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not upload_date: upload_date = self._search_regex( [r'(?s)id="eow-date.*?>(.*?)</span>', - r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) - if upload_date: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -1501,7 +1717,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage, 'license', default=None) m_music = re.search( - r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? 
+ )?</li + ''', video_webpage) if m_music: video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) @@ -1558,6 +1788,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('writeannotations', False): video_annotations = self._extract_annotations(video_id) + chapters = self._extract_chapters(description_original, video_duration) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1591,12 +1823,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): format_id = url_data['itag'][0] url = url_data['url'][0] - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, @@ -1617,6 +1845,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage, 'age gate player URL') player_url = json.loads(player_url_json) + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + if self._downloader.params.get('verbose'): if player_url is None: player_version = 'unknown' @@ -1790,6 +2023,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, @@ -1861,7 +2095,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): | (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = 
r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 24cdec28c..7bda59610 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -6,6 +6,7 @@ import re from .utils import ( ExtractorError, + remove_quotes, ) _OPERATORS = [ @@ -57,7 +58,6 @@ class JSInterpreter(object): def interpret_expression(self, expr, local_vars, allow_recursion): expr = expr.strip() - if expr == '': # Empty expression return None @@ -121,11 +121,19 @@ class JSInterpreter(object): pass m = re.match( - r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, + r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) + if m: + val = local_vars[m.group('in')] + idx = self.interpret_expression( + m.group('idx'), local_vars, allow_recursion - 1) + return val[idx] + + m = re.match( + r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, expr) if m: variable = m.group('var') - member = m.group('member') + member = remove_quotes(m.group('member') or m.group('member2')) arg_str = m.group('args') if variable in local_vars: @@ -173,14 +181,6 @@ class JSInterpreter(object): return obj[member](argvals) - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: - val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) - return val[idx] - for op, opfunc in _OPERATORS: m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) if not m: @@ -211,21 +211,25 @@ class JSInterpreter(object): raise ExtractorError('Unsupported JS expression %r' % expr) def extract_object(self, objname): + _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} obj_m = re.search( - (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) + - r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' + 
- r'\}\s*;', + r'''(?x) + (?<!this\.)%s\s*=\s*{\s* + (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) + }\s*; + ''' % (re.escape(objname), _FUNC_NAME_RE), self.code) fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( - r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function' - r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', + r'''(?x) + (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} + ''' % _FUNC_NAME_RE, fields) for f in fields_m: argnames = f.group('args').split(',') - obj[f.group('key')] = self.build_function(argnames, f.group('code')) + obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) return obj diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 52309fb84..38439c971 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -20,6 +20,24 @@ from .utils import ( from .version import __version__ +def _hide_login_info(opts): + PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']) + eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') + + def _scrub_eq(o): + m = eqre.match(o) + if m: + return m.group('key') + '=PRIVATE' + else: + return o + + opts = list(map(_scrub_eq, opts)) + for idx, opt in enumerate(opts): + if opt in PRIVATE_OPTS and idx + 1 < len(opts): + opts[idx + 1] = 'PRIVATE' + return opts + + def parseOpts(overrideArguments=None): def _readOptions(filename_bytes, default=[]): try: @@ -93,26 +111,6 @@ def parseOpts(overrideArguments=None): def _comma_separated_values_options_callback(option, opt_str, value, parser): setattr(parser.values, option.dest, value.split(',')) - def _hide_login_info(opts): - PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'] - eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') - - def _scrub_eq(o): - m = 
eqre.match(o) - if m: - return m.group('key') + '=PRIVATE' - else: - return o - - opts = list(map(_scrub_eq, opts)) - for private_opt in PRIVATE_OPTS: - try: - i = opts.index(private_opt) - opts[i + 1] = 'PRIVATE' - except ValueError: - pass - return opts - # No need to wrap help messages if we're on a wide console columns = compat_get_terminal_size().columns max_width = columns if columns else 80 @@ -310,7 +308,7 @@ def parseOpts(overrideArguments=None): metavar='FILTER', dest='match_filter', default=None, help=( 'Generic video filter. ' - 'Specify any key (see help for -o for a list of available keys) to ' + 'Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) to ' 'match if the key is present, ' '!key to check if the key is not present, ' 'key > NUMBER (like "comment_count > 12", also works with ' @@ -618,7 +616,7 @@ def parseOpts(overrideArguments=None): verbosity.add_option( '-j', '--dump-json', action='store_true', dest='dumpjson', default=False, - help='Simulate, quiet but print JSON information. See --output for a description of available keys.') + help='Simulate, quiet but print JSON information. See the "OUTPUT TEMPLATE" for a description of available keys.') verbosity.add_option( '-J', '--dump-single-json', action='store_true', dest='dump_single_json', default=False, @@ -814,11 +812,12 @@ def parseOpts(overrideArguments=None): '--metadata-from-title', metavar='FORMAT', dest='metafromtitle', help='Parse additional metadata like song title / artist from the video title. ' - 'The format syntax is the same as --output, ' - 'the parsed parameters replace existing values. ' - 'Additional templates: %(album)s, %(artist)s. ' + 'The format syntax is the same as --output. Regular expression with ' + 'named capture groups may also be used. ' + 'The parsed parameters replace existing values. ' 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' - '"Coldplay - Paradise"') + '"Coldplay - Paradise". 
' + 'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"') postproc.add_option( '--xattrs', action='store_true', dest='xattrs', default=False, diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py index 90630c2d7..64dabe790 100644 --- a/youtube_dl/postprocessor/execafterdownload.py +++ b/youtube_dl/postprocessor/execafterdownload.py @@ -4,7 +4,10 @@ import subprocess from .common import PostProcessor from ..compat import compat_shlex_quote -from ..utils import PostProcessingError +from ..utils import ( + encodeArgument, + PostProcessingError, +) class ExecAfterDownloadPP(PostProcessor): @@ -20,7 +23,7 @@ class ExecAfterDownloadPP(PostProcessor): cmd = cmd.replace('{}', compat_shlex_quote(information['filepath'])) self._downloader.to_screen('[exec] Executing command: %s' % cmd) - retCode = subprocess.call(cmd, shell=True) + retCode = subprocess.call(encodeArgument(cmd), shell=True) if retCode != 0: raise PostProcessingError( 'Command returned error code %d' % retCode) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index c91ec8588..51256a3fb 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -444,7 +444,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor): chapters = info.get('chapters', []) if chapters: - metadata_filename = encodeFilename(replace_extension(filename, 'meta')) + metadata_filename = replace_extension(filename, 'meta') with io.open(metadata_filename, 'wt', encoding='utf-8') as f: def ffmpeg_escape(text): return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text) @@ -542,7 +542,7 @@ class FFmpegFixupM3u8PP(FFmpegPostProcessor): temp_filename = prepend_extension(filename, 'temp') options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename) + self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % 
filename) self.run_ffmpeg(filename, temp_filename, options) os.remove(encodeFilename(filename)) diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index a7d637a3c..f5c14d974 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -9,7 +9,9 @@ class MetadataFromTitlePP(PostProcessor): def __init__(self, downloader, titleformat): super(MetadataFromTitlePP, self).__init__(downloader) self._titleformat = titleformat - self._titleregex = self.format_to_regex(titleformat) + self._titleregex = (self.format_to_regex(titleformat) + if re.search(r'%\(\w+\)s', titleformat) + else titleformat) def format_to_regex(self, fmt): r""" @@ -33,11 +35,14 @@ class MetadataFromTitlePP(PostProcessor): title = info['title'] match = re.match(self._titleregex, title) if match is None: - self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat) + self._downloader.to_screen( + '[fromtitle] Could not interpret title of video as "%s"' + % self._titleformat) return [], info for attribute, value in match.groupdict().items(): - value = match.group(attribute) info[attribute] = value - self._downloader.to_screen('[fromtitle] parsed ' + attribute + ': ' + value) + self._downloader.to_screen( + '[fromtitle] parsed %s: %s' + % (attribute, value if value is not None else 'NA')) return [], info diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4d0685d83..9e4492d40 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -22,7 +22,6 @@ import locale import math import operator import os -import pipes import platform import random import re @@ -36,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParseError, compat_HTMLParser, compat_basestring, compat_chr, @@ -365,9 +365,9 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True): retlist = [] for m in re.finditer(r'''(?xs) 
<([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? \s*> (?P<content>.*?) </\1> @@ -409,8 +409,12 @@ def extract_attributes(html_element): but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. """ parser = HTMLAttributeParser() - parser.feed(html_element) - parser.close() + try: + parser.feed(html_element) + parser.close() + # Older Python may throw HTMLParseError in case of malformed HTML + except compat_HTMLParseError: + pass return parser.attrs @@ -592,7 +596,7 @@ def unescapeHTML(s): assert type(s) == compat_str return re.sub( - r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) def get_subprocess_encoding(): @@ -932,14 +936,6 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): except zlib.error: return zlib.decompress(data) - @staticmethod - def addinfourl_wrapper(stream, headers, url, code): - if hasattr(compat_urllib_request.addinfourl, 'getcode'): - return compat_urllib_request.addinfourl(stream, headers, url, code) - ret = compat_urllib_request.addinfourl(stream, headers, url) - ret.code = code - return ret - def http_request(self, req): # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not # always respected by websites, some tend to give out URLs with non percent-encoded @@ -991,13 +987,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): break else: raise original_ioerror - resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) + resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # deflate if 
resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see @@ -1187,7 +1183,7 @@ def unified_timestamp(date_str, day_first=True): if date_str is None: return None - date_str = date_str.replace(',', ' ') + date_str = re.sub(r'[,|]', '', date_str) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) @@ -1538,7 +1534,7 @@ def shell_quote(args): if isinstance(a, bytes): # We may get a filename encoded with 'encodeFilename' a = a.decode(encoding) - quoted_args.append(pipes.quote(a)) + quoted_args.append(compat_shlex_quote(a)) return ' '.join(quoted_args) @@ -1819,6 +1815,10 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default +def bool_or_none(v, default=None): + return v if isinstance(v, bool) else default + + def strip_or_none(v): return None if v is None else v.strip() @@ -2098,7 +2098,7 @@ def update_Request(req, url=None, data=None, headers={}, query={}): return new_req -def try_multipart_encode(data, boundary): +def _multipart_encode_impl(data, boundary): content_type = 'multipart/form-data; boundary=%s' % boundary out = b'' @@ -2110,7 +2110,7 @@ def try_multipart_encode(data, boundary): v = v.encode('utf-8') # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 # suggests sending UTF-8 directly. 
Firefox sends UTF-8, too - content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n' + content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' if boundary.encode('ascii') in content: raise ValueError('Boundary overlaps with data') out += content @@ -2140,7 +2140,7 @@ def multipart_encode(data, boundary=None): boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) try: - out, content_type = try_multipart_encode(data, boundary) + out, content_type = _multipart_encode_impl(data, boundary) break except ValueError: if has_specified_boundary: @@ -2211,7 +2211,12 @@ def parse_age_limit(s): def strip_jsonp(code): return re.sub( - r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) + r'''(?sx)^ + (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+) + (?:\s*&&\s*(?P=func_name))? + \s*\(\s*(?P<callback_data>.*)\);? + \s*?(?://[^\n]*)*$''', + r'\g<callback_data>', code) def js_to_json(code): @@ -2360,11 +2365,11 @@ def parse_codecs(codecs_str): if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): if not vcodec: vcodec = full_codec - elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'): + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): if not acodec: acodec = full_codec else: - write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr) + write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) if not vcodec and not acodec: if len(splited_codecs) == 2: return { @@ -2732,6 +2737,8 @@ def cli_option(params, command_option, param): def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): param = params.get(param) + if param is None: + return [] assert isinstance(param, bool) if separator: return [command_option + separator + (true_value if param else false_value)] diff --git a/youtube_dl/version.py 
b/youtube_dl/version.py index c19ac49b0..8399c04fe 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.05.01' +__version__ = '2017.09.15' |