Diffstat (limited to 'youtube_dl')
83 files changed, 1536 insertions, 679 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 7953670a7..eb465c425 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -640,7 +640,7 @@ class YoutubeDL(object):
         NUMERIC_FIELDS = set((
             'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps',
             'filesize', 'filesize_approx',
-            'upload_year', 'upload_month', 'upload_day',
+            'timestamp', 'upload_year', 'upload_month', 'upload_day',
             'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
             'average_rating', 'comment_count', 'age_limit',
             'start_time', 'end_time',
@@ -672,8 +672,7 @@ class YoutubeDL(object):
                     FORMAT_RE.format(numeric_field),
                     r'%({0})s'.format(numeric_field), outtmpl)

-            tmpl = expand_path(outtmpl)
-            filename = tmpl % template_dict
+            filename = expand_path(outtmpl % template_dict)
             # Temporary fix for #4787
             # 'Treat' all problem characters by passing filename through preferredencoding
             # to workaround encoding issues with subprocess on python2 @ Windows
@@ -851,7 +850,14 @@ class YoutubeDL(object):
             new_result = info.copy()
             new_result.update(force_properties)

-            assert new_result.get('_type') != 'url_transparent'
+            # Extracted info may not be a video result (i.e.
+            # info.get('_type', 'video') != video) but rather an url or
+            # url_transparent. In such cases outer metadata (from ie_result)
+            # should be propagated to inner one (info). For this to happen
+            # _type of info should be overridden with url_transparent. This
+            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
+            if new_result.get('_type') == 'url':
+                new_result['_type'] = 'url_transparent'

             return self.process_ie_result(
                 new_result, download=download, extra_info=extra_info)
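The output-template hunk above also reorders filename construction: the %-style fields are now substituted first, and expand_path (the youtube_dl.utils helper that resolves '~' and environment variables) runs on the finished string. A minimal sketch of the new order; the template and field dict are illustrative:

    from youtube_dl.utils import expand_path

    template_dict = {'title': 'Some Video', 'ext': 'mp4'}
    outtmpl = '~/Downloads/%(title)s.%(ext)s'

    # old order: expand_path(outtmpl) % template_dict
    # new order: substitute the fields first, then expand '~' and env vars
    filename = expand_path(outtmpl % template_dict)
    print(filename)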
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index f15606568..c4589411e 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -343,6 +343,7 @@ def _real_main(argv=None):
         'retries': opts.retries,
         'fragment_retries': opts.fragment_retries,
         'skip_unavailable_fragments': opts.skip_unavailable_fragments,
+        'keep_fragments': opts.keep_fragments,
         'buffersize': opts.buffersize,
         'noresizebuffer': opts.noresizebuffer,
         'continuedl': opts.continue_dl,
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 2c4470a95..5d6621147 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -187,6 +187,9 @@ class FileDownloader(object):
             return filename[:-len('.part')]
         return filename

+    def ytdl_filename(self, filename):
+        return filename + '.ytdl'
+
     def try_rename(self, old_filename, new_filename):
         try:
             if old_filename == new_filename:
@@ -327,21 +330,22 @@ class FileDownloader(object):
             os.path.exists(encodeFilename(filename))
         )

-        continuedl_and_exists = (
-            self.params.get('continuedl', True) and
-            os.path.isfile(encodeFilename(filename)) and
-            not self.params.get('nopart', False)
-        )
-
-        # Check file already present
-        if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
-            self.report_file_already_downloaded(filename)
-            self._hook_progress({
-                'filename': filename,
-                'status': 'finished',
-                'total_bytes': os.path.getsize(encodeFilename(filename)),
-            })
-            return True
+        if not hasattr(filename, 'write'):
+            continuedl_and_exists = (
+                self.params.get('continuedl', True) and
+                os.path.isfile(encodeFilename(filename)) and
+                not self.params.get('nopart', False)
+            )
+
+            # Check file already present
+            if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
+                self.report_file_already_downloaded(filename)
+                self._hook_progress({
+                    'filename': filename,
+                    'status': 'finished',
+                    'total_bytes': os.path.getsize(encodeFilename(filename)),
+                })
+                return True

         min_sleep_interval = self.params.get('sleep_interval')
         if min_sleep_interval:
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
index e2ddc369e..7491fdad8 100644
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -1,13 +1,7 @@
 from __future__ import unicode_literals

-import os
-
 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
-from ..utils import (
-    sanitize_open,
-    encodeFilename,
-)


 class DashSegmentsFD(FragmentFD):
@@ -28,31 +22,24 @@ class DashSegmentsFD(FragmentFD):

         self._prepare_and_start_frag_download(ctx)

-        segments_filenames = []
-
         fragment_retries = self.params.get('fragment_retries', 0)
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)

-        def process_segment(segment, tmp_filename, num):
-            segment_url = segment['url']
-            segment_name = 'Frag%d' % num
-            target_filename = '%s-%s' % (tmp_filename, segment_name)
+        frag_index = 0
+        for i, segment in enumerate(segments):
+            frag_index += 1
+            if frag_index <= ctx['fragment_index']:
+                continue
             # In DASH, the first segment contains necessary headers to
             # generate a valid MP4 file, so always abort for the first segment
-            fatal = num == 0 or not skip_unavailable_fragments
+            fatal = i == 0 or not skip_unavailable_fragments
             count = 0
             while count <= fragment_retries:
                 try:
-                    success = ctx['dl'].download(target_filename, {
-                        'url': segment_url,
-                        'http_headers': info_dict.get('http_headers'),
-                    })
+                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
                     if not success:
                         return False
-                    down, target_sanitized = sanitize_open(target_filename, 'rb')
-                    ctx['dest_stream'].write(down.read())
-                    down.close()
-                    segments_filenames.append(target_sanitized)
+                    self._append_fragment(ctx, frag_content)
                     break
                 except compat_urllib_error.HTTPError as err:
                     # YouTube may often return 404 HTTP error for a fragment causing the
@@ -63,22 +50,14 @@ class DashSegmentsFD(FragmentFD):
                     # HTTP error.
                     count += 1
                     if count <= fragment_retries:
-                        self.report_retry_fragment(err, segment_name, count, fragment_retries)
+                        self.report_retry_fragment(err, frag_index, count, fragment_retries)
             if count > fragment_retries:
                 if not fatal:
-                    self.report_skip_fragment(segment_name)
-                    return True
+                    self.report_skip_fragment(frag_index)
+                    continue
                 self.report_error('giving up after %s fragment retries' % fragment_retries)
                 return False
-            return True
-
-        for i, segment in enumerate(segments):
-            if not process_segment(segment, ctx['tmpfilename'], i):
-                return False

         self._finish_frag_download(ctx)

-        for segment_file in segments_filenames:
-            os.remove(encodeFilename(segment_file))
-
         return True
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index e13cf547d..e78169a0d 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -29,7 +29,17 @@ class ExternalFD(FileDownloader):
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)

-        retval = self._call_downloader(tmpfilename, info_dict)
+        try:
+            retval = self._call_downloader(tmpfilename, info_dict)
+        except KeyboardInterrupt:
+            if not info_dict.get('is_live'):
+                raise
+            # Live stream downloading cancellation should be considered as
+            # correct and expected termination thus all postprocessing
+            # should take place
+            retval = 0
+            self.to_screen('[%s] Interrupted by user' % self.get_basename())
+
         if retval == 0:
             fsize = os.path.getsize(encodeFilename(tmpfilename))
             self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 688e086eb..c8fde9a89 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -3,7 +3,6 @@ from __future__ import division, unicode_literals
 import base64
 import io
 import itertools
-import os
 import time

 from .fragment import FragmentFD
@@ -16,9 +15,7 @@ from ..compat import (
     compat_struct_unpack,
 )
 from ..utils import (
-    encodeFilename,
     fix_xml_ampersands,
-    sanitize_open,
     xpath_text,
 )

@@ -366,17 +363,21 @@ class F4mFD(FragmentFD):

         dest_stream = ctx['dest_stream']

-        write_flv_header(dest_stream)
-        if not live:
-            write_metadata_tag(dest_stream, metadata)
+        if ctx['complete_frags_downloaded_bytes'] == 0:
+            write_flv_header(dest_stream)
+            if not live:
+                write_metadata_tag(dest_stream, metadata)

         base_url_parsed = compat_urllib_parse_urlparse(base_url)

         self._start_frag_download(ctx)

-        frags_filenames = []
+        frag_index = 0
         while fragments_list:
             seg_i, frag_i = fragments_list.pop(0)
+            frag_index += 1
+            if frag_index <= ctx['fragment_index']:
+                continue
             name = 'Seg%d-Frag%d' % (seg_i, frag_i)
             query = []
             if base_url_parsed.query:
@@ -386,17 +387,10 @@ class F4mFD(FragmentFD):
             if info_dict.get('extra_param_to_segment_url'):
                 query.append(info_dict['extra_param_to_segment_url'])
             url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
-            frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
             try:
-                success = ctx['dl'].download(frag_filename, {
-                    'url': url_parsed.geturl(),
-                    'http_headers': info_dict.get('http_headers'),
-                })
+                success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict)
                 if not success:
                     return False
-                (down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
-                down_data = down.read()
-                down.close()
                 reader = FlvReader(down_data)
                 while True:
                     try:
@@ -411,12 +405,8 @@ class F4mFD(FragmentFD):
                             break
                         raise
                     if box_type == b'mdat':
-                        dest_stream.write(box_data)
+                        self._append_fragment(ctx, box_data)
                         break
-                if live:
-                    os.remove(encodeFilename(frag_sanitized))
-                else:
-                    frags_filenames.append(frag_sanitized)
             except (compat_urllib_error.HTTPError, ) as err:
                 if live and (err.code == 404 or err.code == 410):
                     # We didn't keep up with the live window. Continue
@@ -436,7 +426,4 @@ class F4mFD(FragmentFD):

         self._finish_frag_download(ctx)

-        for frag_file in frags_filenames:
-            os.remove(encodeFilename(frag_file))
-
         return True
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index 56f975266..bccc8ecc1 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -2,6 +2,7 @@ from __future__ import division, unicode_literals

 import os
 import time
+import json

 from .common import FileDownloader
 from .http import HttpFD
@@ -28,15 +29,37 @@ class FragmentFD(FileDownloader):
                         and hlsnative only)
     skip_unavailable_fragments:
                         Skip unavailable fragments (DASH and hlsnative only)
+    keep_fragments:     Keep downloaded fragments on disk after downloading is
+                        finished
+
+    For each incomplete fragment download youtube-dl keeps on disk a special
+    bookkeeping file with download state and metadata (in future such files will
+    be used for any incomplete download handled by youtube-dl). This file is
+    used to properly handle resuming, check download file consistency and detect
+    potential errors. The file has a .ytdl extension and represents a standard
+    JSON file of the following format:
+
+    extractor:
+        Dictionary of extractor related data. TBD.
+
+    downloader:
+        Dictionary of downloader related data. May contain following data:
+            current_fragment:
+                Dictionary with current (being downloaded) fragment data:
+                index:  0-based index of current fragment among all fragments
+            fragment_count:
+                Total count of fragments
+
+    This feature is experimental and file format may change in future.
     """

-    def report_retry_fragment(self, err, fragment_name, count, retries):
+    def report_retry_fragment(self, err, frag_index, count, retries):
         self.to_screen(
-            '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...'
-            % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries)))
+            '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...'
+            % (error_to_compat_str(err), frag_index, count, self.format_retries(retries)))

-    def report_skip_fragment(self, fragment_name):
-        self.to_screen('[download] Skipping fragment %s...' % fragment_name)
+    def report_skip_fragment(self, frag_index):
+        self.to_screen('[download] Skipping fragment %d...' % frag_index)

     def _prepare_url(self, info_dict, url):
         headers = info_dict.get('http_headers')
@@ -46,6 +69,51 @@ class FragmentFD(FileDownloader):
         self._prepare_frag_download(ctx)
         self._start_frag_download(ctx)

+    @staticmethod
+    def __do_ytdl_file(ctx):
+        return not ctx['live'] and not ctx['tmpfilename'] == '-'
+
+    def _read_ytdl_file(self, ctx):
+        stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
+        ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
+        stream.close()
+
+    def _write_ytdl_file(self, ctx):
+        frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
+        downloader = {
+            'current_fragment': {
+                'index': ctx['fragment_index'],
+            },
+        }
+        if ctx.get('fragment_count') is not None:
+            downloader['fragment_count'] = ctx['fragment_count']
+        frag_index_stream.write(json.dumps({'downloader': downloader}))
+        frag_index_stream.close()
+
+    def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
+        fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index'])
+        success = ctx['dl'].download(fragment_filename, {
+            'url': frag_url,
+            'http_headers': headers or info_dict.get('http_headers'),
+        })
+        if not success:
+            return False, None
+        down, frag_sanitized = sanitize_open(fragment_filename, 'rb')
+        ctx['fragment_filename_sanitized'] = frag_sanitized
+        frag_content = down.read()
+        down.close()
+        return True, frag_content
+
+    def _append_fragment(self, ctx, frag_content):
+        try:
+            ctx['dest_stream'].write(frag_content)
+        finally:
+            if self.__do_ytdl_file(ctx):
+                self._write_ytdl_file(ctx)
+            if not self.params.get('keep_fragments', False):
+                os.remove(ctx['fragment_filename_sanitized'])
+            del ctx['fragment_filename_sanitized']
+
     def _prepare_frag_download(self, ctx):
         if 'live' not in ctx:
             ctx['live'] = False
@@ -66,11 +134,36 @@ class FragmentFD(FileDownloader):
             }
         )
         tmpfilename = self.temp_name(ctx['filename'])
-        dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb')
+        open_mode = 'wb'
+        resume_len = 0
+
+        # Establish possible resume length
+        if os.path.isfile(encodeFilename(tmpfilename)):
+            open_mode = 'ab'
+            resume_len = os.path.getsize(encodeFilename(tmpfilename))
+
+        # Should be initialized before ytdl file check
+        ctx.update({
+            'tmpfilename': tmpfilename,
+            'fragment_index': 0,
+        })
+
+        if self.__do_ytdl_file(ctx):
+            if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
+                self._read_ytdl_file(ctx)
+            else:
+                self._write_ytdl_file(ctx)
+            if ctx['fragment_index'] > 0:
+                assert resume_len > 0
+
+        dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
+
         ctx.update({
             'dl': dl,
             'dest_stream': dest_stream,
             'tmpfilename': tmpfilename,
+            # Total complete fragments downloaded so far in bytes
+            'complete_frags_downloaded_bytes': resume_len,
         })

     def _start_frag_download(self, ctx):
@@ -79,9 +172,9 @@ class FragmentFD(FileDownloader):
         # hook
         state = {
             'status': 'downloading',
-            'downloaded_bytes': 0,
-            'frag_index': 0,
-            'frag_count': total_frags,
+            'downloaded_bytes': ctx['complete_frags_downloaded_bytes'],
+            'fragment_index': ctx['fragment_index'],
+            'fragment_count': total_frags,
             'filename': ctx['filename'],
             'tmpfilename': ctx['tmpfilename'],
         }
@@ -89,8 +182,6 @@ class FragmentFD(FileDownloader):
         start = time.time()
         ctx.update({
             'started': start,
-            # Total complete fragments downloaded so far in bytes
-            'complete_frags_downloaded_bytes': 0,
             # Amount of fragment's bytes downloaded by the time of the previous
             # frag progress hook invocation
             'prev_frag_downloaded_bytes': 0,
@@ -106,11 +197,12 @@ class FragmentFD(FileDownloader):
             if not ctx['live']:
                 estimated_size = (
                     (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) /
-                    (state['frag_index'] + 1) * total_frags)
+                    (state['fragment_index'] + 1) * total_frags)
                 state['total_bytes_estimate'] = estimated_size

             if s['status'] == 'finished':
-                state['frag_index'] += 1
+                state['fragment_index'] += 1
+                ctx['fragment_index'] = state['fragment_index']
                 state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
                 ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
                 ctx['prev_frag_downloaded_bytes'] = 0
@@ -132,6 +224,10 @@ class FragmentFD(FileDownloader):

     def _finish_frag_download(self, ctx):
         ctx['dest_stream'].close()
+        if self.__do_ytdl_file(ctx):
+            ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename']))
+            if os.path.isfile(ytdl_filename):
+                os.remove(ytdl_filename)
         elapsed = time.time() - ctx['started']
         self.try_rename(ctx['tmpfilename'], ctx['filename'])
         fsize = os.path.getsize(encodeFilename(ctx['filename']))
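The FragmentFD docstring above pins down the .ytdl bookkeeping format, and _write_ytdl_file shows that only the 'downloader' key is emitted in this version. A sketch of inspecting such a file by hand; the path and index value are illustrative:

    import json

    # Example 'video.mp4.ytdl' contents as produced by _write_ytdl_file:
    # {"downloader": {"current_fragment": {"index": 5}, "fragment_count": 120}}
    with open('video.mp4.ytdl') as f:  # hypothetical path
        state = json.load(f)
    # 0-based index of the fragment to resume from (cf. _read_ytdl_file)
    print(state['downloader']['current_fragment']['index'])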
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index d0a5f7ba4..0e29c8a2a 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals

-import os.path
 import re
 import binascii
 try:
@@ -18,8 +17,6 @@ from ..compat import (
     compat_struct_pack,
 )
 from ..utils import (
-    encodeFilename,
-    sanitize_open,
     parse_m3u8_attributes,
     update_url_query,
 )
@@ -103,17 +100,18 @@ class HlsFD(FragmentFD):
         media_sequence = 0
         decrypt_info = {'METHOD': 'NONE'}
         byte_range = {}
-        frags_filenames = []
+        frag_index = 0
         for line in s.splitlines():
             line = line.strip()
             if line:
                 if not line.startswith('#'):
+                    frag_index += 1
+                    if frag_index <= ctx['fragment_index']:
+                        continue
                     frag_url = (
                         line
                         if re.match(r'^https?://', line)
                         else compat_urlparse.urljoin(man_url, line))
-                    frag_name = 'Frag%d' % i
-                    frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
                     if extra_query:
                         frag_url = update_url_query(frag_url, extra_query)
                     count = 0
@@ -122,15 +120,10 @@
                         headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'])
                     while count <= fragment_retries:
                         try:
-                            success = ctx['dl'].download(frag_filename, {
-                                'url': frag_url,
-                                'http_headers': headers,
-                            })
+                            success, frag_content = self._download_fragment(
+                                ctx, frag_url, info_dict, headers)
                             if not success:
                                 return False
-                            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
-                            frag_content = down.read()
-                            down.close()
                             break
                         except compat_urllib_error.HTTPError as err:
                             # Unavailable (possibly temporary) fragments may be served.
                             # First we try to retry then either skip or abort.
                             # See https://github.com/rg3/youtube-dl/issues/10165,
                             # https://github.com/rg3/youtube-dl/issues/10448).
                             count += 1
                             if count <= fragment_retries:
-                                self.report_retry_fragment(err, frag_name, count, fragment_retries)
+                                self.report_retry_fragment(err, frag_index, count, fragment_retries)
                     if count > fragment_retries:
                         if skip_unavailable_fragments:
                             i += 1
                             media_sequence += 1
-                            self.report_skip_fragment(frag_name)
+                            self.report_skip_fragment(frag_index)
                             continue
                         self.report_error(
                             'giving up after %s fragment retries' % fragment_retries)
                         return False
                     if decrypt_info['METHOD'] == 'AES-128':
                         iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
+                        decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read()
                         frag_content = AES.new(
                             decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
-                    ctx['dest_stream'].write(frag_content)
-                    frags_filenames.append(frag_sanitized)
+                    self._append_fragment(ctx, frag_content)
                     # We only download the first fragment during the test
                     if test:
                         break
                     i += 1
                     media_sequence += 1
                 elif line.startswith('#EXT-X-KEY'):
+                    decrypt_url = decrypt_info.get('URI')
                     decrypt_info = parse_m3u8_attributes(line[11:])
                     if decrypt_info['METHOD'] == 'AES-128':
                         if 'IV' in decrypt_info:
@@ -170,7 +164,8 @@
                                 man_url, decrypt_info['URI'])
                         if extra_query:
                             decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
-                        decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
+                        if decrypt_url != decrypt_info['URI']:
+                            decrypt_info['KEY'] = None
             elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
                 media_sequence = int(line[22:])
             elif line.startswith('#EXT-X-BYTERANGE'):
@@ -183,7 +178,4 @@

         self._finish_frag_download(ctx)

-        for frag_file in frags_filenames:
-            os.remove(encodeFilename(frag_file))
-
         return True
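The #EXT-X-KEY hunks above make AES-128 key fetching lazy: the key is downloaded on the first encrypted fragment and kept in decrypt_info['KEY'], and it is explicitly dropped when a key line announces a different URI. The same idea in isolation, with urlopen standing in for self.ydl.urlopen:

    def get_key(decrypt_info, urlopen):
        # Fetch the decryption key once; later fragments reuse the cached value
        if decrypt_info.get('KEY') is None:
            decrypt_info['KEY'] = urlopen(decrypt_info['URI']).read()
        return decrypt_info['KEY']

    def on_key_line(old_info, new_info):
        # Mirror of the diff: invalidate the cached key only on a URI change
        if old_info.get('URI') != new_info.get('URI'):
            new_info['KEY'] = None
        return new_info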
diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py
index 63a636cb7..5f6f9faef 100644
--- a/youtube_dl/downloader/ism.py
+++ b/youtube_dl/downloader/ism.py
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals

-import os
 import time
 import struct
 import binascii
@@ -8,10 +7,6 @@ import io

 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
-from ..utils import (
-    sanitize_open,
-    encodeFilename,
-)


 u8 = struct.Struct(b'>B')
@@ -225,50 +220,39 @@ class IsmFD(FragmentFD):

         self._prepare_and_start_frag_download(ctx)

-        segments_filenames = []
-
         fragment_retries = self.params.get('fragment_retries', 0)
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)

         track_written = False
+        frag_index = 0
         for i, segment in enumerate(segments):
-            segment_url = segment['url']
-            segment_name = 'Frag%d' % i
-            target_filename = '%s-%s' % (ctx['tmpfilename'], segment_name)
+            frag_index += 1
+            if frag_index <= ctx['fragment_index']:
+                continue
             count = 0
             while count <= fragment_retries:
                 try:
-                    success = ctx['dl'].download(target_filename, {
-                        'url': segment_url,
-                        'http_headers': info_dict.get('http_headers'),
-                    })
+                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
                     if not success:
                         return False
-                    down, target_sanitized = sanitize_open(target_filename, 'rb')
-                    down_data = down.read()
                     if not track_written:
-                        tfhd_data = extract_box_data(down_data, [b'moof', b'traf', b'tfhd'])
+                        tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
                         info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
                         write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
                         track_written = True
-                    ctx['dest_stream'].write(down_data)
-                    down.close()
-                    segments_filenames.append(target_sanitized)
+                    self._append_fragment(ctx, frag_content)
                     break
                 except compat_urllib_error.HTTPError as err:
                     count += 1
                     if count <= fragment_retries:
-                        self.report_retry_fragment(err, segment_name, count, fragment_retries)
+                        self.report_retry_fragment(err, frag_index, count, fragment_retries)
             if count > fragment_retries:
                 if skip_unavailable_fragments:
-                    self.report_skip_fragment(segment_name)
+                    self.report_skip_fragment(frag_index)
                     continue
                 self.report_error('giving up after %s fragment retries' % fragment_retries)
                 return False

         self._finish_frag_download(ctx)

-        for segment_file in segments_filenames:
-            os.remove(encodeFilename(segment_file))
-
         return True
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
index 100cf997f..7da96c65c 100644
--- a/youtube_dl/extractor/adobepass.py
+++ b/youtube_dl/extractor/adobepass.py
@@ -1308,6 +1308,12 @@ class AdobePassIE(InfoExtractor):
     _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
     _MVPD_CACHE = 'ap-mvpd'

+    def _download_webpage_handle(self, *args, **kwargs):
+        headers = kwargs.get('headers', {})
+        headers.update(self.geo_verification_headers())
+        kwargs['headers'] = headers
+        return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs)
+
     @staticmethod
     def _get_mvpd_resource(provider_id, title, guid, rating):
         channel = etree.Element('channel')
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index c01c67303..2dcdba9d2 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -101,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE):
             for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
                 entries.append(self.url_result(
                     compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
-            return self.playlist_result(
-                entries, self._html_search_meta('aetn:SeriesId', webpage),
-                self._html_search_meta('aetn:SeriesTitle', webpage))
-        elif url_parts_len == 2:
+            if entries:
+                return self.playlist_result(
+                    entries, self._html_search_meta('aetn:SeriesId', webpage),
+                    self._html_search_meta('aetn:SeriesTitle', webpage))
+            else:
+                # single season
+                url_parts_len = 2
+        if url_parts_len == 2:
             entries = []
             for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
                 episode_attributes = extract_attributes(episode_item)
@@ -112,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE):
                     url, episode_attributes['data-canonical'])
                 entries.append(self.url_result(
                     episode_url, 'AENetworks',
-                    episode_attributes['data-videoid']))
+                    episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
             return self.playlist_result(
                 entries, self._html_search_meta('aetn:SeasonId', webpage))
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
index 78d29c861..c8cb91dcb 100644
--- a/youtube_dl/extractor/afreecatv.py
+++ b/youtube_dl/extractor/afreecatv.py
@@ -207,11 +207,10 @@ class AfreecaTVIE(InfoExtractor):
                 file_url, video_id, 'mp4', entry_protocol='m3u8_native',
                 m3u8_id='hls',
                 note='Downloading part %d m3u8 information' % file_num)
-            title = title if one else '%s (part %d)' % (title, file_num)
             file_info = common_entry.copy()
             file_info.update({
                 'id': format_id,
-                'title': title,
+                'title': title if one else '%s (part %d)' % (title, file_num),
                 'upload_date': upload_date,
                 'duration': file_duration,
                 'formats': formats,
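The AdobePassIE hunk above routes every _download_webpage_handle call through geo_verification_headers() by merging them into the caller's headers. That kwargs-merge pattern as a standalone sketch; with_extra_headers and fetch are hypothetical names:

    def with_extra_headers(fetch, extra_headers):
        # Wrap fetch() so each call carries extra_headers, which take
        # precedence over any headers the caller passed explicitly.
        def wrapper(*args, **kwargs):
            headers = kwargs.get('headers', {})
            headers.update(extra_headers)
            kwargs['headers'] = headers
            return fetch(*args, **kwargs)
        return wrapper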
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index e8e40126b..fde1a8ff7 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -7,15 +7,19 @@ from ..utils import (
     parse_iso8601,
     mimetype2ext,
     determine_ext,
+    ExtractorError,
 )


 class AMPIE(InfoExtractor):
     # parse Akamai Adaptive Media Player feed
     def _extract_feed_info(self, url):
-        item = self._download_json(
+        feed = self._download_json(
             url, None, 'Downloading Akamai AMP feed',
-            'Unable to download Akamai AMP feed')['channel']['item']
+            'Unable to download Akamai AMP feed')
+        item = feed.get('channel', {}).get('item')
+        if not item:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))

         video_id = item['guid']

@@ -30,9 +34,12 @@ class AMPIE(InfoExtractor):
         if isinstance(media_thumbnail, dict):
             media_thumbnail = [media_thumbnail]
         for thumbnail_data in media_thumbnail:
-            thumbnail = thumbnail_data['@attributes']
+            thumbnail = thumbnail_data.get('@attributes', {})
+            thumbnail_url = thumbnail.get('url')
+            if not thumbnail_url:
+                continue
             thumbnails.append({
-                'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+                'url': self._proto_relative_url(thumbnail_url, 'http:'),
                 'width': int_or_none(thumbnail.get('width')),
                 'height': int_or_none(thumbnail.get('height')),
             })
@@ -43,9 +50,14 @@ class AMPIE(InfoExtractor):
         if isinstance(media_subtitle, dict):
             media_subtitle = [media_subtitle]
         for subtitle_data in media_subtitle:
-            subtitle = subtitle_data['@attributes']
-            lang = subtitle.get('lang') or 'en'
-            subtitles[lang] = [{'url': subtitle['href']}]
+            subtitle = subtitle_data.get('@attributes', {})
+            subtitle_href = subtitle.get('href')
+            if not subtitle_href:
+                continue
+            subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
+                'url': subtitle_href,
+                'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
+            })

         formats = []
         media_content = get_media_node('content')
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
index 623f44dce..8023da702 100644
--- a/youtube_dl/extractor/anvato.py
+++ b/youtube_dl/extractor/anvato.py
@@ -5,6 +5,7 @@ import base64
 import hashlib
 import json
 import random
+import re
 import time

 from .common import InfoExtractor
@@ -16,6 +17,7 @@ from ..utils import (
     intlist_to_bytes,
     int_or_none,
     strip_jsonp,
+    unescapeHTML,
 )


@@ -26,6 +28,8 @@ def md5_text(s):


 class AnvatoIE(InfoExtractor):
+    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
     # Copied from anvplayer.min.js
     _ANVACK_TABLE = {
         'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
         'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
     }

+    _MCP_TO_ACCESS_KEY_TABLE = {
+        'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+        'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+        'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+        'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+        'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+        'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+        'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+    }
+
+    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
     _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'

     def __init__(self, *args, **kwargs):
@@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor):
             }

             if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'):
-                # Not using _extract_m3u8_formats here as individual media
-                # playlists are also included in published_urls.
-                if tbr is None:
-                    formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
-                    continue
-                else:
+                if tbr is not None:
                     a_format.update({
                         'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
                         'ext': 'mp4',
@@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor):
             'subtitles': subtitles,
         }

+    @staticmethod
+    def _extract_urls(ie, webpage, video_id):
+        entries = []
+        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+            anvplayer_data = ie._parse_json(
+                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+                fatal=False)
+            if not anvplayer_data:
+                continue
+            video = anvplayer_data.get('video')
+            if not isinstance(video, compat_str) or not video.isdigit():
+                continue
+            access_key = anvplayer_data.get('accessKey')
+            if not access_key:
+                mcp = anvplayer_data.get('mcp')
+                if mcp:
+                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+                        mcp.lower())
+            if not access_key:
+                continue
+            entries.append(ie.url_result(
+                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+                video_id=video))
+        return entries
+
     def _extract_anvato_videos(self, webpage, video_id):
-        anvplayer_data = self._parse_json(self._html_search_regex(
-            r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
-            'Anvato player data'), video_id)
+        anvplayer_data = self._parse_json(
+            self._html_search_regex(
+                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+            video_id)
         return self._get_anvato_videos(
             anvplayer_data['accessKey'], anvplayer_data['video'])
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+        if access_key not in self._ANVACK_TABLE:
+            access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+        return self._get_anvato_videos(access_key, video_id)
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
index ea7a70393..a84b8b1eb 100644
--- a/youtube_dl/extractor/appleconnect.py
+++ b/youtube_dl/extractor/appleconnect.py
@@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor):
     _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
     _TEST = {
         'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
-        'md5': '10d0f2799111df4cb1c924520ca78f98',
+        'md5': 'e7c38568a01ea45402570e6029206723',
         'info_dict': {
             'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
             'ext': 'm4v',
             'title': 'Energy',
             'uploader': 'Drake',
-            'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+            'thumbnail': r're:^https?://.*\.jpg$',
             'upload_date': '20150710',
             'timestamp': 1436545535,
         },
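AnvatoIE above gains a synthetic anvato:ACCESS_KEY:VIDEO_ID URL scheme: _extract_urls scans a page for data-anvp player blobs and, when only an 'mcp' alias is present, translates it through _MCP_TO_ACCESS_KEY_TABLE. A rough sketch of that mapping on a toy page; the payload is illustrative, the regex and table entry are copied from the diff:

    import json
    import re

    ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
    MCP_TO_ACCESS_KEY = {
        'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
    }
    webpage = "<script data-anvp='{\"mcp\": \"LIN\", \"video\": \"1234\"}'></script>"

    data = json.loads(re.search(ANVP_RE, webpage).group('anvp'))
    access_key = data.get('accessKey') or MCP_TO_ACCESS_KEY[data['mcp'].lower()]
    print('anvato:%s:%s' % (access_key, data['video']))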
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index a6801f3d4..b45b431e1 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor):
     }, {
         'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
         'info_dict': {
-            'id': 'blackthorn',
+            'id': '4489',
+            'title': 'Blackthorn',
         },
         'playlist_mincount': 2,
         'expected_warnings': ['Unable to download JSON metadata'],
@@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor):
             'title': 'Most Popular',
             'id': 'mostpopular',
         },
-        'playlist_mincount': 80,
+        'playlist_mincount': 30,
     }, {
         'url': 'http://trailers.apple.com/#section=moviestudios',
         'info_dict': {
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index e21045bed..3c7d7250b 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor):
         }
     }, {
         'url': 'https://archive.org/details/Cops1922',
-        'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
+        'md5': '0869000b4ce265e8ca62738b336b268a',
         'info_dict': {
             'id': 'Cops1922',
             'ext': 'mp4',
             'title': 'Buster Keaton\'s "Cops" (1922)',
-            'description': 'md5:b4544662605877edd99df22f9620d858',
+            'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
         }
     }, {
         'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 69a23e88c..56baef29d 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -180,7 +180,7 @@ class ArteTVBaseIE(InfoExtractor):

 class ArteTVPlus7IE(ArteTVBaseIE):
     IE_NAME = 'arte.tv:+7'
-    _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)'

     _TESTS = [{
         'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
@@ -188,6 +188,9 @@ class ArteTVPlus7IE(ArteTVBaseIE):
     }, {
         'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
         'only_matching': True,
+    }, {
+        'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn',
+        'only_matching': True,
     }]

     @classmethod
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index 99af6dc5a..01fa308ff 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor):
     },
     {
         'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html',
-        'md5': '0d0e918533bbd4b263f2de4d197d4aac',
+        'md5': '6e52cbb513c405e403dbacb7aacf8747',
         'info_dict': {
             'id': 'capitulo-112-david-bustamante',
             'ext': 'flv',
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
index 8fc5f65c6..e48bb8972 100644
--- a/youtube_dl/extractor/audioboom.py
+++ b/youtube_dl/extractor/audioboom.py
@@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor):
         'title': '3/09/2016 Czaban Hour 3',
         'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
         'duration': 2245.72,
-        'uploader': 'Steve Czaban',
+        'uploader': 'SB Nation A.M.',
         'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
     }
 }, {
diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py
deleted file mode 100644
index 3ba2f00d3..000000000
--- a/youtube_dl/extractor/azubu.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    float_or_none,
-    sanitized_Request,
-)
-
-
-class AzubuIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/[^/]+#!/play/(?P<id>\d+)'
-    _TESTS = [
-        {
-            'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1',
-            'md5': 'a88b42fcf844f29ad6035054bd9ecaf4',
-            'info_dict': {
-                'id': '15575',
-                'ext': 'mp4',
-                'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1',
-                'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01',
-                'thumbnail': r're:^https?://.*\.jpe?g',
-                'timestamp': 1417523507.334,
-                'upload_date': '20141202',
-                'duration': 9988.7,
-                'uploader': 'GSL',
-                'uploader_id': 414310,
-                'view_count': int,
-            },
-        },
-        {
-            'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-',
-            'md5': 'b72a871fe1d9f70bd7673769cdb3b925',
-            'info_dict': {
-                'id': '9344',
-                'ext': 'mp4',
-                'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"',
-                'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af',
-                'thumbnail': r're:^https?://.*\.jpe?g',
-                'timestamp': 1410530893.320,
-                'upload_date': '20140912',
-                'duration': 172.385,
-                'uploader': 'FnaticTV',
-                'uploader_id': 272749,
-                'view_count': int,
-            },
-            'skip': 'Channel offline',
-        },
-    ]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        data = self._download_json(
-            'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data']
-
-        title = data['title'].strip()
-        description = data.get('description')
-        thumbnail = data.get('thumbnail')
-        view_count = data.get('view_count')
-        user = data.get('user', {})
-        uploader = user.get('username')
-        uploader_id = user.get('id')
-
-        stream_params = json.loads(data['stream_params'])
-
-        timestamp = float_or_none(stream_params.get('creationDate'), 1000)
-        duration = float_or_none(stream_params.get('length'), 1000)
-
-        renditions = stream_params.get('renditions') or []
-        video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength')
-        if video:
-            renditions.append(video)
-
-        if not renditions and not user.get('channel', {}).get('is_live', True):
-            raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True)
-
-        formats = [{
-            'url': fmt['url'],
-            'width': fmt['frameWidth'],
-            'height': fmt['frameHeight'],
-            'vbr': float_or_none(fmt['encodingRate'], 1000),
-            'filesize': fmt['size'],
-            'vcodec': fmt['videoCodec'],
-            'container': fmt['videoContainer'],
-        } for fmt in renditions if fmt['url']]
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'view_count': view_count,
-            'formats': formats,
-        }
-
-
-class AzubuLiveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/(?P<id>[^/]+)$'
-
-    _TESTS = [{
-        'url': 'http://www.azubu.tv/MarsTVMDLen',
-        'only_matching': True,
-    }, {
-        'url': 'http://azubu.uol.com.br/adolfz',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        user = self._match_id(url)
-
-        info = self._download_json(
-            'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user),
-            user)['data']
-        if info['type'] != 'STREAM':
-            raise ExtractorError('{0} is not streaming live'.format(user), expected=True)
-
-        req = sanitized_Request(
-            'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id'])
-        req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV')
-        bc_info = self._download_json(req, user)
-        m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS')
-        formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4')
-        self._sort_formats(formats)
-
-        return {
-            'id': info['id'],
-            'title': self._live_title(info['title']),
-            'uploader_id': user,
-            'formats': formats,
-            'is_live': True,
-            'thumbnail': bc_info['poster'],
-        }
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 056e06376..df2972f26 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor):
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }, {
         'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
-        'md5': '73d0b3171568232574e45652f8720b5c',
+        'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
         'info_dict': {
             'id': '2650410135',
-            'ext': 'mp3',
-            'title': 'Lanius (Battle)',
-            'uploader': 'Ben Prunty Music',
+            'ext': 'aiff',
+            'title': 'Ben Prunty - Lanius (Battle)',
+            'uploader': 'Ben Prunty',
         },
     }]
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index b0b7914d8..d5c5822f2 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -16,7 +16,7 @@ class BeegIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
     _TEST = {
         'url': 'http://beeg.com/5416503',
-        'md5': '46c384def73b33dbc581262e5ee67cef',
+        'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
         'info_dict': {
             'id': '5416503',
             'ext': 'mp4',
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 7a8e1f60b..e829974ff 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor):
         'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
         'timestamp': 1446839961,
         'uploader': 'Sean Fay',
-        'description': 'md5:825e94e0f3521df52fa83b2ed198fa20',
+        'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
         'uploader_id': 6466954,
         'upload_date': '20151011',
     },
@@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE):
     _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})'
     _TESTS = [{
         'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
-        'md5': '8c2c12e3af7805152675446c905d159b',
+        'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
         'info_dict': {
             'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
             'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
     }]

     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index ff0aa11b1..2c32b6ae2 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -77,7 +77,7 @@ class BRIE(InfoExtractor):
             'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
             'duration': 893,
             'uploader': 'Eva Maria Steimle',
-            'upload_date': '20140117',
+            'upload_date': '20170208',
         }
     },
 ]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 124497e95..3f017a2b1 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -131,6 +131,12 @@ class BrightcoveLegacyIE(InfoExtractor):
         },
         'playlist_mincount': 10,
     },
+    {
+        # playerID inferred from bcpid
+        # from http://www.un.org/chinese/News/story.asp?NewsID=27724
+        'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
+        'only_matching': True,  # Tested in GenericIE
+    }
 ]
 FLV_VCODECS = {
     1: 'SORENSON',
@@ -266,9 +272,13 @@ class BrightcoveLegacyIE(InfoExtractor):
         if matches:
             return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))

-        return list(filter(None, [
-            cls._build_brighcove_url_from_js(custom_bc)
-            for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
+        matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
+        if matches:
+            return list(filter(None, [
+                cls._build_brighcove_url_from_js(custom_bc)
+                for custom_bc in matches]))
+
+        return [src for _, src in re.findall(
+            r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]

     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -285,6 +295,10 @@ class BrightcoveLegacyIE(InfoExtractor):
         if videoPlayer:
             # We set the original url as the default 'Referer' header
             referer = smuggled_data.get('Referer', url)
+            if 'playerID' not in query:
+                mobj = re.search(r'/bcpid(\d+)', url)
+                if mobj is not None:
+                    query['playerID'] = [mobj.group(1)]
             return self._get_video_info(
                 videoPlayer[0], query, referer=referer)
         elif 'playerKey' in query:
@@ -484,8 +498,8 @@ class BrightcoveNewIE(InfoExtractor):
     }]

     @staticmethod
-    def _extract_url(webpage):
-        urls = BrightcoveNewIE._extract_urls(webpage)
+    def _extract_url(ie, webpage):
+        urls = BrightcoveNewIE._extract_urls(ie, webpage)
         return urls[0] if urls else None

     @staticmethod
@@ -508,7 +522,7 @@ class BrightcoveNewIE(InfoExtractor):
         # [2] looks like:
         for video, script_tag, account_id, player_id, embed in re.findall(
                 r'''(?isx)
-                    (<video\s+[^>]+>)
+                    (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
                     (?:.*?
                         (<script[^>]+
                             src=["\'](?:https?:)?//players\.brightcove\.net/
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index f1f128c45..acd87e371 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor):
         'md5': '060158428b650f896c542dfbb3d6487f',
         'info_dict': {
             'id': '12163',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Terrasses du Numérique',
             'duration': 122,
         },
-        'params': {
-            'skip_download': True,  # Requires rtmpdump
-        }
     }, {
         'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
         'only_matching': True,
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index cf678e7f8..87ad14e91 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -96,6 +96,7 @@ class CBCIE(InfoExtractor):
     'info_dict': {
         'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
         'id': 'dog-indoor-exercise-winter-1.3928238',
+        'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
     },
     'playlist_mincount': 6,
 }]
@@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor):
         'uploader': 'CBCC-NEW',
     },
 }, {
-    # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
     'url': 'http://www.cbc.ca/player/play/2164402062',
-    'md5': '17a61eb813539abea40618d6323a7f82',
+    'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
     'info_dict': {
         'id': '2164402062',
-        'ext': 'flv',
+        'ext': 'mp4',
         'title': 'Cancer survivor four times over',
         'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
         'timestamp': 1320410746,
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
index 8d5f11dd1..7d78e3aae 100644
--- a/youtube_dl/extractor/cbslocal.py
+++ b/youtube_dl/extractor/cbslocal.py
@@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE):
         'title': 'A Very Blue Anniversary',
         'description': 'CBS2’s Cindy Hsu has more.',
         'thumbnail': 're:^https?://.*',
-        'timestamp': 1479962220,
-        'upload_date': '20161124',
+        'timestamp': int,
+        'upload_date': r're:^\d{8}$',
         'uploader': 'CBS',
         'subtitles': {
             'en': 'mincount:5',
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 1ee35b501..78b7a923c 100755
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -9,7 +9,10 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     int_or_none,
+    multipart_encode,
     parse_duration,
+    random_birthday,
+    urljoin,
 )


@@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):
             'description': 'md5:269ccd135d550da90d1662651fcb9772',
             'thumbnail': r're:^https?://.*\.jpg$',
             'average_rating': float,
-            'duration': 39
+            'duration': 39,
+            'age_limit': 0,
         }
     }, {
         'url': 'http://www.cda.pl/video/57413289',
@@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):
             'uploader': 'crash404',
             'view_count': int,
             'average_rating': float,
-            'duration': 137
+            'duration': 137,
+            'age_limit': 0,
         }
     }, {
+        # Age-restricted
+        'url': 'http://www.cda.pl/video/1273454c4',
+        'info_dict': {
+            'id': '1273454c4',
+            'ext': 'mp4',
+            'title': 'Bronson (2008) napisy HD 1080p',
+            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+            'height': 1080,
+            'uploader': 'boniek61',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 5554,
+            'age_limit': 18,
+            'view_count': int,
+            'average_rating': float,
+        },
+    }, {
         'url': 'http://ebd.cda.pl/0x0/5749950c',
         'only_matching': True,
     }]

+    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
+        form_data = random_birthday('rok', 'miesiac', 'dzien')
+        form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
+        data, content_type = multipart_encode(form_data)
+        return self._download_webpage(
+            urljoin(url, '/a/validatebirth'), video_id, *args,
+            data=data, headers={
+                'Referer': url,
+                'Content-Type': content_type,
+            }, **kwargs)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         self._set_cookie('cda.pl', 'cda.player', 'html5')
@@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):
         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
             raise ExtractorError('This video is only available for premium users.', expected=True)

+        need_confirm_age = False
+        if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+                                   webpage, 'birthday validate form', default=None):
+            webpage = self._download_age_confirm_page(
+                url, video_id, note='Confirming age')
+            need_confirm_age = True
+
         formats = []

         uploader = self._search_regex(r'''(?x)
@@ -81,6 +120,7 @@ class CDAIE(InfoExtractor):
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
             'duration': None,
+            'age_limit': 18 if need_confirm_age else 0,
         }

         def extract_format(page, version):
@@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):
         for href, resolution in re.findall(
                 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                 webpage):
-            webpage = self._download_webpage(
+            if need_confirm_age:
+                handler = self._download_age_confirm_page
+            else:
+                handler = self._download_webpage
+
+            webpage = handler(
                 self._BASE_URL + href, video_id,
                 'Downloading %s version information' % resolution, fatal=False)
             if not webpage:
@@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):
                 # invalid version is requested.
                 self.report_warning('Unable to download %s version information' % resolution)
                 continue
+
             extract_format(webpage, resolution)

         self._sort_formats(formats)
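The CDAIE change above answers cda.pl's age gate by POSTing a randomly generated birth date as multipart form data; both helpers it uses come from youtube_dl.utils in this version. A minimal sketch of just the encoding step; field names follow the extractor, the URL and id are taken from the new test:

    from youtube_dl.utils import multipart_encode, random_birthday

    form_data = random_birthday('rok', 'miesiac', 'dzien')  # Polish form field names
    form_data.update({
        'return': 'http://www.cda.pl/video/1273454c4',
        'module': 'video',
        'module_id': '1273454c4',
    })
    data, content_type = multipart_encode(form_data)
    # 'data' is the request body; 'content_type' carries the multipart
    # boundary and must be sent as the Content-Type header.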
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index bb52e0c6f..0920f6219 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
-        'md5': '720563e467b86374c194bdead08d207d',
+        'md5': 'b9a5dc46294154c1193e2d10e0c95693',
         'info_dict': {
             'id': '4343170',
             'ext': 'mp4',
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
index 18c734766..6a41db87c 100644
--- a/youtube_dl/extractor/collegerama.py
+++ b/youtube_dl/extractor/collegerama.py
@@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
             'description': '',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
             'duration': 7713.088,
             'timestamp': 1413309600,
             'upload_date': '20141014',
@@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor):
             'ext': 'wmv',
             'title': '64ste Vakantiecursus: Afvalwater',
             'description': 'md5:7fd774865cc69d972f542b157c328305',
+            'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
             'duration': 10853,
             'timestamp': 1326446400,
             'upload_date': '20120113',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e54adc9f0..76b5378e9 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -245,6 +245,10 @@ class InfoExtractor(object):
                         specified in the URL.
     end_time:       Time in seconds where the reproduction should end, as
                     specified in the URL.
+    chapters:       A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the chapter in seconds
+                        * "end_time" - The end time of the chapter in seconds
+                        * "title" (optional, string)

     The following fields should only be used when the video belongs to some logical
     chapter or section:
@@ -976,6 +980,23 @@ class InfoExtractor(object):
             return info
         if isinstance(json_ld, dict):
             json_ld = [json_ld]
+
+        def extract_video_object(e):
+            assert e['@type'] == 'VideoObject'
+            info.update({
+                'url': e.get('contentUrl'),
+                'title': unescapeHTML(e.get('name')),
+                'description': unescapeHTML(e.get('description')),
+                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+                'duration': parse_duration(e.get('duration')),
+                'timestamp': unified_timestamp(e.get('uploadDate')),
+                'filesize': float_or_none(e.get('contentSize')),
+                'tbr': int_or_none(e.get('bitrate')),
+                'width': int_or_none(e.get('width')),
+                'height': int_or_none(e.get('height')),
+                'view_count': int_or_none(e.get('interactionCount')),
+            })
+
         for e in json_ld:
             if e.get('@context') == 'http://schema.org':
                 item_type = e.get('@type')
@@ -1000,18 +1021,11 @@ class InfoExtractor(object):
                         'description': unescapeHTML(e.get('articleBody')),
                     })
                 elif item_type == 'VideoObject':
-                    info.update({
-                        'url': e.get('contentUrl'),
-                        'title': unescapeHTML(e.get('name')),
-                        'description': unescapeHTML(e.get('description')),
-                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
-                        'duration': parse_duration(e.get('duration')),
-                        'timestamp': unified_timestamp(e.get('uploadDate')),
-                        'filesize': float_or_none(e.get('contentSize')),
-                        'tbr': int_or_none(e.get('bitrate')),
-                        'width': int_or_none(e.get('width')),
-                        'height': int_or_none(e.get('height')),
-                    })
+                    extract_video_object(e)
+                elif item_type == 'WebPage':
+                    video = e.get('video')
+                    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+                        extract_video_object(video)
                 break
         return dict((k, v) for k, v in info.items() if v is not None)
@@ -1303,40 +1317,50 @@ class InfoExtractor(object):
                               entry_protocol='m3u8', preference=None,
                               m3u8_id=None, note=None, errnote=None,
                               fatal=True, live=False):
-
         res = self._download_webpage_handle(
             m3u8_url, video_id,
             note=note or 'Downloading m3u8 information',
             errnote=errnote or 'Failed to download m3u8 information',
             fatal=fatal)
+
         if res is False:
             return []
+
         m3u8_doc, urlh = res
         m3u8_url = urlh.geturl()

+        return self._parse_m3u8_formats(
+            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+            preference=preference, m3u8_id=m3u8_id, live=live)
+
+    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+                            entry_protocol='m3u8', preference=None,
+                            m3u8_id=None, live=False):
         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
             return []

-        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+        formats = []

         format_url = lambda u: (
             u
             if re.match(r'^https?://', u)
             else compat_urlparse.urljoin(m3u8_url, u))

-        # We should try extracting formats only from master playlists [1], i.e.
-        # playlists that describe available qualities. On the other hand media
-        # playlists [2] should be returned as is since they contain just the media
-        # without qualities renditions.
+        # References:
+        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+        # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+        # We should try extracting formats only from master playlists [1, 4.3.4],
+        # i.e. playlists that describe available qualities. On the other hand
+        # media playlists [1, 4.3.3] should be returned as is since they contain
+        # just the media without qualities renditions.
         # Fortunately, master playlist can be easily distinguished from media
-        # playlist based on particular tags availability. As of [1, 2] master
-        # playlist tags MUST NOT appear in a media playist and vice versa.
-        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
-        # and MUST NOT appear in master playlist thus we can clearly detect media
-        # playlist with this criterion.
-        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
-        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
-        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+        # master playlist tags MUST NOT appear in a media playist and vice versa.
+        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+        # media playlist and MUST NOT appear in master playlist thus we can
+        # clearly detect media playlist with this criterion.
+
         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
             return [{
                 'url': m3u8_url,
@@ -1345,52 +1369,72 @@ class InfoExtractor(object):
                 'protocol': entry_protocol,
                 'preference': preference,
             }]
-        audio_in_video_stream = {}
-        last_info = {}
-        last_media = {}
+
+        groups = {}
+        last_stream_inf = {}
+
+        def extract_media(x_media_line):
+            media = parse_m3u8_attributes(x_media_line)
+            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+            if not (media_type and group_id and name):
+                return
+            groups.setdefault(group_id, []).append(media)
+            if media_type not in ('VIDEO', 'AUDIO'):
+                return
+            media_url = media.get('URI')
+            if media_url:
+                format_id = []
+                for v in (group_id, name):
+                    if v:
+                        format_id.append(v)
+                f = {
+                    'format_id': '-'.join(format_id),
+                    'url': format_url(media_url),
+                    'manifest_url': m3u8_url,
+                    'language': media.get('LANGUAGE'),
+                    'ext': ext,
+                    'protocol': entry_protocol,
+                    'preference': preference,
+                }
+                if media_type == 'AUDIO':
+                    f['vcodec'] = 'none'
+                formats.append(f)
+
+        def build_stream_name():
+            # Despite specification does not mention NAME attribute for
+            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
+            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+            stream_name = last_stream_inf.get('NAME')
+            if stream_name:
+                return stream_name
+            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+            # from corresponding rendition group
+            stream_group_id = last_stream_inf.get('VIDEO')
+            if not stream_group_id:
+                return
+            stream_group = groups.get(stream_group_id)
+            if not stream_group:
+                return stream_group_id
+            rendition = stream_group[0]
+            return rendition.get('NAME') or stream_group_id
+
         for line in m3u8_doc.splitlines():
             if line.startswith('#EXT-X-STREAM-INF:'):
-                last_info = parse_m3u8_attributes(line)
+                last_stream_inf = parse_m3u8_attributes(line)
             elif line.startswith('#EXT-X-MEDIA:'):
-                media = parse_m3u8_attributes(line)
-                media_type = media.get('TYPE')
-                if media_type in ('VIDEO', 'AUDIO'):
-                    group_id = media.get('GROUP-ID')
-                    media_url = media.get('URI')
-                    if media_url:
-                        format_id = []
-                        for v in (group_id, media.get('NAME')):
-                            if v:
-                                format_id.append(v)
-                        f = {
-                            'format_id': '-'.join(format_id),
-                            'url': format_url(media_url),
-                            'language': media.get('LANGUAGE'),
-                            'ext': ext,
-                            'protocol': entry_protocol,
-                            'preference': preference,
-                        }
-                        if media_type == 'AUDIO':
-                            f['vcodec'] = 'none'
-                            if group_id and not audio_in_video_stream.get(group_id):
-                                audio_in_video_stream[group_id] = False
-                        formats.append(f)
-                    else:
-                        # When there is no URI in EXT-X-MEDIA let this tag's
-                        # data be used by regular URI lines below
-                        last_media = media
-                        if media_type == 'AUDIO' and group_id:
-                            audio_in_video_stream[group_id] = True
+                extract_media(line)
             elif line.startswith('#') or not line.strip():
                 continue
             else:
-                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+                tbr = float_or_none(
+                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
+                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                 format_id = []
                 if m3u8_id:
                     format_id.append(m3u8_id)
-                # Despite specification does not mention NAME attribute for
-                # EXT-X-STREAM-INF it still sometimes may be present
-                stream_name = last_info.get('NAME') or last_media.get('NAME')
+                stream_name = build_stream_name()
                 # Bandwidth of live streams may differ over time thus making
                 # format_id unpredictable. So it's better to keep provided
                 # format_id intact.
@@ -1400,14 +1444,14 @@ class InfoExtractor(object):
                 f = {
                     'format_id': '-'.join(format_id),
                     'url': manifest_url,
-                    'manifest_url': manifest_url,
+                    'manifest_url': m3u8_url,
                     'tbr': tbr,
                     'ext': ext,
-                    'fps': float_or_none(last_info.get('FRAME-RATE')),
+                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                     'protocol': entry_protocol,
                     'preference': preference,
                 }
-                resolution = last_info.get('RESOLUTION')
+                resolution = last_stream_inf.get('RESOLUTION')
                 if resolution:
                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                     if mobj:
@@ -1423,13 +1467,26 @@ class InfoExtractor(object):
                         'vbr': vbr,
                         'abr': abr,
                     })
-                f.update(parse_codecs(last_info.get('CODECS')))
-                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
-                    # TODO: update acodec for audio only formats with the same GROUP-ID
-                    f['acodec'] = 'none'
+                codecs = parse_codecs(last_stream_inf.get('CODECS'))
+                f.update(codecs)
+                audio_group_id = last_stream_inf.get('AUDIO')
+                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+                # references a rendition group MUST have a CODECS attribute.
+                # However, this is not always respected, for example, [2]
+                # contains EXT-X-STREAM-INF tag which references AUDIO
+                # rendition group but does not have CODECS and despite
+                # referencing an audio group it represents
+                # a complete (with audio and video) format. So, for such cases
+                # we will ignore references to rendition groups and treat them
+                # as complete formats.
+                if audio_group_id and codecs and f.get('vcodec') != 'none':
+                    audio_group = groups.get(audio_group_id)
+                    if audio_group and audio_group[0].get('URI'):
+                        # TODO: update acodec for audio only formats with
+                        # the same GROUP-ID
+                        f['acodec'] = 'none'
                 formats.append(f)
-                last_info = {}
-                last_media = {}
+                last_stream_inf = {}
         return formats

     @staticmethod
@@ -1803,7 +1860,7 @@ class InfoExtractor(object):
                     'ext': mimetype2ext(mime_type),
                     'width': int_or_none(representation_attrib.get('width')),
                     'height': int_or_none(representation_attrib.get('height')),
-                    'tbr': int_or_none(bandwidth, 1000),
+                    'tbr': float_or_none(bandwidth, 1000),
                     'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                     'fps': int_or_none(representation_attrib.get('frameRate')),
                     'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@@ -2182,7 +2239,7 @@ class InfoExtractor(object):

     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
         mobj = re.search(
-            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
             webpage)
         if mobj:
             try:
@@ -2258,11 +2315,17 @@ class InfoExtractor(object):
     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                 m3u8_id=None, mpd_id=None, rtmp_params=None,
                                 base_url=None):
+        urls = []
         formats = []
         for source in jwplayer_sources_data:
-            source_url = self._proto_relative_url(source['file'])
+            source_url = self._proto_relative_url(source.get('file'))
+            if not source_url:
+                continue
             if base_url:
                 source_url = compat_urlparse.urljoin(base_url, source_url)
+            if source_url in urls:
+                continue
+            urls.append(source_url)
             source_type = source.get('type') or ''
             ext = mimetype2ext(source_type) or determine_ext(source_url)
             if source_type == 'hls' or ext == 'm3u8':
+                # However, this is not always respected, for example, [2]
+                # contains an EXT-X-STREAM-INF tag which references an AUDIO
+                # rendition group but does not have CODECS and, despite
+                # referencing an audio group, it represents a complete
+                # (with audio and video) format. So, for such cases we will
+                # ignore references to rendition groups and treat them
+                # as complete formats.
+                if audio_group_id and codecs and f.get('vcodec') != 'none':
+                    audio_group = groups.get(audio_group_id)
+                    if audio_group and audio_group[0].get('URI'):
+                        # TODO: update acodec for audio only formats with
+                        # the same GROUP-ID
+                        f['acodec'] = 'none'
                 formats.append(f)
-                last_info = {}
-                last_media = {}
+                last_stream_inf = {}
         return formats
     @staticmethod
@@ -1803,7 +1860,7 @@ class InfoExtractor(object):
             'ext': mimetype2ext(mime_type),
             'width': int_or_none(representation_attrib.get('width')),
             'height': int_or_none(representation_attrib.get('height')),
-            'tbr': int_or_none(bandwidth, 1000),
+            'tbr': float_or_none(bandwidth, 1000),
             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
             'fps': int_or_none(representation_attrib.get('frameRate')),
             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@@ -2182,7 +2239,7 @@ class InfoExtractor(object):
     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
         mobj = re.search(
-            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
             webpage)
         if mobj:
             try:
@@ -2258,11 +2315,17 @@ class InfoExtractor(object):
     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        urls = []
         formats = []
         for source in jwplayer_sources_data:
-            source_url = self._proto_relative_url(source['file'])
+            source_url = self._proto_relative_url(source.get('file'))
+            if not source_url:
+                continue
             if base_url:
                 source_url = compat_urlparse.urljoin(base_url, source_url)
+            if source_url in urls:
+                continue
+            urls.append(source_url)
             source_type = source.get('type') or ''
             ext = mimetype2ext(source_type) or determine_ext(source_url)
             if source_type == 'hls' or ext == 'm3u8':
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
index 5fa1f006b..6ea03e65c 100644
--- a/youtube_dl/extractor/coub.py
+++ b/youtube_dl/extractor/coub.py
@@ -24,12 +24,11 @@ class CoubIE(InfoExtractor):
             'duration': 4.6,
             'timestamp': 1428527772,
             'upload_date': '20150408',
-            'uploader': 'Артём Лоскутников',
+            'uploader': 'Artyom Loskutnikov',
             'uploader_id': 'artyom.loskutnikov',
             'view_count': int,
             'like_count': int,
             'repost_count': int,
-            'comment_count': int,
             'age_limit': 0,
         },
     }, {
@@ -118,7 +117,6 @@ class CoubIE(InfoExtractor):
         view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
         like_count = int_or_none(coub.get('likes_count'))
         repost_count = int_or_none(coub.get('recoubs_count'))
-        comment_count = int_or_none(coub.get('comments_count'))
         age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
         if age_restricted is not None:
@@ -137,7 +135,6 @@ class CoubIE(InfoExtractor):
             'view_count': view_count,
             'like_count': like_count,
             'repost_count': repost_count,
-            'comment_count': comment_count,
             'age_limit': age_limit,
             'formats': formats,
         }
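The acodec override in the common.py hunk above hinges on one detail: a variant that references an AUDIO rendition group is marked video-only ('acodec': 'none') only when that group actually serves separate audio playlists, i.e. its renditions carry a URI; otherwise the variant is assumed to already mux audio and video. A standalone sketch of just that decision, with hypothetical inputs shaped like last_stream_inf and groups above (the CODECS string stands in for parse_codecs() output):

    # Parsed EXT-X-STREAM-INF attributes and the rendition groups
    # collected from EXT-X-MEDIA lines (hypothetical sample data).
    last_stream_inf = {'AUDIO': 'aud', 'CODECS': 'avc1.4d401f,mp4a.40.2'}
    groups = {'aud': [{'TYPE': 'AUDIO', 'NAME': 'English', 'URI': 'audio/en.m3u8'}]}
    f = {'vcodec': 'avc1.4d401f', 'acodec': 'mp4a.40.2'}

    audio_group_id = last_stream_inf.get('AUDIO')
    codecs = last_stream_inf.get('CODECS')  # simplified stand-in for parse_codecs()
    if audio_group_id and codecs and f.get('vcodec') != 'none':
        audio_group = groups.get(audio_group_id)
        if audio_group and audio_group[0].get('URI'):
            # Audio comes from separate rendition playlists, so this
            # variant itself contributes video only.
            f['acodec'] = 'none'

    print(f)  # {'vcodec': 'avc1.4d401f', 'acodec': 'none'}

diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 2ed8b30bb..2ffa4a7f8 100644
--- a/youtube_dl/extractor/crunchyroll.py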
+++ b/youtube_dl/extractor/crunchyroll.py @@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'info_dict': { 'id': '727589', 'ext': 'mp4', - 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!", + 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!", 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Kadokawa Pictures Inc.', @@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'series': "KONOSUBA -God's blessing on this wonderful world!", 'season': "KONOSUBA -God's blessing on this wonderful world! 2", 'season_number': 2, - 'episode': 'Give Me Deliverance from this Judicial Injustice!', + 'episode': 'Give Me Deliverance From This Judicial Injustice!', 'episode_number': 1, }, 'params': { diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 246efde43..441114d19 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -51,6 +51,24 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _TESTS = [ { + 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', + 'md5': '074b95bdee76b9e3654137aee9c79dfe', + 'info_dict': { + 'id': 'x5kesuj', + 'ext': 'mp4', + 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 187, + 'timestamp': 1493651285, + 'upload_date': '20170501', + 'uploader': 'Deadline', + 'uploader_id': 'x1xm8ri', + 'age_limit': 0, + 'view_count': int, + }, + }, + { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', 'md5': '2137c41a8e78554bb09225b8eb322406', 'info_dict': { @@ -66,7 +84,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader_id': 'xijv66', 'age_limit': 0, 'view_count': int, - } + }, + 'skip': 'video gone', }, # Vevo video { diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index bdfe638b4..5c9c0ecdc 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor): 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', - 'title': 'Daily Show', + 'title': 'Daily Show for July 03, 2015', + 'description': 'md5:80eb927244d6749900de6072c7cc2c86', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index 1f75352ca..148605c0b 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor): 'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p', 'duration': 290, 'timestamp': 1476767794.2809999, - 'upload_date': '20160525', + 'upload_date': '20161018', 'uploader': 'parthivi001', 'uploader_id': 'user52596202', 'view_count': int, diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 82d8a042f..d22133d24 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -20,7 +20,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': 'iseven', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -51,7 +51,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': '17732', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1671090f4..c0020dd7d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -41,6 +41,7 @@ from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE +from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE @@ -87,7 +88,6 @@ from .azmedien import ( AZMedienPlaylistIE, AZMedienShowPlaylistIE, ) -from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE @@ -663,6 +663,7 @@ from .nintendo import NintendoIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE +from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE @@ -939,6 +940,7 @@ from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamable import StreamableIE +from .streamango import StreamangoIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE @@ -1233,7 +1235,10 @@ from .wrzuta import ( WrzutaIE, WrzutaPlaylistIE, ) -from .wsj import WSJIE +from .wsj import ( + WSJIE, + WSJArticleIE, +) from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xfileshare import XFileShareIE @@ -1295,5 +1300,6 @@ from .youtube import ( YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE +from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index a3bb98377..985542727 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)' _TEST = { - 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'url': 'http://www.foxsports.com/tennessee/video/432609859715', 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'i0qKWsk3qJaM', + 'id': 'bwduI3X_TgUB', 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. 
Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', @@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config = self._parse_json( - self._search_regex( - r"data-player-config='([^']+)'", webpage, 'data player config'), + self._html_search_regex( + r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", + webpage, 'data player config'), video_id) return self.url_result(smuggle_url(update_url_query( diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 81c0ce9a3..49409369c 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor): m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) source_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)] bitrates.sort() diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 682c49e79..00d311158 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -78,8 +78,7 @@ class GameSpotIE(OnceIE): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bc7c21f7a..b06f43446 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -85,6 +85,9 @@ from .ustream import UstreamIE from .openload import OpenloadIE from .videopress import VideoPressIE from .rutube import RutubeIE +from .limelight import LimelightBaseIE +from .anvato import AnvatoIE +from .washingtonpost import WashingtonPostIE class GenericIE(InfoExtractor): @@ -430,6 +433,22 @@ class GenericIE(InfoExtractor): }, }, { + # Brightcove video in <iframe> + 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', + 'md5': '36d74ef5e37c8b4a2ce92880d208b968', + 'info_dict': { + 'id': '5360463607001', + 'ext': 'mp4', + 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', + 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', + 'uploader': 'United Nations', + 'uploader_id': '1362235914001', + 'timestamp': 1489593889, + 'upload_date': '20170315', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + { # Brightcove with alternative playerID key 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', 'info_dict': { @@ -1410,6 +1429,22 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # Brightcove embed with whitespace around attribute names + 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', + 'info_dict': { + 'id': '3167554373001', + 'ext': 'mp4', + 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", + 'description': 'md5:57bacb0e0f29349de4972bfda3191713', + 'uploader_id': '1079349493', + 'upload_date': '20140207', + 'timestamp': 1391810548, + }, + 'params': { + 
'skip_download': True, + }, + }, # Another form of arte.tv embed { 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1651,6 +1686,38 @@ class GenericIE(InfoExtractor): }, 'add_ie': [SenateISVPIE.ie_key()], }, + { + # Limelight embeds (1 channel embed + 4 media embeds) + 'url': 'http://www.sedona.com/FacilitatorTraining2017', + 'info_dict': { + 'id': 'FacilitatorTraining2017', + 'title': 'Facilitator Training 2017', + }, + 'playlist_mincount': 5, + }, + { + 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', + 'info_dict': { + 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', + 'title': 'Standoff with Walnut Creek murder suspect ends', + 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', + }, + 'playlist_mincount': 4, + }, + { + # WashingtonPost embed + 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', + 'info_dict': { + 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', + 'ext': 'mp4', + 'title': "No one has seen the drama series based on Trump's life \u2014 until now", + 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', + 'timestamp': 1455216756, + 'uploader': 'The Washington Post', + 'upload_date': '20160211', + }, + 'add_ie': [WashingtonPostIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1693,7 +1760,7 @@ class GenericIE(InfoExtractor): continue entries.append({ - '_type': 'url', + '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, }) @@ -2483,6 +2550,11 @@ class GenericIE(InfoExtractor): return self.url_result(piksel_url, PikselIE.ie_key()) # Look for Limelight embeds + limelight_urls = LimelightBaseIE._extract_urls(webpage, url) + if limelight_urls: + return self.playlist_result( + limelight_urls, video_id, video_title, video_description) + mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: lm = { @@ -2506,6 +2578,12 @@ class GenericIE(InfoExtractor): 'limelight:media:%s' % mobj.group('id'), {'source_url': url}), 'LimelightMedia', mobj.group('id')) + # Look for Anvato embeds + anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) + if anvato_urls: + return self.playlist_result( + anvato_urls, video_id, video_title, video_description) + # Look for AdobeTVVideo embeds mobj = re.search( r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', @@ -2623,6 +2701,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rutube_urls, ie=RutubeIE.ie_key()) + # Look for WashingtonPost embeds + wapo_urls = WashingtonPostIE._extract_urls(webpage) + if wapo_urls: + return self.playlist_from_matches( + wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 4c9be47b4..9c7b1bd37 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,22 +36,26 @@ class GoIE(AdobePassIE): 'requestor_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = 
r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
     _TESTS = [{
-        'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
+        'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
         'info_dict': {
-            'id': '0_g86w5onx',
+            'id': 'VDKA3807643',
             'ext': 'mp4',
-            'title': 'Sneak Peek: Language Arts',
-            'description': 'md5:7dcdab3b2d17e5217c953256af964e9c',
+            'title': 'The Traitor in the White House',
+            'description': 'md5:05b009d2d145a1e85d25111bd37222e8',
         },
         'params': {
             # m3u8 download
             'skip_download': True,
         },
     }, {
-        'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',
-        'only_matching': True,
+        'url': 'http://watchdisneyxd.go.com/doraemon',
+        'info_dict': {
+            'title': 'Doraemon',
+            'id': 'SH55574025',
+        },
+        'playlist_mincount': 51,
     }, {
         'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
         'only_matching': True,
@@ -60,19 +64,36 @@ class GoIE(AdobePassIE):
         'only_matching': True,
     }]
+    def _extract_videos(self, brand, video_id='-1', show_id='-1'):
+        display_id = video_id if video_id != '-1' else show_id
+        return self._download_json(
+            'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id),
+            display_id)['video']
+
     def _real_extract(self, url):
         sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
+        site_info = self._SITE_INFO[sub_domain]
+        brand = site_info['brand']
         if not video_id:
             webpage = self._download_webpage(url, display_id)
             video_id = self._search_regex(
                 # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
                 # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
-                r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id')
-        site_info = self._SITE_INFO[sub_domain]
-        brand = site_info['brand']
-        video_data = self._download_json(
-            'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id),
-            video_id)['video'][0]
+                r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None)
+            if not video_id:
+                # Show extraction works for Disney, DisneyJunior and DisneyXD;
+                # ABC and Freeform have a different layout
+                show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id')
+                videos = self._extract_videos(brand, show_id=show_id)
+                show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False)
+                entries = []
+                for video in videos:
+                    entries.append(self.url_result(
+                        video['url'], 'Go', video.get('id'), video.get('title')))
+                entries.reverse()
+                return self.playlist_result(entries, show_id, show_title)
+        video_data = self._extract_videos(brand, video_id)[0]
+        video_id = video_data['id']
         title = video_data['title']
         formats = []
@@ -105,7 +126,7 @@ class GoIE(AdobePassIE):
         self._initialize_geo_bypass(['US'])
         entitlement = self._download_json(
             'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
-            video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers())
+            video_id, data=urlencode_postdata(data))
         errors = entitlement.get('errors', {}).get('errors', [])
         if errors:
             for error in errors:
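Both call sites of _extract_videos() above collapse into the same contents URL, with '-1' apparently serving as a wildcard slot: pass a video id to fetch a single video, or a show id (leaving the video id as '-1') to list everything for a show. A sketch of the two call shapes, using the URL layout from the diff; the ids come from the tests above, while the brand codes here are placeholders, not values confirmed by this diff:

    API_TMPL = ('http://api.contents.watchabc.go.com/vp2/ws/contents/3000/'
                'videos/%s/001/-1/%s/-1/%s/-1/-1.json')

    def contents_url(brand, video_id='-1', show_id='-1'):
        # '-1' is left in whichever slot is not being queried.
        return API_TMPL % (brand, show_id, video_id)

    # Single-video lookup, as in the VDKA3807643 test above:
    print(contents_url('001', video_id='VDKA3807643'))  # brand '001' is a placeholder
    # Whole-show listing, as in the Doraemon/SH55574025 test above:
    print(contents_url('009', show_id='SH55574025'))    # brand '009' is a placeholder

diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py
index 3550eca7c..9b2e1c164 100644
--- a/youtube_dl/extractor/go90.py
+++ b/youtube_dl/extractor/go90.py
@@ -5,6 +5,7 @@ import re
 from .common import InfoExtractor
 from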
..utils import ( + determine_ext, int_or_none, parse_iso8601, ) @@ -18,7 +19,7 @@ class Go90IE(InfoExtractor): 'info_dict': { 'id': '84BUqjLpf9D', 'ext': 'mp4', - 'title': 'Inside The Utah Coalition Against Pornography Convention', + 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention', 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.', 'timestamp': 1491868800, 'upload_date': '20170411', @@ -32,11 +33,28 @@ class Go90IE(InfoExtractor): video_id, headers={ 'Content-Type': 'application/json; charset=utf-8', }, data=b'{"client":"web","device_type":"pc"}') - title = video_data['title'] main_video_asset = video_data['main_video_asset'] + episode_number = int_or_none(video_data.get('episode_number')) + series = None + season = None + season_id = None + season_number = None + for metadata in video_data.get('__children', {}).get('Item', {}).values(): + if metadata.get('type') == 'show': + series = metadata.get('title') + elif metadata.get('type') == 'season': + season = metadata.get('title') + season_id = metadata.get('id') + season_number = int_or_none(metadata.get('season_number')) + + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + thumbnails = [] formats = [] + subtitles = {} for asset in video_data.get('assets'): if asset.get('id') == main_video_asset: for source in asset.get('sources', []): @@ -70,6 +88,15 @@ class Go90IE(InfoExtractor): 'height': int_or_none(source.get('height')), 'tbr': int_or_none(source.get('bitrate')), }) + + for caption in asset.get('caption_metadata', []): + caption_url = caption.get('source_url') + if not caption_url: + continue + subtitles.setdefault(caption.get('language', 'en'), []).append({ + 'url': caption_url, + 'ext': determine_ext(caption_url, 'vtt'), + }) elif asset.get('type') == 'image': asset_location = asset.get('location') if not asset_location: @@ -89,4 +116,11 @@ class Go90IE(InfoExtractor): 'description': video_data.get('short_description'), 'like_count': int_or_none(video_data.get('like_count')), 'timestamp': parse_iso8601(video_data.get('released_at')), + 'series': series, + 'episode': episode, + 'season': season, + 'season_id': season_id, + 'season_number': season_number, + 'episode_number': episode_number, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 9fb71e8ef..fe425e786 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE): def _extract_http_audio(self, webpage, video_id): fields = self._hidden_inputs(webpage) - http_audio_url = fields['filename'] - if http_audio_url is None: + http_audio_url = fields.get('filename') + if not http_audio_url: return [] cookies_header = {'Cookie': self._extract_cookies(webpage)} diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c1921cbcf..4667335e0 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -112,7 +112,8 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) (video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count, height, width) = [None] * 10 + uploader_id, like_count, comment_count, comments, height, + width) = [None] * 11 shared_data = self._parse_json( self._search_regex( @@ -121,7 +122,10 @@ class 
InstagramIE(InfoExtractor): video_id, fatal=False) if shared_data: media = try_get( - shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict) + shared_data, + (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], + lambda x: x['entry_data']['PostPage'][0]['media']), + dict) if media: video_url = media.get('video_url') height = int_or_none(media.get('dimensions', {}).get('height')) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 2af6a6db4..fdfa7de9e 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -189,7 +189,11 @@ class IqiyiIE(InfoExtractor): 'only_matching': True, }, { 'url': 'http://yule.iqiyi.com/pcb.html', - 'only_matching': True, + 'info_dict': { + 'id': '4a0af228fddb55ec96398a364248ed7f', + 'ext': 'mp4', + 'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰', + }, }, { # VIP-only video. The first 2 parts (6 minutes) are available without login # MD5 sums omitted as values are different on Travis CI and my machine @@ -337,15 +341,18 @@ class IqiyiIE(InfoExtractor): url, 'temp_id', note='download video page') # There's no simple way to determine whether an URL is a playlist or not - # So detect it - playlist_result = self._extract_playlist(webpage) - if playlist_result: - return playlist_result - + # Sometimes there are playlist links in individual videos, so treat it + # as a single video first tvid = self._search_regex( - r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') + r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None) + if tvid is None: + playlist_result = self._extract_playlist(webpage) + if playlist_result: + return playlist_result + raise ExtractorError('Can\'t find any video') + video_id = self._search_regex( - r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') + r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') formats = [] for _ in range(5): @@ -377,7 +384,8 @@ class IqiyiIE(InfoExtractor): self._sort_formats(formats) title = (get_element_by_id('widget-videotitle', webpage) or - clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))) + clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or + self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title')) return { 'id': video_id, diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 021c6b278..f3156804d 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -116,13 +116,25 @@ class ITVIE(InfoExtractor): if not play_path: continue tbr = int_or_none(media_file.get('bitrate'), 1000) - formats.append({ + f = { 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'url': rtmp_url, 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, 'tbr': tbr, 'ext': 'flv', - }) + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) ios_playlist_url = params.get('data-video-playlist') hmac = params.get('data-video-hmac') @@ -172,7 +184,9 @@ class ITVIE(InfoExtractor): href = ios_base_url + href ext = determine_ext(href) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False)) + 
formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) else: formats.append({ 'url': href, @@ -189,7 +203,8 @@ class ITVIE(InfoExtractor): 'ext': 'ttml' if ext == 'xml' else ext, }) - return { + info = self._search_json_ld(webpage, video_id, default={}) + info.update({ 'id': video_id, 'title': title, 'formats': formats, @@ -198,4 +213,5 @@ class ITVIE(InfoExtractor): 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), 'series': xpath_text(playlist, 'ProgrammeTitle'), 'duartion': parse_duration(xpath_text(playlist, 'Duration')), - } + }) + return info diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 9eda956d2..0a07c1320 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,7 +23,6 @@ from ..utils import ( str_or_none, url_basename, urshift, - update_url_query, ) @@ -51,7 +50,7 @@ class LeIE(InfoExtractor): 'id': '1415246', 'ext': 'mp4', 'title': '美人天下01', - 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', + 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d', }, 'params': { 'hls_prefer_native': True, @@ -69,7 +68,6 @@ class LeIE(InfoExtractor): 'params': { 'hls_prefer_native': True, }, - 'skip': 'Only available in China', }, { 'url': 'http://sports.le.com/video/25737697.html', 'only_matching': True, @@ -81,7 +79,7 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf + # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: @@ -90,15 +88,8 @@ class LeIE(InfoExtractor): return param1 def calc_time_key(self, param1): - _loc2_ = 773625421 - _loc3_ = self.ror(param1, _loc2_ % 13) - _loc3_ = _loc3_ ^ _loc2_ - _loc3_ = self.ror(_loc3_, _loc2_ % 17) - return _loc3_ - - # reversed from http://jstatic.letvcdn.com/sdk/player.js - def get_mms_key(self, time): - return self.ror(time, 8) ^ 185025305 + _loc2_ = 185025305 + return self.ror(param1, _loc2_ % 17) ^ _loc2_ # see M3U8Encryption class in KLetvPlayer.swf @staticmethod @@ -122,7 +113,7 @@ class LeIE(InfoExtractor): def _check_errors(self, play_json): # Check for errors - playstatus = play_json['playstatus'] + playstatus = play_json['msgs']['playstatus'] if playstatus['status'] == 0: flag = playstatus['flag'] if flag == 1: @@ -134,58 +125,31 @@ class LeIE(InfoExtractor): media_id = self._match_id(url) page = self._download_webpage(url, media_id) - play_json_h5 = self._download_json( - 'http://api.le.com/mms/out/video/playJsonH5', - media_id, 'Downloading html5 playJson data', query={ - 'id': media_id, - 'platid': 3, - 'splatid': 304, - 'format': 1, - 'tkey': self.get_mms_key(int(time.time())), - 'domain': 'www.le.com', - 'tss': 'no', - }, - headers=self.geo_verification_headers()) - self._check_errors(play_json_h5) - play_json_flash = self._download_json( - 'http://api.le.com/mms/out/video/playJson', + 'http://player-pc.le.com/mms/out/video/playJson', media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, 'splatid': 101, 'format': 1, + 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), 'domain': 'www.le.com', + 'region': 'cn', }, headers=self.geo_verification_headers()) self._check_errors(play_json_flash) - def get_h5_urls(media_url, format_id): - location = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id, query={ - 
'format': 1, - 'expect': 3, - 'tss': 'no', - })['location'] - - return { - 'http': update_url_query(location, {'tss': 'no'}), - 'hls': update_url_query(location, {'tss': 'ios'}), - } - def get_flash_urls(media_url, format_id): - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, - 'format': 1, - 'expect': 3, - 'rateid': format_id, - }) - nodes_data = self._download_json( media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + 'Download JSON metadata for format %s' % format_id, + query={ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'tss': 'ios', + }) req = self._request_webpage( nodes_data['nodelist'][0]['location'], media_id, @@ -199,29 +163,28 @@ class LeIE(InfoExtractor): extracted_formats = [] formats = [] - for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): - playurl = play_json['playurl'] - play_domain = playurl['domain'][0] - - for format_id, format_data in playurl.get('dispatch', []).items(): - if format_id in extracted_formats: - continue - extracted_formats.append(format_id) - - media_url = play_domain + format_data[0] - for protocol, format_url in get_urls(media_url, format_id).items(): - f = { - 'url': format_url, - 'ext': determine_ext(format_data[1]), - 'format_id': '%s-%s' % (protocol, format_id), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', - 'quality': int_or_none(format_id), - } - - if format_id[-1:] == 'p': - f['height'] = int_or_none(format_id[:-1]) - - formats.append(f) + playurl = play_json_flash['msgs']['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_flash_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py index d3bca6435..b312e77f1 100644 --- a/youtube_dl/extractor/lego.py +++ b/youtube_dl/extractor/lego.py @@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor): formats = self._extract_akamai_formats( '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == len(self._BITRATES): self._sort_formats(m3u8_formats) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index f52c2e169..0a5a3956c 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + smuggle_url, unsmuggle_url, ExtractorError, ) @@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' + @classmethod + def 
_extract_urls(cls, webpage, source_url): + lm = { + 'Media': 'media', + 'Channel': 'channel', + 'ChannelList': 'channel_list', + } + entries = [] + for kind, video_id in re.findall( + r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', + webpage): + entries.append(cls.url_result( + smuggle_url( + 'limelight:%s:%s' % (lm[kind], video_id), + {'source_url': source_url}), + 'Limelight%s' % kind, video_id)) + for mobj in re.finditer( + # As per [1] class attribute should be exactly equal to + # LimelightEmbeddedPlayerFlash but numerous examples seen + # that don't exactly match it (e.g. [2]). + # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage + # 2. http://www.sedona.com/FacilitatorTraining2017 + r'''(?sx) + <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*? + <param[^>]+ + name=(["\'])flashVars\2[^>]+ + value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32}) + ''', webpage): + kind, video_id = mobj.group('kind'), mobj.group('id') + entries.append(cls.url_result( + smuggle_url( + 'limelight:%s:%s' % (kind, video_id), + {'source_url': source_url}), + 'Limelight%s' % kind.capitalize(), video_id)) + return entries + def _call_playlist_service(self, item_id, method, fatal=True, referer=None): headers = {} if referer: diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py new file mode 100644 index 000000000..f7fa098a5 --- /dev/null +++ b/youtube_dl/extractor/noovo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + smuggle_url, + try_get, +) + + +class NoovoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)' + _TESTS = [{ + # clip + 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial', + 'info_dict': { + 'id': '5386045029001', + 'ext': 'mp4', + 'title': 'Chrysler Imperial', + 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056', + 'timestamp': 1491399228, + 'upload_date': '20170405', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': 'RPM+', + }, + 'params': { + 'skip_download': True, + }, + }, { + # episode + 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8', + 'info_dict': { + 'id': '5395865725001', + 'title': 'Épisode 13 : Les retrouvailles', + 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'ext': 'mp4', + 'timestamp': 1492019320, + 'upload_date': '20170412', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': "L'amour est dans le pré", + 'season_number': 5, + 'episode': 'Épisode 13', + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, + video_id)['data'] + + content = try_get(data, lambda x: x['contents'][0]) + + brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + + series = try_get( + data, ( + lambda x: x['show']['title'], + lambda x: x['season']['show']['title']), + compat_str) + + episode = None + og = data.get('og') + if isinstance(og, 
dict) and og.get('type') == 'video.episode': + episode = og.get('title') + + video = content or data + + return { + '_type': 'url_transparent', + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, + 'title': video.get('title'), + 'creator': video.get('source'), + 'view_count': int_or_none(video.get('viewsCount')), + 'series': series, + 'season_number': int_or_none(try_get( + data, lambda x: x['season']['seasonNumber'])), + 'episode': episode, + 'episode_number': int_or_none(data.get('episodeNumber')), + } diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index b6c5ee6e4..f26dafb8f 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -28,7 +28,7 @@ class NownessBaseIE(InfoExtractor): bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) if bc_url: return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) - bc_url = BrightcoveNewIE._extract_url(player_code) + bc_url = BrightcoveNewIE._extract_url(self, player_code) if bc_url: return self.url_result(bc_url, BrightcoveNewIE.ie_key()) raise ExtractorError('Could not find player definition') diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 0ee56a45b..854b6800c 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_urlparse, @@ -37,7 +38,7 @@ class OdnoklassnikiIE(InfoExtractor): }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': '9676cf86eff5391d35dea675d224e131', + 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', @@ -53,7 +54,7 @@ class OdnoklassnikiIE(InfoExtractor): }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) 'url': 'http://ok.ru/video/64211978996595-1', - 'md5': '5d7475d428845cd2e13bae6f1a992278', + 'md5': '2f206894ffb5dbfcce2c5a14b909eea5', 'info_dict': { 'id': '64211978996595-1', 'ext': 'mp4', @@ -61,8 +62,8 @@ class OdnoklassnikiIE(InfoExtractor): 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', 'duration': 440, 'upload_date': '20150826', - 'uploader_id': '750099571', - 'uploader': 'Алина П', + 'uploader_id': 'tvroscosmos', + 'uploader': 'Телестудия Роскосмоса', 'age_limit': 0, }, }, { @@ -81,6 +82,7 @@ class OdnoklassnikiIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Video has not been found', }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -176,14 +178,32 @@ class OdnoklassnikiIE(InfoExtractor): }) return info - quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd', 'full')) + quality = qualities(('4', '0', '1', '2', '3', '5')) formats = [{ 'url': f['url'], 'ext': 'mp4', 'format_id': f['name'], - 'quality': quality(f['name']), } for f in metadata['videos']] + + m3u8_url = metadata.get('hlsManifestUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + dash_manifest = metadata.get('metadataEmbedded') + if dash_manifest: + formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(dash_manifest), 'mpd')) + + for fmt in formats: + fmt_type = self._search_regex( + r'\btype[/=](\d)', fmt['url'], + 'format 
type', default=None) + if fmt_type: + fmt['quality'] = quality(fmt_type) + self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3e51b4dd7..0727e381b 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + float_or_none, js_to_json, strip_jsonp, strip_or_none, @@ -464,6 +465,7 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) + chapters = [] # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -479,6 +481,20 @@ class PBSIE(InfoExtractor): extract_redirect_urls(video_info) if not info: info = video_info + if not chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + start_time = float_or_none(chapter.get('start_time'), 1000) + duration = float_or_none(chapter.get('duration'), 1000) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': chapter.get('title'), + }) formats = [] http_url = None @@ -515,7 +531,7 @@ class PBSIE(InfoExtractor): http_url = format_url self._remove_duplicate_formats(formats) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: @@ -588,4 +604,5 @@ class PBSIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, 'subtitles': subtitles, + 'chapters': chapters, } diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 073fc3e21..24c3600fe 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,10 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, -) from .common import InfoExtractor from ..utils import ( parse_duration, @@ -19,7 +15,7 @@ class Porn91IE(InfoExtractor): _TEST = { 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', - 'md5': '6df8f6d028bc8b14f5dbd73af742fb20', + 'md5': '7fcdb5349354f40d41689bd0fa8db05a', 'info_dict': { 'id': '7e42283b4f5ab36da134', 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', @@ -43,24 +39,7 @@ class Porn91IE(InfoExtractor): r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title') title = title.replace('\n', '') - # get real url - file_id = self._search_regex( - r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id') - sec_code = self._search_regex( - r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') - max_vid = self._search_regex( - r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') - url_params = compat_urllib_parse_urlencode({ - 'VID': file_id, - 'mp4': '1', - 'seccode': sec_code, - 'max_vid': max_vid, - }) - info_cn = self._download_webpage( - 'http://91porn.com/getfile.php?' 
+ url_params, video_id, - 'Downloading real video url') - video_url = compat_urllib_parse_unquote(self._search_regex( - r'file=([^&]+)&', info_cn, 'url')) + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] duration = parse_duration(self._search_regex( r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) @@ -68,11 +47,12 @@ class Porn91IE(InfoExtractor): comment_count = int_or_none(self._search_regex( r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'title': title, - 'url': video_url, 'duration': duration, 'comment_count': comment_count, 'age_limit': self._rta_search(webpage), - } + }) + + return info_dict diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index ed38c77eb..e2202d603 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -62,8 +62,7 @@ class R7IE(InfoExtractor): # m3u8 format always matches the http format, let's copy metadata from # one to another m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - formats)) + lambda f: f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == 1: f_copy = m3u8_formats[0].copy() f_copy.update(f) diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py index 9f5c237ef..34725274e 100644 --- a/youtube_dl/extractor/streamable.py +++ b/youtube_dl/extractor/streamable.py @@ -12,7 +12,7 @@ from ..utils import ( class StreamableIE(InfoExtractor): - _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)' + _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' _TESTS = [ { 'url': 'https://streamable.com/dnd1', @@ -47,6 +47,10 @@ class StreamableIE(InfoExtractor): { 'url': 'https://streamable.com/e/dnd1', 'only_matching': True, + }, + { + 'url': 'https://streamable.com/s/okkqk/drxjds', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py new file mode 100644 index 000000000..aa4fad162 --- /dev/null +++ b/youtube_dl/extractor/streamango.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, +) + + +class StreamangoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', + 'md5': 'e992787515a182f55e38fc97588d802a', + 'info_dict': { + 'id': 'clapasobsptpkdfe', + 'ext': 'mp4', + 'title': '20170315_150006.mp4', + } + }, { + 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + + formats = [] + for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): + video = self._parse_json( + format_, video_id, transform_source=js_to_json, fatal=False) + if not video: + continue + src = video.get('src') + if not src: + continue + ext = determine_ext(src, default_ext=None) + if video.get('type') == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': ext or 'mp4', + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'tbr': 
int_or_none(video.get('bitrate')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'url': url, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1b1afab32..3f3c681ae 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -210,7 +210,7 @@ class TEDIE(InfoExtractor): resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 06ea2b40a..c5b3288ad 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor): 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) formats.extend(m3u8_formats) for i, m3u8_format in enumerate(m3u8_formats, 2): http_url = '%s-%d.mp4' % (video_url_base, i) diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py index b6537141a..ebde6053f 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/youtube_dl/extractor/tvplayer.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( extract_attributes, + try_get, urlencode_postdata, ExtractorError, ) @@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor): webpage, 'channel element')) title = current_channel['data-name'] - resource_id = self._search_regex( - r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id') - platform = self._search_regex( - r'platform\s*=\s*"([^"]+)"', webpage, 'platform') + resource_id = current_channel['data-id'] + token = self._search_regex( - r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null') - validate = self._search_regex( - r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null') + r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage, + 'token', group='token') + + context = self._download_json( + 'https://tvplayer.com/watch/context', display_id, + 'Downloading JSON context', query={ + 'resource': resource_id, + 'nonce': token, + }) + + validate = context['validate'] + platform = try_get( + context, lambda x: x['platform']['key'], compat_str) or 'firefox' try: response = self._download_json( 'http://api.tvplayer.com/api/v2/stream/live', - resource_id, headers={ + display_id, 'Downloading JSON stream', headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }, data=urlencode_postdata({ + 'id': resource_id, 'service': 1, 'platform': platform, - 'id': resource_id, - 'token': token, 'validate': validate, }))['tvplayer']['response'] except ExtractorError as e: @@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor): '%s said: %s' % (self.IE_NAME, response['error']), expected=True) raise - formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4') + formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4') self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 
9aa38bc5a..890a149ea 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor from ..compat import ( @@ -11,7 +12,6 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, parse_iso8601, ) @@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE): } def _initialize_api(self, video_id): - req = sanitized_Request( - 'http://www.vevo.com/auth', data=b'') webpage = self._download_webpage( - req, None, + 'https://accounts.vevo.com/token', None, note='Retrieving oauth token', - errnote='Unable to retrieve oauth token') + errnote='Unable to retrieve oauth token', + data=json.dumps({ + 'client_id': 'SPupX1tvqFEopQ1YS6SS', + 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous', + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage): self.raise_geo_restricted( '%s said: This page is currently unavailable in your region' % self.IE_NAME) auth_info = self._parse_json(webpage, video_id) - self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] + self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token'] def _call_api(self, path, *args, **kwargs): try: diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index 049db25a5..e5f964d39 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import random import re from .common import InfoExtractor @@ -11,6 +10,7 @@ from ..utils import ( float_or_none, parse_age_limit, qualities, + random_birthday, try_get, unified_timestamp, urljoin, @@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + query = random_birthday('birth_year', 'birth_month', 'birth_day') video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, - video_id, query={ - 'birth_month': random.randint(1, 12), - 'birth_day': random.randint(1, 31), - 'birth_year': random.randint(1950, 1995), - }) + video_id, query=query) title = video['title'] diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index 4e4b4e38c..701bb1d01 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -49,8 +49,11 @@ class VidioIE(InfoExtractor): thumbnail = clip.get('image') m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url') - formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?!\1).+)\1', + webpage, 'hls url') + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) duration = int_or_none(duration or self._search_regex( r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index d0556297e..e64873bce 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -42,14 +42,15 @@ class VidziIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - packed_codes = [mobj.group(0) for mobj in 
re.finditer( - PACKED_CODES_RE, webpage)] - for num, pc in enumerate(packed_codes, 1): - code = decode_packed_codes(pc).replace('\\\'', '\'') + codes = [webpage] + codes.extend([ + decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') + for mobj in re.finditer(PACKED_CODES_RE, webpage)]) + for num, code in enumerate(codes, 1): jwplayer_data = self._parse_json( self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(packed_codes) else '{}'), + default=NO_DEFAULT if num == len(codes) else '{}'), video_id, transform_source=js_to_json) if jwplayer_data: break diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index fcf0cb100..d5d5b4c69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 839cad986..625d0a1cc 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -13,6 +13,7 @@ from ..utils import ( class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _TEST = { 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', @@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor): }, } + @classmethod + def _extract_urls(cls, webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage) + def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index deb7483ae..45cfca7c5 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -10,12 +10,14 @@ from ..utils import ( class WSJIE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?: - video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| - (?:www\.)?wsj\.com/video/[^/]+/ - ) - (?P<id>[a-zA-Z0-9-]+)''' + _VALID_URL = r'''(?x) + (?: + https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| + https?://(?:www\.)?wsj\.com/video/[^/]+/| + wsj: + ) + (?P<id>[a-fA-F0-9-]{36}) + ''' IE_DESC = 'Wall Street Journal' _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', @@ -38,12 +40,17 @@ class WSJIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - api_url = ( - 'http://video-api.wsj.com/api-video/find_all_videos.asp?' 
- 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' - 'thumbnailList,author,description,name,duration,videoURL,' - 'titletag,formattedCreationDate,keywords,editor' % video_id) - info = self._download_json(api_url, video_id)['items'][0] + info = self._download_json( + 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id, + query={ + 'type': 'guid', + 'count': 1, + 'query': video_id, + 'fields': ','.join(( + 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author', + 'description', 'name', 'duration', 'videoURL', 'titletag', + 'formattedCreationDate', 'keywords', 'editor')), + })['items'][0] title = info.get('name', info.get('titletag')) formats = [] @@ -87,3 +94,24 @@ class WSJIE(InfoExtractor): 'title': title, 'categories': info.get('keywords'), } + + +class WSJArticleIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', + 'info_dict': { + 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', + 'ext': 'mp4', + 'upload_date': '20170221', + 'uploader_id': 'ralcaraz', + 'title': 'Bao Bao the Panda Leaves for China', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + video_id = self._search_regex( + r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id') + return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 1b5cd122a..13f8be6cb 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -17,24 +17,24 @@ from ..utils import ( class XFileShareIE(InfoExtractor): _SITES = ( - ('daclips.in', 'DaClips'), - ('filehoot.com', 'FileHoot'), - ('gorillavid.in', 'GorillaVid'), - ('movpod.in', 'MovPod'), - ('powerwatch.pw', 'PowerWatch'), - ('rapidvideo.ws', 'Rapidvideo.ws'), - ('thevideobee.to', 'TheVideoBee'), - ('vidto.me', 'Vidto'), - ('streamin.to', 'Streamin.To'), - ('xvidstage.com', 'XVIDSTAGE'), - ('vidabc.com', 'Vid ABC'), - ('vidbom.com', 'VidBom'), - ('vidlo.us', 'vidlo'), + (r'daclips\.(?:in|com)', 'DaClips'), + (r'filehoot\.com', 'FileHoot'), + (r'gorillavid\.(?:in|com)', 'GorillaVid'), + (r'movpod\.in', 'MovPod'), + (r'powerwatch\.pw', 'PowerWatch'), + (r'rapidvideo\.ws', 'Rapidvideo.ws'), + (r'thevideobee\.to', 'TheVideoBee'), + (r'vidto\.me', 'Vidto'), + (r'streamin\.to', 'Streamin.To'), + (r'xvidstage\.com', 'XVIDSTAGE'), + (r'vidabc\.com', 'Vid ABC'), + (r'vidbom\.com', 'VidBom'), + (r'vidlo\.us', 'vidlo'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0])) + % '|'.join(site for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEXES = ( r'>(?:404 - )?File Not Found<', diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 5584674a0..bea9b87ad 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, + js_to_json, orderedSet, parse_duration, sanitized_Request, @@ -38,6 +39,22 @@ class XTubeIE(InfoExtractor): 'age_limit': 18, } }, { + # FLV videos with duplicated formats + 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', + 'md5': 'a406963eb349dd43692ec54631efd88b', + 'info_dict': { + 
'id': '9299752', + 'display_id': 'A-Super-Run-Part-1-YT', + 'ext': 'flv', + 'title': 'A Super Run - Part 1 (YT)', + 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'uploader': 'tshirtguy59', + 'duration': 579, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + }, + }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', 'only_matching': True, @@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor): }) sources = self._parse_json(self._search_regex( - r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),', - webpage, 'sources', group='sources'), video_id) + r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) formats = [] for format_id, format_url in sources.items(): @@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) title = self._search_regex( diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 30825daae..eca603028 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -6,8 +6,10 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( clean_html, - ExtractorError, determine_ext, + ExtractorError, + int_or_none, + parse_duration, ) @@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor): 'id': '4588838', 'ext': 'mp4', 'title': 'Biker Takes his Girl', + 'duration': 108, 'age_limit': 18, } } @@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor): r'<title>(.*?)\s+-\s+XVID', webpage, 'title') video_thumbnail = self._search_regex( r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) + video_duration = int_or_none(self._og_search_property( + 'duration', webpage, default=None)) or parse_duration( + self._search_regex( + r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)', + webpage, 'duration', fatal=False)) formats = [] @@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor): 'id': video_id, 'formats': formats, 'title': video_title, + 'duration': video_duration, 'thumbnail': video_thumbnail, 'age_limit': 18, } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4951414e9..38f82bf44 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -258,7 +258,7 @@ class YahooIE(InfoExtractor): return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) # Look for Brightcove New Studio embeds - bc_url = BrightcoveNewIE._extract_url(webpage) + bc_url = BrightcoveNewIE._extract_url(self, webpage) if bc_url: return self.url_result(bc_url, BrightcoveNewIE.ie_key()) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index fd6268ba4..eb1062142 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) + tracks = playlist['tracks'] + track_ids = [compat_str(track_id) for track_id in playlist['trackIds']] # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. 
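A short aside on the yandexmusic hunk just above, since the bug it fixes is easy to miss: on Python 3, map() returns a lazy one-shot iterator rather than a list, so the old combined assignment left track_ids as something that cannot be len()-ed or iterated twice, which matters because (per the context comment) missing tracks beyond the 150 shipped with playlist.jsx have to be retrieved manually afterwards. An illustrative sketch of the failure mode and the fix, not part of the patch:

    from youtube_dl.compat import compat_str

    track_ids = map(compat_str, [101, 102, 103])
    first = list(track_ids)   # ['101', '102', '103']
    second = list(track_ids)  # [] on Python 3 - the iterator is already exhausted
    # len(track_ids) would raise TypeError on Python 3 as well.

    # The patched code materializes a real list up front instead:
    track_ids = [compat_str(track_id) for track_id in [101, 102, 103]]
    assert len(track_ids) == 3 and track_ids == ['101', '102', '103']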
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2b9115c..480f403da 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -963,7 +963,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$', + r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1629,7 +1629,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py new file mode 100644 index 000000000..889aff5d8 --- /dev/null +++ b/youtube_dl/extractor/zaq1.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class Zaq1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://zaq1.pl/video/xev0e', + 'md5': '24a5eb3f052e604ae597c4d0d19b351e', + 'info_dict': { + 'id': 'xev0e', + 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', + 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', + 'ext': 'mp4', + 'duration': 511, + 'timestamp': 1490896361, + 'uploader': 'Anonim', + 'upload_date': '20170330', + 'view_count': int, + } + }, { + # malformed JSON-LD + 'url': 'http://zaq1.pl/video/x81vn', + 'info_dict': { + 'id': 'x81vn', + 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', + 'ext': 'mp4', + 'duration': 6234, + 'timestamp': 1493494860, + 'uploader': 'Anonim', + 'upload_date': '20170429', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'video url', group='url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + + def extract_data(field, name, fatal=False): + return self._search_regex( + r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, + webpage, field, fatal=fatal, group='field') + + if not info.get('title'): + info['title'] = extract_data('file-name', 'title', fatal=True) + + if not info.get('duration'): + info['duration'] = int_or_none(extract_data('duration', 'duration')) + + if not info.get('thumbnail'): + info['thumbnail'] = extract_data('photo-url', 'thumbnail') + + if not info.get('timestamp'): + info['timestamp'] = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + + if not info.get('interactionCount'): + info['view_count'] = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + uploader = self._html_search_regex( + r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 
'uploader', + fatal=False) + + width = int_or_none(self._html_search_meta( + 'width', webpage, fatal=False)) + height = int_or_none(self._html_search_meta( + 'height', webpage, fatal=False)) + + info.update({ + 'id': video_id, + 'formats': [{ + 'url': video_url, + 'width': width, + 'height': height, + 'http_headers': { + 'Referer': url, + }, + }], + 'uploader': uploader, + }) + + return info diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 2d2f5e47b..52309fb84 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -469,6 +469,10 @@ def parseOpts(overrideArguments=None): action='store_false', dest='skip_unavailable_fragments', help='Abort downloading when some fragment is not available') downloader.add_option( + '--keep-fragments', + action='store_true', dest='keep_fragments', default=False, + help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default') + downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', help='Size of download buffer (e.g. 1024 or 16K) (default is %default)') diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 665109558..c91ec8588 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -4,6 +4,7 @@ import io import os import subprocess import time +import re from .common import AudioConversionError, PostProcessor @@ -22,6 +23,7 @@ from ..utils import ( subtitles_filename, dfxp2srt, ISO639Utils, + replace_extension, ) @@ -429,17 +431,40 @@ class FFmpegMetadataPP(FFmpegPostProcessor): filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') + in_filenames = [filename] + options = [] if info['ext'] == 'm4a': - options = ['-vn', '-acodec', 'copy'] + options.extend(['-vn', '-acodec', 'copy']) else: - options = ['-c', 'copy'] + options.extend(['-c', 'copy']) for (name, value) in metadata.items(): options.extend(['-metadata', '%s=%s' % (name, value)]) + chapters = info.get('chapters', []) + if chapters: + metadata_filename = encodeFilename(replace_extension(filename, 'meta')) + with io.open(metadata_filename, 'wt', encoding='utf-8') as f: + def ffmpeg_escape(text): + return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text) + + metadata_file_content = ';FFMETADATA1\n' + for chapter in chapters: + metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' + metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) + metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) + chapter_title = chapter.get('title') + if chapter_title: + metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) + f.write(metadata_file_content) + in_filenames.append(metadata_filename) + options.extend(['-map_metadata', '1']) + self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename) - self.run_ffmpeg(filename, temp_filename, options) + self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options) + if chapters: + os.remove(metadata_filename) os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return [], info diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index 0f5d7bdb2..5d4adbe72 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -193,9 +193,10 @@ class sockssocket(socket.socket): self._check_response_version(SOCKS5_VERSION, version) - if method == Socks5Auth.AUTH_NO_ACCEPTABLE: + if method == Socks5Auth.AUTH_NO_ACCEPTABLE or ( + method == Socks5Auth.AUTH_USER_PASS and (not 
self._proxy.username or not self._proxy.password)): self.close() - raise Socks5Error(method) + raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE) if method == Socks5Auth.AUTH_USER_PASS: username = self._proxy.username.encode('utf-8') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 84aaac664..c67f95ac9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -11,6 +11,7 @@ import contextlib import ctypes import datetime import email.utils +import email.header import errno import functools import gzip @@ -421,8 +422,8 @@ def clean_html(html): # Newline vs <br /> html = html.replace('\n', ' ') - html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) - html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities @@ -1194,6 +1195,11 @@ def unified_timestamp(date_str, day_first=True): # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + # Remove unrecognized timezones from ISO 8601 alike timestamps + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + if m: + date_str = date_str[:-len(m.group('tz'))] + for expression in date_formats(day_first): try: dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) @@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}): return new_req +def try_multipart_encode(data, boundary): + content_type = 'multipart/form-data; boundary=%s' % boundary + + out = b'' + for k, v in data.items(): + out += b'--' + boundary.encode('ascii') + b'\r\n' + if isinstance(k, compat_str): + k = k.encode('utf-8') + if isinstance(v, compat_str): + v = v.encode('utf-8') + # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 + # suggests sending UTF-8 directly. Firefox sends UTF-8, too + content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n' + if boundary.encode('ascii') in content: + raise ValueError('Boundary overlaps with data') + out += content + + out += b'--' + boundary.encode('ascii') + b'--\r\n' + + return out, content_type + + +def multipart_encode(data, boundary=None): + ''' + Encode a dict to RFC 7578-compliant form-data + + data: + A dict where keys and values can be either Unicode or bytes-like + objects. + boundary: + If specified a Unicode object, it's used as the boundary. Otherwise + a random boundary is generated. 
+ + Reference: https://tools.ietf.org/html/rfc7578 + ''' + has_specified_boundary = boundary is not None + + while True: + if boundary is None: + boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + + try: + out, content_type = try_multipart_encode(data, boundary) + break + except ValueError: + if has_specified_boundary: + raise + boundary = None + + return out, content_type + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): if isinstance(key_or_keys, (list, tuple)): for key in key_or_keys: @@ -2103,13 +2161,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True): def try_get(src, getter, expected_type=None): - try: - v = getter(src) - except (AttributeError, KeyError, TypeError, IndexError): - pass - else: - if expected_type is None or isinstance(v, expected_type): - return v + if not isinstance(getter, (list, tuple)): + getter = [getter] + for get in getter: + try: + v = get(src) + except (AttributeError, KeyError, TypeError, IndexError): + pass + else: + if expected_type is None or isinstance(v, expected_type): + return v def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): @@ -2270,10 +2331,8 @@ def mimetype2ext(mt): return { '3gpp': '3gp', 'smptett+xml': 'tt', - 'srt': 'srt', 'ttaf+xml': 'dfxp', 'ttml+xml': 'ttml', - 'vtt': 'vtt', 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', 'x-ms-wmv': 'wmv', @@ -2281,11 +2340,11 @@ def mimetype2ext(mt): 'x-mpegurl': 'm3u8', 'vnd.apple.mpegurl': 'm3u8', 'dash+xml': 'mpd', - 'f4m': 'f4m', 'f4m+xml': 'f4m', 'hds+xml': 'f4m', 'vnd.ms-sstr+xml': 'ism', 'quicktime': 'mov', + 'mp2t': 'ts', }.get(res, res) @@ -2508,27 +2567,97 @@ def srt_subtitles_timecode(seconds): def dfxp2srt(dfxp_data): + LEGACY_NAMESPACES = ( + ('http://www.w3.org/ns/ttml', [ + 'http://www.w3.org/2004/11/ttaf1', + 'http://www.w3.org/2006/04/ttaf1', + 'http://www.w3.org/2006/10/ttaf1', + ]), + ('http://www.w3.org/ns/ttml#styling', [ + 'http://www.w3.org/ns/ttml#style', + ]), + ) + + SUPPORTED_STYLING = [ + 'color', + 'fontFamily', + 'fontSize', + 'fontStyle', + 'fontWeight', + 'textDecoration' + ] + _x = functools.partial(xpath_with_ns, ns_map={ 'ttml': 'http://www.w3.org/ns/ttml', - 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', - 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1', + 'tts': 'http://www.w3.org/ns/ttml#styling', }) + styles = {} + default_style = {} + class TTMLPElementParser(object): - out = '' + _out = '' + _unclosed_elements = [] + _applied_styles = [] def start(self, tag, attrib): - if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): - self.out += '\n' + if tag in (_x('ttml:br'), 'br'): + self._out += '\n' + else: + unclosed_elements = [] + style = {} + element_style_id = attrib.get('style') + if default_style: + style.update(default_style) + if element_style_id: + style.update(styles.get(element_style_id, {})) + for prop in SUPPORTED_STYLING: + prop_val = attrib.get(_x('tts:' + prop)) + if prop_val: + style[prop] = prop_val + if style: + font = '' + for k, v in sorted(style.items()): + if self._applied_styles and self._applied_styles[-1].get(k) == v: + continue + if k == 'color': + font += ' color="%s"' % v + elif k == 'fontSize': + font += ' size="%s"' % v + elif k == 'fontFamily': + font += ' face="%s"' % v + elif k == 'fontWeight' and v == 'bold': + self._out += '<b>' + unclosed_elements.append('b') + elif k == 'fontStyle' and v == 'italic': + self._out += '<i>' + unclosed_elements.append('i') + elif k == 'textDecoration' and v == 'underline': + self._out += '<u>' + 
unclosed_elements.append('u') + if font: + self._out += '<font' + font + '>' + unclosed_elements.append('font') + applied_style = {} + if self._applied_styles: + applied_style.update(self._applied_styles[-1]) + applied_style.update(style) + self._applied_styles.append(applied_style) + self._unclosed_elements.append(unclosed_elements) def end(self, tag): - pass + if tag not in (_x('ttml:br'), 'br'): + unclosed_elements = self._unclosed_elements.pop() + for element in reversed(unclosed_elements): + self._out += '</%s>' % element + if unclosed_elements and self._applied_styles: + self._applied_styles.pop() def data(self, data): - self.out += data + self._out += data def close(self): - return self.out.strip() + return self._out.strip() def parse_node(node): target = TTMLPElementParser() @@ -2536,13 +2665,45 @@ def dfxp2srt(dfxp_data): parser.feed(xml.etree.ElementTree.tostring(node)) return parser.close() + for k, v in LEGACY_NAMESPACES: + for ns in v: + dfxp_data = dfxp_data.replace(ns, k) + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p') + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') if not paras: raise ValueError('Invalid dfxp/TTML subtitle') + repeat = False + while True: + for style in dfxp.findall(_x('.//ttml:style')): + style_id = style.get('id') + parent_style_id = style.get('style') + if parent_style_id: + if parent_style_id not in styles: + repeat = True + continue + styles[style_id] = styles[parent_style_id].copy() + for prop in SUPPORTED_STYLING: + prop_val = style.get(_x('tts:' + prop)) + if prop_val: + styles.setdefault(style_id, {})[prop] = prop_val + if repeat: + repeat = False + else: + break + + for p in ('body', 'div'): + ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) + if ele is None: + continue + style = styles.get(ele.get('style')) + if not style: + continue + default_style.update(style) + for para, index in zip(paras, itertools.count(1)): begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) @@ -3862,3 +4023,10 @@ class PhantomJSwrapper(object): return (html, encodeArgument(out)) + +def random_birthday(year_field, month_field, day_field): + return { + year_field: str(random.randint(1950, 1995)), + month_field: str(random.randint(1, 12)), + day_field: str(random.randint(1, 31)), + } diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 612b50f7b..c19ac49b0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.04.15' +__version__ = '2017.05.01'
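Two of the utils.py additions above are easiest to understand from a short usage sketch. random_birthday() builds a query dict of random plausible birth-date strings (the videopress hunk earlier in this diff uses it for the WordPress API), and multipart_encode() turns a dict of str/bytes pairs into an RFC 7578 multipart/form-data body plus its Content-Type header, retrying with a fresh random boundary when the generated one collides with the payload. The snippet below is illustrative only and not part of the patch; the endpoint URL is invented, while the helper names and signatures come from the hunks above:

    from youtube_dl.utils import multipart_encode, random_birthday, sanitized_Request

    query = random_birthday('birth_year', 'birth_month', 'birth_day')
    # e.g. {'birth_year': '1987', 'birth_month': '4', 'birth_day': '22'} (values are random)

    # Keys and values may be str or bytes. With an explicit boundary= argument,
    # multipart_encode raises ValueError if the boundary occurs in the data;
    # left unspecified, it keeps generating random boundaries until one fits.
    body, content_type = multipart_encode({
        'username': 'user',
        'password': b'hunter2',
    })
    req = sanitized_Request(
        'https://example.com/login',  # hypothetical endpoint, not from the patch
        data=body, headers={'Content-Type': content_type})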