diff options
Diffstat (limited to 'youtube_dl')
96 files changed, 3937 insertions, 1672 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5036289b0..ba72ec6f3 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -196,8 +196,8 @@ class YoutubeDL(object): prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use - cn_verification_proxy: URL of the proxy to use for IP address verification - on Chinese sites. (Experimental) + geo_verification_proxy: URL of the proxy to use for IP address verification + on geo-restricted sites. (Experimental) socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -304,6 +304,11 @@ class YoutubeDL(object): self.params.update(params) self.cache = Cache(self) + if self.params.get('cn_verification_proxy') is not None: + self.report_warning('--cn-verification-proxy is deprecated. Use --geo-verification-proxy instead.') + if self.params.get('geo_verification_proxy') is None: + self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] + if params.get('bidi_workaround', False): try: import pty diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4905674ad..2b34bf9c2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -382,6 +382,8 @@ def _real_main(argv=None): 'external_downloader_args': external_downloader_args, 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, + 'geo_verification_proxy': opts.geo_verification_proxy, + } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 67db1c7c6..b8aaf5a46 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import binascii @@ -2594,15 +2595,19 @@ except ImportError: # Python < 3.3 return "'" + s.replace("'", "'\"'\"'") + "'" -if 
sys.version_info >= (2, 7, 3): +try: + args = shlex.split('中文') + assert (isinstance(args, list) and + isinstance(args[0], compat_str) and + args[0] == '中文') compat_shlex_split = shlex.split -else: +except (AssertionError, UnicodeEncodeError): # Working around shlex issue with unicode strings on some python 2 # versions (see http://bugs.python.org/issue1548891) def compat_shlex_split(s, comments=False, posix=True): if isinstance(s, compat_str): s = s.encode('utf-8') - return shlex.split(s, comments, posix) + return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) def compat_ord(c): diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 8f88b0241..80c21d40b 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -196,6 +196,11 @@ def build_fragments_list(boot_info): first_frag_number = fragment_run_entry_table[0]['first'] fragments_counter = itertools.count(first_frag_number) for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (for example Rai), `fragments_count` is + # abnormal and causing out-of-memory errors. It's OK to change the + # number of fragments for live streams as they are updated periodically + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 for _ in range(fragments_count): res.append((segment, next(fragments_counter))) @@ -329,7 +334,11 @@ class F4mFD(FragmentFD): base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url) + # From Adobe F4M 3.0 spec: + # The <baseURL> element SHALL be the base URL for all relative + # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said + # URLs should be relative to the location of the containing document. 
+ boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 1bbfe2641..8f53050c9 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -2,41 +2,33 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, update_url_query, unescapeHTML, + extract_attributes, + get_element_by_attribute, ) +from ..compat import ( + compat_urlparse, +) + +class AENetworksBaseIE(ThePlatformIE): + _THEPLATFORM_KEY = 'crazyjava' + _THEPLATFORM_SECRET = 's3cr3t' -class AENetworksIE(InfoExtractor): + +class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' - + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)' _TESTS = [{ - 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', - 'info_dict': { - 'id': 'g12m5Gyt3fdR', - 'ext': 'mp4', - 'title': "Bet You Didn't Know: Valentine's Day", - 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', - 'timestamp': 1375819729, - 'upload_date': '20130806', - 'uploader': 'AENE-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['ThePlatform'], - 'expected_warnings': ['JSON-LD'], - }, { 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', 'info_dict': { - 'id': 
'eg47EERs_JsZ', + 'id': '22253814', 'ext': 'mp4', 'title': 'Winter Is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', @@ -46,42 +38,168 @@ class AENetworksIE(InfoExtractor): }, 'add_ie': ['ThePlatform'], }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', + 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'info_dict': { + 'id': '71889446852', + }, + 'playlist_mincount': 5, + }, { + 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', + 'info_dict': { + 'id': 'SERIES4317', + 'title': 'Atlanta Plastic', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'only_matching': True }, { - 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage', + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True }, { - 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients', + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }] + _DOMAIN_TO_REQUESTOR_ID = { + 'history.com': 'HISTORY', + 'aetv.com': 'AETV', + 'mylifetime.com': 'LIFETIME', + 'fyi.tv': 'FYI', + } def _real_extract(self, url): - page_type, video_id = re.match(self._VALID_URL, url).groups() + domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id + webpage = self._download_webpage(url, display_id) + if show_path: + url_parts = show_path.split('/') + url_parts_len = len(url_parts) + if url_parts_len == 1: + entries = [] + for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) + return 
self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + elif url_parts_len == 2: + entries = [] + for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): + episode_attributes = extract_attributes(episode_item) + episode_url = compat_urlparse.urljoin( + url, episode_attributes['data-canonical']) + entries.append(self.url_result( + episode_url, 'AENetworks', + episode_attributes['data-videoid'])) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeasonId', webpage)) + + query = { + 'mbr': 'true', + 'assetTypes': 'medium_video_s3' + } + video_id = self._html_search_meta('aetn:VideoID', webpage) + media_url = self._search_regex( + r"media_url\s*=\s*'([^']+)'", webpage, 'video url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + if theplatform_metadata.get('AETN$isBehindWall'): + requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] + resource = '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"><channel><title>%s</title><item><title>%s</title><guid>%s</guid><media:rating scheme="urn:v-chip">%s</media:rating></item></channel></rss>' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating']) + query['auth'] = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._search_json_ld(webpage, video_id, fatal=False)) + media_url = update_url_query(media_url, query) + media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'formats': formats, + 'subtitles': 
subtitles, + }) + return info - webpage = self._download_webpage(url, video_id) - video_url_re = [ - r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, - r"media_url\s*=\s*'([^']+)'" - ] - video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) - query = {'mbr': 'true'} - if page_type == 'shows': - query['assetTypes'] = 'medium_video_s3' - if 'switch=hds' in video_url: - query['switch'] = 'hls' +class HistoryTopicIE(AENetworksBaseIE): + IE_NAME = 'history:topic' + IE_DESC = 'History.com Topic' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?' + _TESTS = [{ + 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', + 'info_dict': { + 'id': '40700995724', + 'ext': 'mp4', + 'title': "Bet You Didn't Know: Valentine's Day", + 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', + 'info_dict': + { + 'id': 'world-war-i-history', + 'title': 'World War I History', + }, + 'playlist_mincount': 24, + }, { + 'url': 'http://www.history.com/topics/world-war-i-history/videos', + 'only_matching': True, + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history', + 'only_matching': True, + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches', + 'only_matching': True, + }] - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ + def theplatform_url_result(self, theplatform_url, video_id, query): + return { '_type': 'url_transparent', + 'id': video_id, 'url': smuggle_url( - update_url_query(video_url, query), + 
update_url_query(theplatform_url, query), { 'sig': { - 'key': 'crazyjava', - 'secret': 's3cr3t'}, + 'key': self._THEPLATFORM_KEY, + 'secret': self._THEPLATFORM_SECRET, + }, 'force_smil_url': True }), - }) - return info + 'ie_key': 'ThePlatform', + } + + def _real_extract(self, url): + topic_id, video_display_id = re.match(self._VALID_URL, url).groups() + if video_display_id: + webpage = self._download_webpage(url, video_display_id) + release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() + release_url = unescapeHTML(release_url) + + return self.theplatform_url_result( + release_url, video_id, { + 'mbr': 'true', + 'switch': 'hls' + }) + else: + webpage = self._download_webpage(url, topic_id) + entries = [] + for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage): + video_attributes = extract_attributes(episode_item) + entries.append(self.theplatform_url_result( + video_attributes['data-release-url'], video_attributes['data-id'], { + 'mbr': 'true', + 'switch': 'hls' + })) + return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 8545681be..e8e40126b 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, + mimetype2ext, + determine_ext, ) @@ -50,21 +52,25 @@ class AMPIE(InfoExtractor): if isinstance(media_content, dict): media_content = [media_content] for media_data in media_content: - media = media_data['@attributes'] - media_type = media['type'] - if media_type in ('video/f4m', 'application/f4m+xml'): + media = media_data.get('@attributes', {}) + media_url = media.get('url') + if not media_url: + continue + ext = mimetype2ext(media.get('type')) or determine_ext(media_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - 
media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) - elif media_type == 'application/x-mpegURL': + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), 'url': media['url'], 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), + 'ext': ext, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9b01e38f5..9e28f2579 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor): _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' _TESTS = [{ + # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor): }, 'playlist_mincount': 4, }, { - # Film wording is used instead of Episode + # Film wording is used instead of Episode, ger/jap, Dub/OmU 'url': 'https://www.anime-on-demand.de/anime/39', 'only_matching': True, }, { - # Episodes without titles + # Episodes without titles, jap, OmU 'url': 'https://www.anime-on-demand.de/anime/162', 'only_matching': True, }, { # ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/169', 'only_matching': True, + }, { + # Full length film, non-series, ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/185', + 'only_matching': True, }] def _login(self): @@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for num, episode_html in enumerate(re.findall( - 
r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - + def extract_info(html, video_id, num=None): + title, description = [None] * 2 formats = [] for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): attributes = extract_attributes(input_) playlist_urls = [] for playlist_key in ('data-playlist', 'data-otherplaylist'): @@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(lang) if kind: format_id_list.append(kind) - if not format_id_list: + if not format_id_list and num is not None: format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) @@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor): }) formats.extend(file_formats) - if formats: - self._sort_formats(formats) + return { + 'title': title, + 'description': description, + 'formats': formats, + } + + def extract_entries(html, video_id, common_info, num=None): + info = extract_info(html, video_id, num) + + if info['formats']: + self._sort_formats(info['formats']) f = common_info.copy() - f.update({ - 'title': title, - 'description': description, - 'formats': formats, - }) + f.update(info) 
entries.append(f) - # Extract teaser only when full episode is not available - if not formats: + # Extract teaser/trailer only when full episode is not available + if not info['formats']: m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', + html) if m: f = common_info.copy() f.update({ - 'id': '%s-teaser' % f['id'], + 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), 'url': compat_urlparse.urljoin(url, m.group('href')), }) entries.append(f) + def extract_episodes(html): + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: + continue + + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + + video_id = 'episode-%d' % episode_number + + common_info = { + 'id': video_id, + 'series': anime_title, + 'episode': episode_title, + 'episode_number': episode_number, + } + + extract_entries(episode_html, video_id, common_info) + + def extract_film(html, video_id): + common_info = { + 'id': anime_id, + 'title': anime_title, + 'description': anime_description, + } + extract_entries(html, video_id, common_info) + + extract_episodes(webpage) + + if not entries: + extract_film(webpage, anime_id) + return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/appletrailers.py 
b/youtube_dl/extractor/appletrailers.py index be40f85b4..a6801f3d4 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( int_or_none, + parse_duration, + unified_strdate, ) @@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor): _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', 'info_dict': { - 'id': 'manofsteel', + 'id': '5111', + 'title': 'Man of Steel', }, 'playlist': [ { @@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor): 'id': 'blackthorn', }, 'playlist_mincount': 2, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json + 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', + 'info_dict': { + 'id': '15881', + 'title': 'Kung Fu Panda 3', + }, + 'playlist_mincount': 4, }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, @@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') + webpage = self._download_webpage(url, movie) + film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') + film_data = self._download_json( + 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, + film_id, fatal=False) + + if film_data: + entries = [] + for clip in film_data.get('clips', []): + clip_title = clip['title'] + + formats = [] + for version, version_data in clip.get('versions', {}).items(): + for size, size_data in version_data.get('sizes', {}).items(): + src = size_data.get('src') + if not src: + continue + formats.append({ + 'format_id': '%s-%s' % (version, size), + 'url': re.sub(r'_(\d+p.mov)', r'_h\1', src), + 'width': int_or_none(size_data.get('width')), + 'height': int_or_none(size_data.get('height')), + 'language': version[:2], + }) + 
self._sort_formats(formats) + + entries.append({ + 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), + 'formats': formats, + 'title': clip_title, + 'thumbnail': clip.get('screen') or clip.get('thumb'), + 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), + 'upload_date': unified_strdate(clip.get('posted')), + 'uploader_id': uploader_id, + }) + + page_data = film_data.get('page', {}) + return self.playlist_result(entries, film_id, page_data.get('movie_title')) + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index fd45b3e42..13a06396d 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -13,6 +13,7 @@ from ..utils import ( parse_duration, unified_strdate, xpath_text, + update_url_query, ) from ..compat import compat_etree_fromstring @@ -34,6 +35,7 @@ class ARDMediathekIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', @@ -44,6 +46,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', 'duration': 5252, }, + 'skip': 'HTTP Error 404: Not Found', }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', @@ -55,6 +58,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', 'duration': 3240, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, @@ -113,11 +117,14 @@ class ARDMediathekIE(InfoExtractor): continue if ext == 'f4m': 
formats.extend(self._extract_f4m_formats( - stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - video_id, preference=-1, f4m_id='hds', fatal=False)) + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), + video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False)) + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: if server and server.startswith('rtmp'): f = { @@ -231,7 +238,8 @@ class ARDIE(InfoExtractor): 'title': 'Die Story im Ersten: Mission unter falscher Flagge', 'upload_date': '20140804', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'skip': 'HTTP Error 404: Not Found', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 049f1fa9e..e0c5c1804 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -419,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'info_dict': { 'id': 'PL-013263', 'title': 'Areva & Uramin', + 'description': 'md5:a1dc0312ce357c262259139cfd48c9bf', }, 'playlist_mincount': 6, }, { diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ef560b592..57ce0c174 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -90,6 +90,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National Ballet of Canada', }, + 'skip': 'Video gone', }, { # test flv videos served by akamaihd.net @@ -108,7 +109,7 @@ class BrightcoveLegacyIE(InfoExtractor): }, }, { - # playlist test + # playlist with 'videoList' # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', 'info_dict': { @@ 
-117,6 +118,15 @@ class BrightcoveLegacyIE(InfoExtractor): }, 'playlist_mincount': 7, }, + { + # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965) + 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', + 'info_dict': { + 'id': '1522758701001', + 'title': 'Lesson 08', + }, + 'playlist_mincount': 10, + }, ] FLV_VCODECS = { 1: 'SORENSON', @@ -298,13 +308,19 @@ class BrightcoveLegacyIE(InfoExtractor): info_url, player_key, 'Downloading playlist information') json_data = json.loads(playlist_info) - if 'videoList' not in json_data: + if 'videoList' in json_data: + playlist_info = json_data['videoList'] + playlist_dto = playlist_info['mediaCollectionDTO'] + elif 'playlistTabs' in json_data: + playlist_info = json_data['playlistTabs'] + playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0] + else: raise ExtractorError('Empty playlist') - playlist_info = json_data['videoList'] - videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] + + videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']] return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], - playlist_title=playlist_info['mediaCollectionDTO']['displayName']) + playlist_title=playlist_dto['displayName']) def _extract_video_info(self, video_info): video_id = compat_str(video_info['id']) @@ -585,6 +601,13 @@ class BrightcoveNewIE(InfoExtractor): 'format_id': build_format_id('rtmp'), }) formats.append(f) + + errors = json_data.get('errors') + if not formats and errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index df503ecc0..75fa92d7c 
100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -5,6 +5,7 @@ import json import re from .common import InfoExtractor +from .facebook import FacebookIE class BuzzFeedIE(InfoExtractor): @@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'aVCR29aE_OQ', 'ext': 'mp4', + 'title': 'Angry Ram destroys a punching bag..', + 'description': 'md5:c59533190ef23fd4458a5e8c8c872345', 'upload_date': '20141024', 'uploader_id': 'Buddhanz1', - 'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl', - 'uploader': 'Buddhanz', - 'title': 'Angry Ram destroys a punching bag', + 'uploader': 'Angry Ram', } }] }, { @@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'mVmBL8B-In0', 'ext': 'mp4', + 'title': 're:Munchkin the Teddy Bear gets her exercise', + 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8', 'upload_date': '20141124', 'uploader_id': 'CindysMunchkin', - 'description': 're:© 2014 Munchkin the', 'uploader': 're:^Munchkin the', - 'title': 're:Munchkin the Teddy Bear gets her exercise', }, }] + }, { + 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', + 'info_dict': { + 'id': 'the-most-adorable-crash-landing-ever', + 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', + 'description': 'This gosling knows how to stick a landing.', + }, + 'playlist': [{ + 'md5': '763ca415512f91ca62e4621086900a23', + 'info_dict': { + 'id': '971793786185728', + 'ext': 'mp4', + 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', + 'uploader': 'Calgary Outdoor Centre-University of Calgary', + }, + }], + 'add_ie': ['Facebook'], }] def _real_extract(self, url): @@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor): continue 
entries.append(self.url_result(video['url'])) + facebook_url = FacebookIE._extract_url(webpage) + if facebook_url: + entries.append(self.url_result(facebook_url)) + return { '_type': 'playlist', 'id': playlist_id, diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 0011c3029..821db20b2 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE): media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) formats, subtitles = [], {} - if site == 'cnet': - formats, subtitles = self._extract_theplatform_smil( - self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue @@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id) info.update({ 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py index 4f9320ea5..d55b26d59 100644 --- a/youtube_dl/extractor/cliprs.py +++ b/youtube_dl/extractor/cliprs.py @@ -1,16 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, -) +from .onet import OnetBaseIE -class ClipRsIE(InfoExtractor): +class ClipRsIE(OnetBaseIE): _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+' _TEST = { 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', @@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - 
webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + mvp_id = self._search_mvp_id(webpage) - response = self._download_json( - 'http://qi.ckm.onetapi.pl/', video_id, - query={ - 'body[id]': video_id, - 'body[jsonrpc]': '2.0', - 'body[method]': 'get_asset_detail', - 'body[params][ID_Publikacji]': video_id, - 'body[params][Service]': 'www.onet.pl', - 'content-type': 'application/jsonp', - 'x-onet-app': 'player.front.onetapi.pl', - }) + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict['display_id'] = display_id - error = response.get('error') - if error: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error['message']), expected=True) - - video = response['result'].get('0') - - formats = [] - for _, formats_dict in video['formats'].items(): - if not isinstance(formats_dict, dict): - continue - for format_id, format_list in formats_dict.items(): - if not isinstance(format_list, list): - continue - for f in format_list: - if not f.get('url'): - continue - formats.append({ - 'url': f['url'], - 'format_id': format_id, - 'height': int_or_none(f.get('vertical_resolution')), - 'width': int_or_none(f.get('horizontal_resolution')), - 'abr': float_or_none(f.get('audio_bitrate')), - 'vbr': float_or_none(f.get('video_bitrate')), - }) - self._sort_formats(formats) - - meta = video.get('meta', {}) - - title = self._og_search_title(webpage, default=None) or meta['title'] - description = self._og_search_description(webpage, default=None) or meta.get('description') - duration = meta.get('length') or meta.get('lenght') - timestamp = parse_iso8601(meta.get('addDate'), ' ') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } + return info_dict diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py 
index 661889593..df546da27 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import ( sanitized_Request, unescapeHTML, unified_strdate, + unified_timestamp, url_basename, xpath_element, xpath_text, @@ -163,6 +164,7 @@ class InfoExtractor(object): * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) + * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -751,10 +753,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -803,15 +807,17 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, **kwargs): + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', html, 'JSON-LD', group='json_ld', **kwargs) if not json_ld: return {} - return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + return self._json_ld( + json_ld, video_id, fatal=kwargs.get('fatal', True), + expected_type=expected_type) - def _json_ld(self, json_ld, video_id, fatal=True): + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: @@ -819,6 +825,8 @@ class InfoExtractor(object): info = {} if 
json_ld.get('@context') == 'http://schema.org': item_type = json_ld.get('@type') + if expected_type is not None and expected_type != item_type: + return info if item_type == 'TVEpisode': info.update({ 'episode': unescapeHTML(json_ld.get('name')), @@ -837,6 +845,19 @@ class InfoExtractor(object): 'title': unescapeHTML(json_ld.get('headline')), 'description': unescapeHTML(json_ld.get('articleBody')), }) + elif item_type == 'VideoObject': + info.update({ + 'url': json_ld.get('contentUrl'), + 'title': unescapeHTML(json_ld.get('name')), + 'description': unescapeHTML(json_ld.get('description')), + 'thumbnail': json_ld.get('thumbnailUrl'), + 'duration': parse_duration(json_ld.get('duration')), + 'timestamp': unified_timestamp(json_ld.get('uploadDate')), + 'filesize': float_or_none(json_ld.get('contentSize')), + 'tbr': int_or_none(json_ld.get('bitrate')), + 'width': int_or_none(json_ld.get('width')), + 'height': int_or_none(json_ld.get('height')), + }) return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -878,7 +899,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: @@ -1781,6 +1806,13 @@ class InfoExtractor(object): def _mark_watched(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def geo_verification_headers(self): + headers = {} + geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') + if geo_verification_proxy: + headers['Ytdl-request-proxy'] = geo_verification_proxy + return headers + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py new file mode 
100644 index 000000000..5807fbac9 --- /dev/null +++ b/youtube_dl/extractor/ctv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctv.ca/video/player?vid=706966', + 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'info_dict': { + 'id': '706966', + 'ext': 'mp4', + 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', + 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', + 'upload_date': '20150919', + 'timestamp': 1442624700, + }, + 'expected_warnings': ['HTTP Error 404'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': '9c9media:ctv_web:%s' % video_id, + 'ie_key': 'NineCNineMedia', + } diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py new file mode 100644 index 000000000..1023b6130 --- /dev/null +++ b/youtube_dl/extractor/ctvnews.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import orderedSet + + +class CTVNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctvnews.ca/video?clipId=901995', + 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'info_dict': { + 'id': '901995', + 'ext': 'mp4', + 'title': 'Extended: \'That person cannot be me\' Johnson says', + 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', + 'timestamp': 1467286284, + 'upload_date': '20160630', + } + }, { + 'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224', + 'info_dict': + { + 'id': '1.2966224', + }, + 'playlist_mincount': 19, + }, { + 'url': 
'http://www.ctvnews.ca/video?binId=1.2876780', + 'info_dict': + { + 'id': '1.2876780', + }, + 'playlist_mincount': 100, + }, { + 'url': 'http://www.ctvnews.ca/1.810401', + 'only_matching': True, + }, { + 'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231', + 'only_matching': True, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + def ninecninemedia_url_result(clip_id): + return { + '_type': 'url_transparent', + 'id': clip_id, + 'url': '9c9media:ctvnews_web:%s' % clip_id, + 'ie_key': 'NineCNineMedia', + } + + if page_id.isdigit(): + return ninecninemedia_url_result(page_id) + else: + webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={ + 'ot': 'example.AjaxPageLayout.ot', + 'maxItemsPerPage': 1000000, + }) + entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( + re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + return self.playlist_result(entries, page_id) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 2e6226ea0..1f92823b7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -16,6 +16,7 @@ from ..utils import ( sanitized_Request, str_to_int, unescapeHTML, + mimetype2ext, ) @@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } ] + @staticmethod + def _extract_urls(webpage): + # Look for embedded Dailymotion player + matches = re.findall( + r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + return list(map(lambda m: unescapeHTML(m[1]), matches)) + def _real_extract(self, url): video_id = self._match_id(url) @@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor): type_ = media.get('type') if type_ == 'application/vnd.lumberjack.manifest': continue - ext = determine_ext(media_url) - if 
type_ == 'application/x-mpegURL' or ext == 'm3u8': + ext = mimetype2ext(type_) or determine_ext(media_url) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, m3u8_id='hls', fatal=False)) - elif type_ == 'application/f4m' or ext == 'f4m': + elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) else: f = { 'url': media_url, 'format_id': 'http-%s' % quality, + 'ext': ext, } m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url) if m: diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 86024a745..b5c310ccb 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -66,22 +66,32 @@ class DaumIE(InfoExtractor): 'view_count': int, 'comment_count': int, }, + }, { + # Requires dte_type=WEB (#9972) + 'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': 's3794Uf1NZeZ1qMpGpeqeRU', + 'ext': 'mp4', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회', + 'upload_date': '20160611', + }, }] def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - query = compat_urllib_parse_urlencode({'vid': video_id}) movie_data = self._download_json( - 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' 
+ query, - video_id, 'Downloading video formats info') + 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json', + video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'}) # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id): return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id) info = self._download_xml( - 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, - 'Downloading video info') + 'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id, + 'Downloading video info', query={'vid': video_id}) formats = [] for format_el in movie_data['output_list']['output_list']: diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 5deff5f30..efb8585e8 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -20,7 +20,7 @@ from ..utils import ( class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' 
def _real_extract(self, url): show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() @@ -55,30 +55,32 @@ class DCNBaseIE(InfoExtractor): 'is_live': is_live, } - def _extract_video_formats(self, webpage, video_id, entry_protocol): + def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol): formats = [] - m3u8_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None)) - - rtsp_url = self._search_regex( - r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', - }) - + format_url_base = 'http' + self._html_search_regex( + [ + r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', + r'<a[^>]+href="rtsp(://[^"]+)"' + ], webpage, 'format url') + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + # formats.extend(self._extract_mpd_formats( + # format_url_base + '/manifest.mpd', + # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_m3u8_formats( + format_url_base + '/playlist.m3u8', video_id, 'mp4', + m3u8_entry_protocol, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + format_url_base + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return formats class DCNVideoIE(DCNBaseIE): IE_NAME = 'dcn:video' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { @@ -94,7 +96,10 @@ class DCNVideoIE(DCNBaseIE): # 
m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -120,7 +125,7 @@ class DCNVideoIE(DCNBaseIE): class DCNLiveIE(DCNBaseIE): IE_NAME = 'dcn:live' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)' def _real_extract(self, url): channel_id = self._match_id(url) @@ -147,7 +152,7 @@ class DCNLiveIE(DCNBaseIE): class DCNSeasonIE(InfoExtractor): IE_NAME = 'dcn:season' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', 'info_dict': diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 113a4966f..12d28d3b9 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -51,6 +51,14 @@ class EaglePlatformIE(InfoExtractor): }] @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', + webpage) + if mobj is not None: + return mobj.group('url') + + @staticmethod def _handle_error(response): status = int_or_none(response.get('status', 200)) if status != 200: diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1b04f2fc..864c9af68 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py 
@@ -20,7 +20,10 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE +from .aenetworks import ( + AENetworksIE, + HistoryTopicIE, +) from .afreecatv import AfreecaTVIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE @@ -136,9 +139,9 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE +from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE @@ -168,6 +171,8 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE from .dailymail import DailyMailIE @@ -251,6 +256,7 @@ from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .fktv import FKTVIE from .flickr import FlickrIE +from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE @@ -276,6 +282,7 @@ from .freespeech import FreespeechIE from .freevideo import FreeVideoIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE +from .fusion import FusionIE from .gameinformer import GameInformerIE from .gamekings import GamekingsIE from .gameone import ( @@ -285,7 +292,6 @@ from .gameone import ( from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE -from .gametrailers import GametrailersIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE @@ -321,6 +327,10 @@ from .hotnewhiphop import HotNewHipHopIE from .hotstar import HotStarIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) from 
.huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE @@ -359,6 +369,7 @@ from .jove import JoveIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE +from .kamcord import KamcordIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE @@ -423,6 +434,7 @@ from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .meta import METAIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE @@ -455,6 +467,7 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, @@ -481,7 +494,6 @@ from .nbc import ( NBCNewsIE, NBCSportsIE, NBCSportsVPlayerIE, - MSNBCIE, ) from .ndr import ( NDRIE, @@ -523,6 +535,7 @@ from .nick import ( NickDeIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE @@ -570,6 +583,10 @@ from .nytimes import ( from .nuvid import NuvidIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .onet import ( + OnetIE, + OnetChannelIE, +) from .onionstudios import OnionStudiosIE from .ooyala import ( OoyalaIE, @@ -608,6 +625,7 @@ from .pluralsight import ( PluralsightCourseIE, ) from .podomatic import PodomaticIE +from .polskieradio import PolskieRadioIE from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( @@ -662,6 +680,7 @@ from .rice import RICEIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE +from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from 
.roxwel import RoxwelIE from .rtbf import RTBFIE @@ -706,10 +725,12 @@ from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE +from .sixplay import SixPlayIE from .skynewsarabia import ( SkyNewsArabiaIE, SkyNewsArabiaArticleIE, ) +from .skysports import SkySportsIE from .slideshare import SlideshareIE from .slutload import SlutloadIE from .smotri import ( @@ -891,6 +912,7 @@ from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE +from .urplay import URPlayIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( @@ -917,6 +939,7 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .vidbit import VidbitIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE @@ -1050,6 +1073,7 @@ from .youtube import ( YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, + YoutubeSharedVideoIE, YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f5bbd39d2..cdb093262 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -129,6 +129,21 @@ class FacebookIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) + if mobj is not None: + return mobj.group('url') + + # Facebook API embed + # see https://developers.facebook.com/docs/plugins/embedded-video-player + mobj = re.search(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) + if mobj is not None: + return mobj.group('url') + def _login(self): (useremail, password) = self._get_login_info() if useremail is 
None: @@ -204,12 +219,25 @@ class FacebookIE(InfoExtractor): BEFORE = '{swf.addParam(param[0], param[1]);});' AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' - m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage) - if m: - swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') + PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER) + + for m in re.findall(PATTERN, webpage): + swf_params = m.replace('\\\\', '\\').replace('\\"', '"') data = dict(json.loads(swf_params)) params_raw = compat_urllib_parse_unquote(data['params']) - video_data = json.loads(params_raw)['video_data'] + video_data_candidate = json.loads(params_raw)['video_data'] + for _, f in video_data_candidate.items(): + if not f: + continue + if isinstance(f, dict): + f = [f] + if not isinstance(f, list): + continue + if f[0].get('video_id') == video_id: + video_data = video_data_candidate + break + if video_data: + break def video_data_list2dict(video_data): ret = {} @@ -239,6 +267,8 @@ class FacebookIE(InfoExtractor): formats = [] for format_id, f in video_data.items(): + if f and isinstance(f, dict): + f = [f] if not f or not isinstance(f, list): continue for quality in ('sd', 'hd'): diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py new file mode 100644 index 000000000..acb6133ff --- /dev/null +++ b/youtube_dl/extractor/flipagram.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + try_get, + unified_timestamp, +) + + +class FlipagramIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://flipagram.com/f/nyvTSJMKId', + 'md5': '888dcf08b7ea671381f00fab74692755', + 'info_dict': { + 'id': 'nyvTSJMKId', + 'ext': 'mp4', + 'title': 'Flipagram by sjuria101 featuring 
Midnight Memories by One Direction', + 'description': 'md5:d55e32edc55261cae96a41fa85ff630e', + 'duration': 35.571, + 'timestamp': 1461244995, + 'upload_date': '20160421', + 'uploader': 'kitty juria', + 'uploader_id': 'sjuria101', + 'creator': 'kitty juria', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'comments': list, + 'formats': 'mincount:2', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json( + self._search_regex( + r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'), + video_id) + + flipagram = video_data['flipagram'] + video = flipagram['video'] + + json_ld = self._search_json_ld(webpage, video_id, default=False) + title = json_ld.get('title') or flipagram['captionText'] + description = json_ld.get('description') or flipagram.get('captionText') + + formats = [{ + 'url': video['url'], + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': int_or_none(video_data.get('size')), + }] + + preview_url = try_get( + flipagram, lambda x: x['music']['track']['previewUrl'], compat_str) + if preview_url: + formats.append({ + 'url': preview_url, + 'ext': 'm4a', + 'vcodec': 'none', + }) + + self._sort_formats(formats) + + counts = flipagram.get('counts', {}) + user = flipagram.get('user', {}) + video_data = flipagram.get('video', {}) + + thumbnails = [{ + 'url': self._proto_relative_url(cover['url']), + 'width': int_or_none(cover.get('width')), + 'height': int_or_none(cover.get('height')), + 'filesize': int_or_none(cover.get('size')), + } for cover in flipagram.get('covers', []) if cover.get('url')] + + # Note that this only retrieves comments that are initally loaded. + # For videos with large amounts of comments, most won't be retrieved. 
+ comments = [] + for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): + text = comment.get('comment') + if not text or not isinstance(text, list): + continue + comments.append({ + 'author': comment.get('user', {}).get('name'), + 'author_id': comment.get('user', {}).get('username'), + 'id': comment.get('id'), + 'text': text[0], + 'timestamp': unified_timestamp(comment.get('created')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': float_or_none(flipagram.get('duration'), 1000), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(flipagram.get('iso8601Created')), + 'uploader': user.get('name'), + 'uploader_id': user.get('username'), + 'creator': user.get('name'), + 'view_count': int_or_none(counts.get('plays')), + 'like_count': int_or_none(counts.get('likes')), + 'repost_count': int_or_none(counts.get('reflips')), + 'comment_count': int_or_none(counts.get('comments')), + 'comments': comments, + 'formats': formats, + } diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index ad94e31f3..7653975e3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -14,7 +14,10 @@ from ..utils import ( parse_duration, determine_ext, ) -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) class FranceTVBaseInfoExtractor(InfoExtractor): @@ -188,6 +191,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Dailymotion embed + 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', + 'md5': 'ee7f1828f25a648addc90cb2687b1f12', + 'info_dict': { + 'id': 'x4iiko0', + 'ext': 'mp4', + 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', + 'description': 'Au lendemain de la victoire du "oui" au référendum 
sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', + 'timestamp': 1467011958, + 'upload_date': '20160627', + 'uploader': 'France Inter', + 'uploader_id': 'x2q2ez', + }, + 'add_ie': ['Dailymotion'], }] def _real_extract(self, url): @@ -197,7 +215,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) if dmcloud_url: - return self.url_result(dmcloud_url, 'DailymotionCloud') + return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key()) + + dailymotion_urls = DailymotionIE._extract_urls(webpage) + if dailymotion_urls: + return self.playlist_result([ + self.url_result(dailymotion_url, DailymotionIE.ie_key()) + for dailymotion_url in dailymotion_urls]) video_id, catalogue = self._search_regex( (r'id-video=([^@]+@[^"]+)', diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py new file mode 100644 index 000000000..b4ab4cbb7 --- /dev/null +++ b/youtube_dl/extractor/fusion.py @@ -0,0 +1,35 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class FusionIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', + 'info_dict': { + 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', + 'ext': 'mp4', + 'title': 'U.S. 
and Panamanian forces work together to stop a vessel smuggling drugs', + 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', + 'duration': 140.0, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://fusion.net/video/201781', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + ooyala_code = self._search_regex( + r'data-video-id=(["\'])(?P<code>.+?)\1', + webpage, 'ooyala code', group='code') + + return OoyalaIE._build_url_result(ooyala_code) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py deleted file mode 100644 index 1e7948ab8..000000000 --- a/youtube_dl/extractor/gametrailers.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_age_limit, - url_basename, -) - - -class GametrailersIE(InfoExtractor): - _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' - - _TEST = { - 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', - 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', - 'info_dict': { - 'id': '2983958', - 'ext': 'mp4', - 'display_id': '116437-Just-Cause-3-Review', - 'title': 'Just Cause 3 - Review', - 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.+?)\|', webpage, 'title').strip() - embed_url = self._proto_relative_url( - self._search_regex( - r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, - 'embed url'), - scheme='http:') - video_id = url_basename(embed_url) - embed_page = self._download_webpage(embed_url, video_id) - 
embed_vars_json = self._search_regex( - r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, - 'embed vars') - info = self._parse_json(embed_vars_json, video_id) - - formats = [] - for media in info['media']: - if media['mediaPurpose'] == 'play': - formats.append({ - 'url': media['uri'], - 'height': media['height'], - 'width:': media['width'], - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('thumbUri'), - 'description': self._og_search_description(webpage), - 'duration': int_or_none(info.get('videoLengthInSeconds')), - 'age_limit': parse_age_limit(info.get('audienceRating')), - } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4aa24061c..cddd1a817 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -49,7 +49,10 @@ from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE @@ -64,6 +67,9 @@ from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .vessel import VesselIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE class GenericIE(InfoExtractor): @@ -920,6 +926,24 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura embedded via quoted entry_id + 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', + 'info_dict': { + 'id': '0_utuok90b', + 'ext': 'mp4', + 'title': '06_matthew_brender_raj_dutt', + 'timestamp': 1466638791, + 'upload_date': '20160622', + }, + 'add_ie': ['Kaltura'], + 'expected_warnings': [ + 
'Could not send HEAD request' + ], + 'params': { + 'skip_download': True, + } + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1091,12 +1115,17 @@ class GenericIE(InfoExtractor): # Dailymotion Cloud video { 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': '49444254273501a64675a7e68c502681', + 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', 'info_dict': { - 'id': '5585de919473990de4bee11b', + 'id': 'x2uy8t3', 'ext': 'mp4', - 'title': 'Le débat', + 'title': 'Sauvons les abeilles ! - Le débat', + 'description': 'md5:d9082128b1c5277987825d684939ca26', 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1434970506, + 'upload_date': '20150622', + 'uploader': 'Public Sénat', + 'uploader_id': 'xa9gza', } }, # OnionStudios embed @@ -1220,6 +1249,102 @@ class GenericIE(InfoExtractor): 'uploader': 'www.hudl.com', }, }, + # twitter:player embed + { + 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', + 'md5': 'a3e0df96369831de324f0778e126653c', + 'info_dict': { + 'id': '4909620399001', + 'ext': 'mp4', + 'title': 'What Do Black Holes Sound Like?', + 'description': 'what do black holes sound like', + 'upload_date': '20160524', + 'uploader_id': '29913724001', + 'timestamp': 1464107587, + 'uploader': 'TheAtlantic', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + # Facebook <iframe> embed + { + 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', + 'md5': 'fbcde74f534176ecb015849146dd3aee', + 'info_dict': { + 'id': '599637780109885', + 'ext': 'mp4', + 'title': 'Facebook video #599637780109885', + }, + }, + # Facebook API embed + { + 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', + 'md5': 'a47372ee61b39a7b90287094d447d94e', + 'info_dict': { + 'id': '10153467542406923', + 'ext': 'mp4', + 'title': 'Facebook video #10153467542406923', + }, + 
}, + # Wordpress "YouTube Video Importer" plugin + { + 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', + 'md5': 'd16797741b560b485194eddda8121b48', + 'info_dict': { + 'id': 'HNTXWDXV9Is', + 'ext': 'mp4', + 'title': 'Blue Devils Drumline Stanford lot 2016', + 'upload_date': '20160627', + 'uploader_id': 'GENOCIDE8GENERAL10', + 'uploader': 'cylus cyrus', + }, + }, + { + # video stored on custom kaltura server + 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', + 'md5': '537617d06e64dfed891fa1593c4b30cc', + 'info_dict': { + 'id': '0_1iotm5bh', + 'ext': 'mp4', + 'title': 'Elecciones británicas: 5 lecciones para Rajoy', + 'description': 'md5:435a89d68b9760b92ce67ed227055f16', + 'uploader_id': 'videos.expansion@el-mundo.net', + 'upload_date': '20150429', + 'timestamp': 1430303472, + }, + 'add_ie': ['Kaltura'], + }, + { + # Non-standard Vimeo embed + 'url': 'https://openclassrooms.com/courses/understanding-the-web', + 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', + 'info_dict': { + 'id': '148867247', + 'ext': 'mp4', + 'title': 'Understanding the web - Teaser', + 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', + 'upload_date': '20151214', + 'uploader': 'OpenClassrooms', + 'uploader_id': 'openclassrooms', + }, + 'add_ie': ['Vimeo'], + }, + # { + # # TODO: find another test + # # http://schema.org/VideoObject + # 'url': 'https://flipagram.com/f/nyvTSJMKId', + # 'md5': '888dcf08b7ea671381f00fab74692755', + # 'info_dict': { + # 'id': 'nyvTSJMKId', + # 'ext': 'mp4', + # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + # 'description': '#love for cats.', + # 'timestamp': 1461244995, + # 'upload_date': '20160421', + # }, + # 'params': { + # 'force_generic_extractor': True, + # }, + # } ] def report_following_redirect(self, new_url): @@ -1576,12 +1701,16 @@ class GenericIE(InfoExtractor): if matches: 
return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) - # Look for embedded Dailymotion player - matches = re.findall( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + # Look for Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return _playlist_from_matches(matches, lambda m: m[-1]) + + matches = DailymotionIE._extract_urls(webpage) + if matches: + return _playlist_from_matches(matches) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1718,10 +1847,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for embedded Facebook player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Facebook') + facebook_url = FacebookIE._extract_url(webpage) + if facebook_url is not None: + return self.url_result(facebook_url, 'Facebook') # Look for embedded VK player mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -1903,18 +2031,14 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or - re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', 
webpage)) - if mobj is not None: - return self.url_result(smuggle_url( - 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), - {'source_url': url}), 'Kaltura') + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for Eagle.Platform embeds - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'EaglePlatform') + eagleplatform_url = EaglePlatformIE._extract_url(webpage) + if eagleplatform_url: + return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) # Look for ClipYou (uses Eagle.Platform) embeds mobj = re.search( @@ -2060,6 +2184,24 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser + embed_url = self._html_search_meta('twitter:player', webpage, default=None) + if embed_url: + return self.url_result(embed_url) + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default=None, expected_type='VideoObject') + if json_ld and json_ld.get('url'): + info_dict.update({ + 'title': video_title or info_dict['title'], + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit + }) + info_dict.update(json_ld) + return info_dict + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py new file mode 100644 index 000000000..656ce6d05 --- /dev/null +++ b/youtube_dl/extractor/hrti.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + parse_age_limit, + sanitized_Request, + 
try_get, +) + + +class HRTiBaseIE(InfoExtractor): + """ + Base Information Extractor for Croatian Radiotelevision + video on demand site https://hrti.hrt.hr + Reverse engineered from the JavaScript app in app.min.js + """ + _NETRC_MACHINE = 'hrti' + + _APP_LANGUAGE = 'hr' + _APP_VERSION = '1.1' + _APP_PUBLICATION_ID = 'all_in_one' + _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' + + def _initialize_api(self): + init_data = { + 'application_publication_id': self._APP_PUBLICATION_ID + } + + uuid = self._download_json( + self._API_URL, None, note='Downloading uuid', + errnote='Unable to download uuid', + data=json.dumps(init_data).encode('utf-8'))['uuid'] + + app_data = { + 'uuid': uuid, + 'application_publication_id': self._APP_PUBLICATION_ID, + 'application_version': self._APP_VERSION + } + + req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) + req.get_method = lambda: 'PUT' + + resources = self._download_json( + req, None, note='Downloading session information', + errnote='Unable to download session information') + + self._session_id = resources['session_id'] + + modules = resources['modules'] + + self._search_url = modules['vod_catalog']['resources']['search']['uri'].format( + language=self._APP_LANGUAGE, + application_id=self._APP_PUBLICATION_ID) + + self._login_url = (modules['user']['resources']['login']['uri'] + + '/format/json').format(session_id=self._session_id) + + self._logout_url = modules['user']['resources']['logout']['uri'] + + def _login(self): + (username, password) = self._get_login_info() + # TODO: figure out authentication with cookies + if username is None or password is None: + self.raise_login_required() + + auth_data = { + 'username': username, + 'password': password, + } + + try: + auth_info = self._download_json( + self._login_url, None, note='Logging in', errnote='Unable to log in', + data=json.dumps(auth_data).encode('utf-8')) + except ExtractorError as e: + if 
isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: + auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) + else: + raise + + error_message = auth_info.get('error', {}).get('message') + if error_message: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_message), + expected=True) + + self._token = auth_info['secure_streaming_token'] + + def _real_initialize(self): + self._initialize_api() + self._login() + + +class HRTiIE(HRTiBaseIE): + _VALID_URL = r'''(?x) + (?: + hrti:(?P<short_id>[0-9]+)| + https?:// + hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? + ) + ''' + _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', + 'info_dict': { + 'id': '2181385', + 'display_id': 'republika-dokumentarna-serija-16-hd', + 'ext': 'mp4', + 'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)', + 'description': 'md5:48af85f620e8e0e1df4096270568544f', + 'duration': 2922, + 'view_count': int, + 'average_rating': int, + 'episode_number': int, + 'season_number': int, + 'age_limit': 12, + }, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/', + 'only_matching': True, + }, { + 'url': 'hrti:2181385', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('short_id') or mobj.group('id') + display_id = mobj.group('display_id') or video_id + + video = self._download_json( + '%s/video_id/%s/format/json' % (self._search_url, video_id), + display_id, 'Downloading video metadata JSON')['video'][0] + + title_info = video['title'] + title = title_info['title_long'] + + movie = video['video_assets']['movie'][0] + m3u8_url = movie['url'].format(TOKEN=self._token) + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + description = clean_html(title_info.get('summary_long')) + 
age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) + view_count = int_or_none(video.get('views')) + average_rating = int_or_none(video.get('user_rating')) + duration = int_or_none(movie.get('duration')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'formats': formats, + } + + +class HRTiPlaylistIE(HRTiBaseIE): + _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' + _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', + 'info_dict': { + 'id': '212', + 'title': 'ekumena', + }, + 'playlist_mincount': 8, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + category_id = mobj.group('id') + display_id = mobj.group('display_id') or category_id + + response = self._download_json( + '%s/category_id/%s/format/json' % (self._search_url, category_id), + display_id, 'Downloading video metadata JSON') + + video_ids = try_get( + response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], + list) or [video['id'] for video in response.get('videos', []) if video.get('id')] + + entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] + + return self.playlist_result(entries, category_id, display_id) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ddcb3c916..01c7b3042 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -3,28 +3,22 @@ from __future__ import unicode_literals import hashlib import itertools -import math -import os -import random import re import time -import uuid from .common import InfoExtractor from ..compat import ( - compat_parse_qs, compat_str, 
compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, ) from ..utils import ( + clean_html, decode_packed_codes, + get_element_by_id, + get_element_by_attribute, ExtractorError, ohdave_rsa_encrypt, remove_start, - sanitized_Request, - urlencode_postdata, - url_basename, ) @@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - 'md5': '2cb594dc2781e6c941a110d8f358118b', + # MD5 checksum differs on my machine and Travis CI 'info_dict': { 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'ext': 'mp4', 'title': '美国德州空中惊现奇异云团 酷似UFO', - 'ext': 'f4v', } }, { 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'md5': '667171934041350c5de3f5015f7f1152', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb', - 'title': '名侦探柯南第752集', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }], - 'params': { - 'skip_download': True, + 'ext': 'mp4', + 'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇', }, + 'skip': 'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', 'only_matching': True, @@ 
-250,22 +195,10 @@ class IqiyiIE(InfoExtractor): 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html', 'info_dict': { 'id': 'f3cf468b39dddb30d676f89a91200dc1', + 'ext': 'mp4', 'title': '泰坦尼克号', }, - 'playlist': [{ - 'info_dict': { - 'id': 'f3cf468b39dddb30d676f89a91200dc1_part1', - 'ext': 'f4v', - 'title': '泰坦尼克号', - }, - }, { - 'info_dict': { - 'id': 'f3cf468b39dddb30d676f89a91200dc1_part2', - 'ext': 'f4v', - 'title': '泰坦尼克号', - }, - }], - 'expected_warnings': ['Needs a VIP account for full video'], + 'skip': 'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html', 'info_dict': { @@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor): 'only_matching': True, }] - _FORMATS_MAP = [ - ('1', 'h6'), - ('2', 'h5'), - ('3', 'h4'), - ('4', 'h3'), - ('5', 'h2'), - ('10', 'h1'), - ] - - AUTH_API_ERRORS = { - # No preview available (不允许试看鉴权失败) - 'Q00505': 'This video requires a VIP account', - # End of preview time (试看结束鉴权失败) - 'Q00506': 'Needs a VIP account for full video', + _FORMATS_MAP = { + '96': 1, # 216p, 240p + '1': 2, # 336p, 360p + '2': 3, # 480p, 504p + '21': 4, # 504p + '4': 5, # 720p + '17': 5, # 720p + '5': 6, # 1072p, 1080p + '18': 7, # 1080p } def _real_initialize(self): @@ -352,177 +280,23 @@ class IqiyiIE(InfoExtractor): return True - def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning): - auth_params = { - # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as - 'version': '2.0', - 'platform': 'b6c13e26323c537d', - 'aid': tvid, - 'tvid': tvid, - 'uid': '', - 'deviceId': _uuid, - 'playType': 'main', # XXX: always main? 
- 'filename': os.path.splitext(url_basename(api_video_url))[0], - } + def get_raw_data(self, tvid, video_id): + tm = int(time.time() * 1000) - qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query) - for key, val in qd_items.items(): - auth_params[key] = val[0] - - auth_req = sanitized_Request( - 'http://api.vip.iqiyi.com/services/ckn.action', - urlencode_postdata(auth_params)) - # iQiyi server throws HTTP 405 error without the following header - auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - auth_result = self._download_json( - auth_req, video_id, - note='Downloading video authentication JSON', - errnote='Unable to download video authentication JSON') - - code = auth_result.get('code') - msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code - if code == 'Q00506': - if do_report_warning: - self.report_warning(msg) - return False - if 'data' not in auth_result: - if msg is not None: - raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unexpected error from Iqiyi auth API') - - return auth_result['data'] - - def construct_video_urls(self, data, video_id, _uuid, tvid): - def do_xor(x, y): - a = y % 3 - if a == 1: - return x ^ 121 - if a == 2: - return x ^ 72 - return x ^ 103 - - def get_encode_code(l): - a = 0 - b = l.split('-') - c = len(b) - s = '' - for i in range(c - 1, -1, -1): - a = do_xor(int(b[c - i - 1], 16), i) - s += chr(a) - return s[::-1] - - def get_path_key(x, format_id, segment_index): - mg = ')(*&^flash@#$%a' - tm = self._download_json( - 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, - note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) - )['t'] - t = str(int(math.floor(int(tm) / (600.0)))) - return md5_text(t + mg + x) - - video_urls_dict = {} - need_vip_warning_report = True - for format_item in data['vp']['tkl'][0]['vs']: - if 0 < int(format_item['bid']) <= 10: - format_id = 
self.get_format(format_item['bid']) - else: - continue - - video_urls = [] - - video_urls_info = format_item['fs'] - if not format_item['fs'][0]['l'].startswith('/'): - t = get_encode_code(format_item['fs'][0]['l']) - if t.endswith('mp4'): - video_urls_info = format_item['flvs'] - - for segment_index, segment in enumerate(video_urls_info): - vl = segment['l'] - if not vl.startswith('/'): - vl = get_encode_code(vl) - is_vip_video = '/vip/' in vl - filesize = segment['b'] - base_url = data['vp']['du'].split('/') - if not is_vip_video: - key = get_path_key( - vl.split('/')[-1].split('.')[0], format_id, segment_index) - base_url.insert(-1, key) - base_url = '/'.join(base_url) - param = { - 'su': _uuid, - 'qyid': uuid.uuid4().hex, - 'client': '', - 'z': '', - 'bt': '', - 'ct': '', - 'tn': str(int(time.time())) - } - api_video_url = base_url + vl - if is_vip_video: - api_video_url = api_video_url.replace('.f4v', '.hml') - auth_result = self._authenticate_vip_video( - api_video_url, video_id, tvid, _uuid, need_vip_warning_report) - if auth_result is False: - need_vip_warning_report = False - break - param.update({ - 't': auth_result['t'], - # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as - 'cid': 'afbe8fd3d73448c9', - 'vid': video_id, - 'QY00001': auth_result['u'], - }) - api_video_url += '?' if '?' 
not in api_video_url else '&' - api_video_url += compat_urllib_parse_urlencode(param) - js = self._download_json( - api_video_url, video_id, - note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) - video_url = js['l'] - video_urls.append( - (video_url, filesize)) - - video_urls_dict[format_id] = video_urls - return video_urls_dict - - def get_format(self, bid): - matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] - return matched_format_ids[0] if len(matched_format_ids) else None - - def get_bid(self, format_id): - matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] - return matched_bids[0] if len(matched_bids) else None - - def get_raw_data(self, tvid, video_id, enc_key, _uuid): - tm = str(int(time.time())) - tail = tm + tvid - param = { - 'key': 'fvip', - 'src': md5_text('youtube-dl'), - 'tvId': tvid, + key = 'd5fb4bd9d50c4be6948c97edd7254b0e' + sc = md5_text(compat_str(tm) + key + tvid) + params = { + 'tvid': tvid, 'vid': video_id, - 'vinfo': 1, - 'tm': tm, - 'enc': md5_text(enc_key + tail), - 'qyid': _uuid, - 'tn': random.random(), - # In iQiyi's flash player, um is set to 1 if there's a logged user - # Some 1080P formats are only available with a logged user. - # Here force um=1 to trick the iQiyi server - 'um': 1, - 'authkey': md5_text(md5_text('') + tail), - 'k_tag': 1, + 'src': '76f90cbd92f94a2e925d83e8ccd22cb7', + 'sc': sc, + 't': tm, } - api_url = 'http://cache.video.qiyi.com/vms' + '?' 
+ \ - compat_urllib_parse_urlencode(param) - raw_data = self._download_json(api_url, video_id) - return raw_data - - def get_enc_key(self, video_id): - # TODO: automatic key extraction - # last update at 2016-01-22 for Zombie::bite - enc_key = '4a1caba4b4465345366f28da7c117d20' - return enc_key + return self._download_json( + 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), + video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), + query=params, headers=self.geo_verification_headers()) def _extract_playlist(self, webpage): PAGE_SIZE = 50 @@ -571,58 +345,41 @@ class IqiyiIE(InfoExtractor): r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - _uuid = uuid.uuid4().hex - - enc_key = self.get_enc_key(video_id) - - raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) - - if raw_data['code'] != 'A000000': - raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) - - data = raw_data['data'] - - title = data['vi']['vn'] - - # generate video_urls_dict - video_urls_dict = self.construct_video_urls( - data, video_id, _uuid, tvid) - - # construct info - entries = [] - for format_id in video_urls_dict: - video_urls = video_urls_dict[format_id] - for i, video_url_info in enumerate(video_urls): - if len(entries) < i + 1: - entries.append({'formats': []}) - entries[i]['formats'].append( - { - 'url': video_url_info[0], - 'filesize': video_url_info[-1], - 'format_id': format_id, - 'preference': int(self.get_bid(format_id)) - } - ) - - for i in range(len(entries)): - self._sort_formats(entries[i]['formats']) - entries[i].update( - { - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - } - ) - - if len(entries) > 1: - info = { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'entries': entries, - } - else: - info = entries[0] - info['id'] = video_id - info['title'] = title - - return info + + formats = [] 
+ for _ in range(5): + raw_data = self.get_raw_data(tvid, video_id) + + if raw_data['code'] != 'A00000': + if raw_data['code'] == 'A00111': + self.raise_geo_restricted() + raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) + + data = raw_data['data'] + + for stream in data['vidl']: + if 'm3utx' not in stream: + continue + vd = compat_str(stream['vd']) + formats.append({ + 'url': stream['m3utx'], + 'format_id': vd, + 'ext': 'mp4', + 'preference': self._FORMATS_MAP.get(vd, -1), + 'protocol': 'm3u8_native', + }) + + if formats: + break + + self._sleep(5, video_id) + + self._sort_formats(formats) + title = (get_element_by_id('widget-videotitle', webpage) or + clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a65697ff5..1729f5bfb 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -6,7 +6,6 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, compat_parse_qs, ) @@ -15,6 +14,7 @@ from ..utils import ( ExtractorError, int_or_none, unsmuggle_url, + smuggle_url, ) @@ -34,7 +34,8 @@ class KalturaIE(InfoExtractor): )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))? ) ''' - _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' + _SERVICE_URL = 'http://cdnapi.kaltura.com' + _SERVICE_BASE = '/api_v3/index.php' _TESTS = [ { 'url': 'kaltura:269692:1_1jc2y3e4', @@ -64,16 +65,50 @@ class KalturaIE(InfoExtractor): } ] - def _kaltura_api_call(self, video_id, actions, *args, **kwargs): + @staticmethod + def _extract_url(webpage): + mobj = ( + re.search( + r"""(?xs) + kWidget\.(?:thumb)?[Ee]mbed\( + \{.*? + (?P<q1>['\"])wid(?P=q1)\s*:\s* + (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*? 
+ (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s* + (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4), + """, webpage) or + re.search( + r'''(?xs) + (?P<q1>["\']) + (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*? + (?P=q1).*? + (?: + entry_?[Ii]d| + (?P<q2>["\'])entry_?[Ii]d(?P=q2) + )\s*:\s* + (?P<q3>["\'])(?P<id>.+?)(?P=q3) + ''', webpage)) + if mobj: + embed_info = mobj.groupdict() + url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + escaped_pid = re.escape(embed_info['partner_id']) + service_url = re.search( + r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), + webpage) + if service_url: + url = smuggle_url(url, {'service_url': service_url.group(1)}) + return url + + def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): params = actions[0] if len(actions) > 1: for i, a in enumerate(actions[1:], start=1): for k, v in a.items(): params['%d:%s' % (i, k)] = v - query = compat_urllib_parse_urlencode(params) - url = self._API_BASE + query - data = self._download_json(url, video_id, *args, **kwargs) + data = self._download_json( + (service_url or self._SERVICE_URL) + self._SERVICE_BASE, + video_id, query=params, *args, **kwargs) status = data if len(actions) == 1 else data[0] if status.get('objectType') == 'KalturaAPIException': @@ -82,7 +117,7 @@ class KalturaIE(InfoExtractor): return data - def _get_kaltura_signature(self, video_id, partner_id): + def _get_kaltura_signature(self, video_id, partner_id, service_url=None): actions = [{ 'apiVersion': '3.1', 'expiry': 86400, @@ -92,10 +127,10 @@ class KalturaIE(InfoExtractor): 'widgetId': '_%s' % partner_id, }] return self._kaltura_api_call( - video_id, actions, note='Downloading Kaltura signature')['ks'] + video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] - def _get_video_info(self, video_id, partner_id): - signature = self._get_kaltura_signature(video_id, partner_id) + def _get_video_info(self, 
video_id, partner_id, service_url=None): + signature = self._get_kaltura_signature(video_id, partner_id, service_url) actions = [ { 'action': 'null', @@ -118,7 +153,7 @@ class KalturaIE(InfoExtractor): }, ] return self._kaltura_api_call( - video_id, actions, note='Downloading video info JSON') + video_id, actions, service_url, note='Downloading video info JSON') def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -127,7 +162,7 @@ class KalturaIE(InfoExtractor): partner_id, entry_id = mobj.group('partner_id', 'id') ks = None if partner_id and entry_id: - info, flavor_assets = self._get_video_info(entry_id, partner_id) + info, flavor_assets = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) else: path, query = mobj.group('path', 'query') if not path and not query: @@ -175,12 +210,17 @@ class KalturaIE(InfoExtractor): unsigned_url += '?referrer=%s' % referrer return unsigned_url + data_url = info['dataUrl'] + if '/flvclipper/' in data_url: + data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) + formats = [] for f in flavor_assets: # Continue if asset is not ready if f['status'] != 2: continue - video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id'])) + video_url = sign_url( + '%s/flavorId/%s' % (data_url, f['id'])) formats.append({ 'format_id': '%(fileExt)s-%(bitrate)s' % f, 'ext': f.get('fileExt'), @@ -193,9 +233,12 @@ class KalturaIE(InfoExtractor): 'width': int_or_none(f.get('width')), 'url': video_url, }) - m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp')) - formats.extend(self._extract_m3u8_formats( - m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + if '/playManifest/' in data_url: + m3u8_url = sign_url(data_url.replace( + 'format/url', 'format/applehttp')) + formats.extend(self._extract_m3u8_formats( + m3u8_url, entry_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._check_formats(formats, entry_id) 
self._sort_formats(formats) diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py new file mode 100644 index 000000000..b50120d98 --- /dev/null +++ b/youtube_dl/extractor/kamcord.py @@ -0,0 +1,71 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + qualities, +) + + +class KamcordIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.kamcord.com/v/hNYRduDgWb4', + 'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c', + 'info_dict': { + 'id': 'hNYRduDgWb4', + 'ext': 'mp4', + 'title': 'Drinking Madness', + 'uploader': 'jacksfilms', + 'uploader_id': '3044562', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video = self._parse_json( + self._search_regex( + r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)', + webpage, 'video'), + video_id)['video'] + + title = video['title'] + + formats = self._extract_m3u8_formats( + video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) + + uploader = video.get('user', {}).get('username') + uploader_id = video.get('user', {}).get('id') + + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('heartCount')) + comment_count = int_or_none(video.get('messageCount')) + + preference_key = qualities(('small', 'medium', 'large')) + + thumbnails = [{ + 'url': thumbnail_url, + 'id': thumbnail_id, + 'preference': preference_key(thumbnail_id), + } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items() + if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)] + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 
'like_count': like_count, + 'comment_count': comment_count, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 0221fb919..b1d460599 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -26,11 +26,6 @@ class KuwoBaseIE(InfoExtractor): def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: - headers = {} - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - headers['Ytdl-request-proxy'] = cn_verification_proxy - query = { 'format': file_format['ext'], 'br': file_format.get('br', ''), @@ -42,7 +37,7 @@ class KuwoBaseIE(InfoExtractor): song_url = self._download_webpage( 'http://antiserver.kuwo.cn/anti.s', song_id, note='Download %s url info' % file_format['format'], - query=query, headers=headers, + query=query, headers=self.geo_verification_headers(), ) if song_url == 'IPDeny' and not tolerate_ip_deny: diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index b08f6e3c9..da5a5de4a 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -1,60 +1,65 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - parse_duration, + js_to_json, + smuggle_url, ) class LA7IE(InfoExtractor): - IE_NAME = 'la7.tv' - _VALID_URL = r'''(?x) - https?://(?:www\.)?la7\.tv/ - (?: - richplayer/\?assetid=| - \?contentId= - ) - (?P<id>[0-9]+)''' - - _TEST = { - 'url': 'http://www.la7.tv/richplayer/?assetid=50355319', - 'md5': 'ec7d1f0224d20ba293ab56cf2259651f', + IE_NAME = 'la7.it' + _VALID_URL = r'''(?x)(https?://)?(?: + (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| + tg\.la7\.it/repliche-tgla7\?id= + )(?P<id>.+)''' + + _TESTS = [{ + # 'src' is a plain URL + 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', + 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { 
- 'id': '50355319', + 'id': 'inccool8-02-10-2015-163722', 'ext': 'mp4', - 'title': 'IL DIVO', - 'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti e Flavio Bucci', - 'duration': 6254, + 'title': 'Inc.Cool8', + 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', + 'thumbnail': 're:^https?://.*', + 'uploader_id': 'kdla7pillole@iltrovatore.it', + 'timestamp': 1443814869, + 'upload_date': '20151002', }, - 'skip': 'Blocked in the US', - } + }, { + # 'src' is a dictionary + 'url': 'http://tg.la7.it/repliche-tgla7?id=189080', + 'md5': '6b0d8888d286e39870208dfeceaf456b', + 'info_dict': { + 'id': '189080', + 'ext': 'mp4', + 'title': 'TG LA7', + }, + }, { + 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id - doc = self._download_xml(xml_url, video_id) - - video_title = doc.find('title').text - description = doc.find('description').text - duration = parse_duration(doc.find('duration').text) - thumbnail = doc.find('img').text - view_count = int(doc.find('views').text) - prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:') + webpage = self._download_webpage(url, video_id) - formats = [{ - 'format': vnode.find('quality').text, - 'tbr': int(vnode.find('quality').text), - 'url': vnode.find('fms').text.strip().replace('mp4:', prefix), - } for vnode in doc.findall('.//videos/video')] - self._sort_formats(formats) + player_data = self._parse_json( + self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'), + video_id, transform_source=js_to_json) return { + '_type': 'url_transparent', + 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], { + 'service_url': 'http://kdam.iltrovatore.it', + }), 'id': 
video_id, - 'title': video_title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'view_count': view_count, + 'title': player_data['title'], + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': player_data.get('poster'), + 'ie_key': 'Kaltura', } diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 63f581cd9..e9cc9aa59 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -20,9 +20,10 @@ from ..utils import ( int_or_none, orderedSet, parse_iso8601, - sanitized_Request, str_or_none, url_basename, + urshift, + update_url_query, ) @@ -74,15 +75,11 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: - param1 = self.urshift(param1, 1) + ((param1 & 1) << 31) + param1 = urshift(param1, 1) + ((param1 & 1) << 31) _loc3_ += 1 return param1 @@ -93,6 +90,10 @@ class LeIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # reversed from http://jstatic.letvcdn.com/sdk/player.js + def get_mms_key(self, time): + return self.ror(time, 8) ^ 185025305 + # see M3U8Encryption class in KLetvPlayer.swf @staticmethod def decrypt_m3u8(encrypted_data): @@ -113,28 +114,7 @@ class LeIE(InfoExtractor): return bytes(_loc7_) - def _real_extract(self, url): - media_id = self._match_id(url) - page = self._download_webpage(url, media_id) - params = { - 'id': media_id, - 'platid': 1, - 'splatid': 101, - 'format': 1, - 'tkey': self.calc_time_key(int(time.time())), - 'domain': 'www.le.com' - } - play_json_req = sanitized_Request( - 'http://api.le.com/mms/out/video/playJson?' 
+ compat_urllib_parse_urlencode(params) - ) - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) - - play_json = self._download_json( - play_json_req, - media_id, 'Downloading playJson data') - + def _check_errors(self, play_json): # Check for errors playstatus = play_json['playstatus'] if playstatus['status'] == 0: @@ -145,43 +125,99 @@ class LeIE(InfoExtractor): msg = 'Generic error. flag = %d' % flag raise ExtractorError(msg, expected=True) - playurl = play_json['playurl'] - - formats = ['350', '1000', '1300', '720p', '1080p'] - dispatch = playurl['dispatch'] + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) - urls = [] - for format_id in formats: - if format_id in dispatch: - media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, + play_json_h5 = self._download_json( + 'http://api.le.com/mms/out/video/playJsonH5', + media_id, 'Downloading html5 playJson data', query={ + 'id': media_id, + 'platid': 3, + 'splatid': 304, + 'format': 1, + 'tkey': self.get_mms_key(int(time.time())), + 'domain': 'www.le.com', + 'tss': 'no', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_h5) + + play_json_flash = self._download_json( + 'http://api.le.com/mms/out/video/playJson', + media_id, 'Downloading flash playJson data', query={ + 'id': media_id, + 'platid': 1, + 'splatid': 101, + 'format': 1, + 'tkey': self.calc_time_key(int(time.time())), + 'domain': 'www.le.com', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_flash) + + def get_h5_urls(media_url, format_id): + location = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id, query={ 'format': 1, 'expect': 3, - 'rateid': format_id, - }) + 'tss': 'no', + })['location'] + + 
return { + 'http': update_url_query(location, {'tss': 'no'}), + 'hls': update_url_query(location, {'tss': 'ios'}), + } - nodes_data = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + def get_flash_urls(media_url, format_id): + media_url += '&' + compat_urllib_parse_urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, + }) - req = self._request_webpage( - nodes_data['nodelist'][0]['location'], media_id, - note='Downloading m3u8 information for format %s' % format_id) + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) - m3u8_data = self.decrypt_m3u8(req.read()) + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) - url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), - 'ext': determine_ext(dispatch[format_id][1]), - 'format_id': format_id, - 'protocol': 'm3u8', - } + m3u8_data = self.decrypt_m3u8(req.read()) - if format_id[-1:] == 'p': - url_info_dict['height'] = int_or_none(format_id[:-1]) + return { + 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), + } - urls.append(url_info_dict) + extracted_formats = [] + formats = [] + for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): + playurl = play_json['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if 
format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) + self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', default=None), @@ -190,7 +226,7 @@ class LeIE(InfoExtractor): return { 'id': media_id, - 'formats': urls, + 'formats': formats, 'title': playurl['title'], 'thumbnail': playurl['pic'], 'description': description, diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 2d5040032..a98c4c530 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -100,7 +100,7 @@ class LyndaIE(LyndaBaseIE): _TESTS = [{ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', - 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', + # md5 is unstable 'info_dict': { 'id': '114408', 'ext': 'mp4', diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index d5945ad66..39d2742c8 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -1,8 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -23,34 +21,5 @@ class M6IE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id, - 'Downloading video RSS') - - title = rss.find('./channel/item/title').text - description = rss.find('./channel/item/description').text - thumbnail = rss.find('./channel/item/visuel_clip_big').text - duration = int(rss.find('./channel/item/duration').text) - view_count = int(rss.find('./channel/item/nombre_vues').text) - - formats = [] - for format_id in ['lq', 'sd', 'hq', 'hd']: - video_url = rss.find('./channel/item/url_video_%s' % format_id) - if video_url is None: - continue - formats.append({ - 'url': video_url.text, - 'format_id': format_id, - }) 
- - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } + video_id = self._match_id(url) + return self.url_result('6play:%s' % video_id, 'SixPlay', video_id) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py new file mode 100644 index 000000000..cdb46e163 --- /dev/null +++ b/youtube_dl/extractor/meta.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .pladform import PladformIE +from ..utils import ( + unescapeHTML, + int_or_none, + ExtractorError, +) + + +class METAIE(InfoExtractor): + _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://video.meta.ua/5502115.video', + 'md5': '71b6f3ee274bef16f1ab410f7f56b476', + 'info_dict': { + 'id': '5502115', + 'ext': 'mp4', + 'title': 'Sony Xperia Z camera test [HQ]', + 'description': 'Xperia Z shoots video in FullHD HDR.', + 'uploader_id': 'nomobile', + 'uploader': 'CHЁZA.TV', + 'upload_date': '20130211', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'http://video.meta.ua/iframe/5502115', + 'only_matching': True, + }, { + # pladform embed + 'url': 'http://video.meta.ua/7121015.video', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + st_html5 = self._search_regex( + r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None) + + if st_html5: + # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js + json_str = '' + for i in range(0, len(st_html5), 3): + json_str += '&#x0%s;' % st_html5[i:i + 3] + uppod_data = self._parse_json(unescapeHTML(json_str), video_id) + error = uppod_data.get('customnotfound') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_url = uppod_data['file'] + 
info = { + 'id': video_id, + 'url': video_url, + 'title': uppod_data.get('comment') or self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), + 'duration': int_or_none(self._og_search_property( + 'video:duration', webpage, default=None)), + } + if 'youtube.com/' in video_url: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + }) + return info + + pladform_url = PladformIE._extract_url(webpage) + if pladform_url: + return self.url_result(pladform_url) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index b6f00cc25..e6e7659a1 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -11,13 +11,14 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata, + get_element_by_attribute, + mimetype2ext, ) class MetacafeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = 'metacafe' @@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor): 'uploader': 'ign', 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', }, + 'skip': 'Page is temporarily unavailable.', }, # AnyClip video { @@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor): 'id': 'an-dVVXnuY7Jh77J', 'ext': 'mp4', 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', - 'uploader': 'anyclip', - 'description': 'md5:38c711dd98f5bb87acf973d573442e67', + 'uploader': 'AnyClip', + 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b', }, }, # age-restricted video @@ -110,28 +112,25 @@ class 
MetacafeIE(InfoExtractor): def report_disclaimer(self): self.to_screen('Retrieving disclaimer') - def _real_initialize(self): + def _confirm_age(self): # Retrieve disclaimer self.report_disclaimer() self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer') # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') self.report_age_confirmation() - self._download_webpage(request, None, False, 'Unable to confirm age') + self._download_webpage( + self._FILTER_POST, None, False, 'Unable to confirm age', + data=urlencode_postdata({ + 'filters': '0', + 'submit': "Continue - I'm over 18", + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) def _real_extract(self, url): # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - - video_id = mobj.group(1) + video_id, display_id = re.match(self._VALID_URL, url).groups() # the video may come from an external site m_external = re.match('^(\w{2})-(.*)$', video_id) @@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor): if prefix == 'cb': return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - # Retrieve video webpage to extract further information - req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id) + # self._confirm_age() # AnyClip videos require the flashversion cookie so that we get the link # to the mp4 file - mobj_an = re.match(r'^an-(.*?)$', video_id) - if mobj_an: - req.headers['Cookie'] = 'flashVersion=0;' - webpage = self._download_webpage(req, video_id) + headers = {} + if video_id.startswith('an-'): + headers['Cookie'] = 'flashVersion=0;' + + # Retrieve video webpage to extract further information + webpage = self._download_webpage(url, video_id, 
headers=headers) + + error = get_element_by_attribute( + 'class', 'notfound-page-title', webpage) + if error: + raise ExtractorError(error, expected=True) + + video_title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title') # Extract URL, uploader and title from webpage self.report_extraction(video_id) @@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor): 'player_url': player_url, 'ext': play_path.partition(':')[0], }) + if video_url is None: + flashvars = self._parse_json(self._search_regex( + r'flashvars\s*=\s*({.*});', webpage, 'flashvars', + default=None), video_id, fatal=False) + if flashvars: + video_url = [] + for source in flashvars.get('sources'): + source_url = source.get('src') + if not source_url: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_url) + if ext == 'm3u8': + video_url.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + video_url.append({ + 'url': source_url, + 'ext': ext, + }) if video_url is None: raise ExtractorError('Unsupported video type') - video_title = self._html_search_regex( - r'(?im)<title>(.*) - Video</title>', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_meta( + ['og:description', 'twitter:description', 'description'], + webpage, 'title', fatal=False) + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'title', fatal=False) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, 'uploader nickname', fatal=False) duration = int_or_none( - self._html_search_meta('video:duration', webpage)) - + self._html_search_meta('video:duration', webpage, default=None)) age_limit = ( 18 if 
re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) @@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor): 'url': video_url, 'ext': video_ext, }] - self._sort_formats(formats) + return { 'id': video_id, + 'display_id': display_id, 'description': description, 'uploader': video_uploader, 'title': video_title, diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 9fbc74f5d..d970e94ec 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -26,7 +26,8 @@ class MGTVIE(InfoExtractor): video_id = self._match_id(url) api_data = self._download_json( 'http://v.api.mgtv.com/player/video', video_id, - query={'video_id': video_id})['data'] + query={'video_id': video_id}, + headers=self.geo_verification_headers())['data'] info = api_data['info'] formats = [] diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 5a00cd397..cd169f361 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -12,12 +12,69 @@ from ..utils import ( get_element_by_attribute, int_or_none, remove_start, + extract_attributes, + determine_ext, ) -class MiTeleIE(InfoExtractor): +class MiTeleBaseIE(InfoExtractor): + def _get_player_info(self, url, webpage): + player_data = extract_attributes(self._search_regex( + r'(?s)(<ms-video-player.+?</ms-video-player>)', + webpage, 'ms video player')) + video_id = player_data['data-media-id'] + config_url = compat_urlparse.urljoin(url, player_data['data-config']) + config = self._download_json( + config_url, video_id, 'Downloading config JSON') + mmc_url = config['services']['mmc'] + + duration = None + formats = [] + for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')): + mmc = self._download_json( + m_url, video_id, 'Downloading mmc JSON') + if not duration: + duration = int_or_none(mmc.get('duration')) + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + bas = location.get('bas') + loc 
= location.get('loc') + ogn = location.get('ogn') + if None in (gat, bas, loc, ogn): + continue + token_data = { + 'bas': bas, + 'icd': loc, + 'ogn': ogn, + 'sta': '0', + } + media = self._download_json( + '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), + video_id, 'Downloading %s JSON' % location['loc']) + file_ = media.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'), + 'duration': duration, + } + + +class MiTeleIE(MiTeleBaseIE): IE_DESC = 'mitele.es' - _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' + _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', @@ -25,7 +82,7 @@ class MiTeleIE(InfoExtractor): 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', 'series': 'Diario de', @@ -40,7 +97,7 @@ class MiTeleIE(InfoExtractor): 'info_dict': { 'id': 'eLZSwoEd1S3pVyUm8lc6F', 'display_id': 'programa-226', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cuarto Milenio - Temporada 6 - Programa 226', 'description': 'md5:50daf9fadefa4e62d9fc866d0c015701', 'series': 'Cuarto Milenio', @@ -59,40 +116,7 @@ class MiTeleIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - config_url = self._search_regex( - r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') - config_url = 
compat_urlparse.urljoin(url, config_url) - - config = self._download_json( - config_url, display_id, 'Downloading config JSON') - - mmc = self._download_json( - config['services']['mmc'], display_id, 'Downloading mmc JSON') - - formats = [] - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - bas = location.get('bas') - loc = location.get('loc') - ogn = location.get('ogn') - if None in (gat, bas, loc, ogn): - continue - token_data = { - 'bas': bas, - 'icd': loc, - 'ogn': ogn, - 'sta': '0', - } - media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), - display_id, 'Downloading %s JSON' % location['loc']) - file_ = media.get('file') - if not file_: - continue - formats.extend(self._extract_f4m_formats( - file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - display_id, f4m_id=loc)) - self._sort_formats(formats) + info = self._get_player_info(url, webpage) title = self._search_regex( r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', @@ -112,21 +136,12 @@ class MiTeleIE(InfoExtractor): title = remove_start(self._search_regex( r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ') - video_id = self._search_regex( - r'data-media-id\s*=\s*"([^"]+)"', webpage, - 'data media id', default=None) or display_id - thumbnail = config.get('poster', {}).get('imageUrl') - duration = int_or_none(mmc.get('duration')) - - return { - 'id': video_id, + info.update({ 'display_id': display_id, 'title': title, 'description': get_element_by_attribute('class', 'text', webpage), 'series': series, 'season': season, 'episode': episode, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 483f6925f..560fe188b 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -102,11 +102,11 @@ class MixcloudIE(InfoExtractor): description = 
self._og_search_description(webpage) like_count = parse_count(self._search_regex( r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)', - webpage, 'like count', fatal=False)) + webpage, 'like count', default=None)) view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', r'/listeners/?">([0-9,.]+)</a>'], - webpage, 'play count', fatal=False)) + webpage, 'play count', default=None)) return { 'id': track_id, diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py new file mode 100644 index 000000000..1ec8e0f50 --- /dev/null +++ b/youtube_dl/extractor/msn.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unescapeHTML, +) + + +class MSNIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE', + 'md5': '8442f66c116cbab1ff7098f986983458', + 'info_dict': { + 'id': 'BBqQYNE', + 'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message', + 'ext': 'mp4', + 'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message', + 'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25', + 'duration': 104, + 'uploader': 'CBS Entertainment', + 'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v', + }, + }, { + 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', + 'only_matching': True, + }, { + # geo restricted + 'url': 
'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + video = self._parse_json( + self._search_regex( + r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', + webpage, 'video data', default='{}', group='data'), + display_id, transform_source=unescapeHTML) + + if not video: + error = unescapeHTML(self._search_regex( + r'data-error=(["\'])(?P<error>.+?)\1', + webpage, 'error', group='error')) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + title = video['title'] + + formats = [] + for file_ in video.get('videoFiles', []): + format_url = file_.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + # .ism is not yet supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + if ext == 'ism': + continue + if 'm3u8' in format_url: + # m3u8_native should not be used here until + # https://github.com/rg3/youtube-dl/issues/9913 is fixed + m3u8_formats = self._extract_m3u8_formats( + format_url, display_id, 'mp4', + m3u8_id='hls', fatal=False) + # Despite metadata in m3u8 all video+audio formats are + # actually video-only (no audio) + for f in m3u8_formats: + if f.get('acodec') != 'none' and f.get('vcodec') != 'none': + f['acodec'] = 'none' + formats.extend(m3u8_formats) + else: + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'format_id': 'http', + 'width': int_or_none(file_.get('width')), + 'height': int_or_none(file_.get('height')), + }) + self._sort_formats(formats) + + subtitles = {} + for file_ in video.get('files', []): + 
format_url = file_.get('url') + format_code = file_.get('formatCode') + if not format_url or not format_code: + continue + if compat_str(format_code) == '3100': + subtitles.setdefault(file_.get('culture', 'en'), []).append({ + 'ext': determine_ext(format_url, 'ttml'), + 'url': format_url, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('headlineImage', {}).get('url'), + 'duration': int_or_none(video.get('durationSecs')), + 'uploader': video.get('sourceFriendly'), + 'uploader_id': video.get('providerId'), + 'creator': video.get('creator'), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 722518663..e717abb9f 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, url_basename, @@ -61,7 +62,7 @@ class NationalGeographicIE(InfoExtractor): } -class NationalGeographicChannelIE(InfoExtractor): +class NationalGeographicChannelIE(ThePlatformIE): IE_NAME = 'natgeo:channel' _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)' @@ -102,12 +103,22 @@ class NationalGeographicChannelIE(InfoExtractor): release_url = self._search_regex( r'video_auth_playlist_url\s*=\s*"([^"]+)"', webpage, 'release url') + query = { + 'mbr': 'true', + 'switch': 'http', + } + is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) + if is_auth == 'auth': + auth_resource_id = self._search_regex( + r"video_auth_resourceId\s*=\s*'([^']+)'", + webpage, 'auth resource id') + query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or '' return { '_type': 'url_transparent', 
'ie_key': 'ThePlatform', 'url': smuggle_url( - update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}), + update_url_query(release_url, query), {'force_smil_url': True}), 'display_id': display_id, } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6b7da1149..f694e210b 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,10 +9,6 @@ from ..utils import ( lowercase_escape, smuggle_url, unescapeHTML, - update_url_query, - int_or_none, - HEADRequest, - parse_iso8601, ) @@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/ (?:video/.+?/(?P<id>\d+)| - ([^/]+/)*(?P<display_id>[^/?]+)) + ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+)) ''' _TESTS = [ @@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE): 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', + 'uploader': 'NBCU-NEWS', + 'timestamp': 1401363060, + 'upload_date': '20140529', }, }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', 'md5': 'fdbf39ab73a72df5896b6234ff98518a', 'info_dict': { - 'id': 'Wjf9EDR3A_60', + 'id': '529953347624', 'ext': 'mp4', 'title': 'FULL EPISODE: Family Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', @@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE): 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', + 'timestamp': 1423104900, + 'uploader': 'NBCU-NEWS', + 'upload_date': '20150205', }, }, { @@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE): 'info_dict': { 'id': '529953347624', 'ext': 'mp4', - 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', - 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + 'title': 'Volkswagen U.S. 
Chief:\xa0 We Have Totally Screwed Up', + 'description': 'md5:c8be487b2d80ff0594c005add88d8351', + 'upload_date': '20150922', + 'timestamp': 1442917800, + 'uploader': 'NBCU-NEWS', }, - 'expected_warnings': ['http-6000 is not available'] }, { 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', @@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'uploader': 'NBCU-NEWS', + }, + }, + { + 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', + 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', + 'info_dict': { + 'id': '314487875924', + 'ext': 'mp4', + 'title': 'The chaotic GOP immigration vote', + 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1406937606, + 'upload_date': '20140802', + 'uploader': 'NBCU-NEWS', + 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, }, { @@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE): } else: # "feature" and "nightly-news" pages use theplatform.com - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) - info = None - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], - webpage, 'bootstrap json', default=None) - bootstrap = self._parse_json( - bootstrap_json, display_id, transform_source=unescapeHTML) - if 'results' in bootstrap: - info = bootstrap['results'][0]['video'] - elif 'video' in bootstrap: - info = bootstrap['video'] - else: - info = bootstrap - video_id = info['mpxId'] - title = info['title'] - - subtitles = {} - 
caption_links = info.get('captionLinks') - if caption_links: - for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): - sub_url = caption_links.get(sub_key) - if sub_url: - subtitles.setdefault('en', []).append({ - 'url': sub_url, - 'ext': sub_ext, - }) - - formats = [] - for video_asset in info['videoAssets']: - video_url = video_asset.get('publicUrl') - if not video_url: - continue - container = video_asset.get('format') - asset_type = video_asset.get('assetType') or '' - if container == 'ISM' or asset_type == 'FireTV-Once': - continue - elif asset_type == 'OnceURL': - tp_formats, tp_subtitles = self._extract_theplatform_smil( - video_url, video_id) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) + video_id = mobj.group('mpx_id') + if not video_id.isdigit(): + webpage = self._download_webpage(url, video_id) + info = None + bootstrap_json = self._search_regex( + [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', + r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], + webpage, 'bootstrap json', default=None) + bootstrap = self._parse_json( + bootstrap_json, video_id, transform_source=unescapeHTML) + if 'results' in bootstrap: + info = bootstrap['results'][0]['video'] + elif 'video' in bootstrap: + info = bootstrap['video'] else: - tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000) - format_id = 'http%s' % ('-%d' % tbr if tbr else '') - video_url = update_url_query( - video_url, {'format': 'redirect'}) - # resolve the url so that we can check availability and detect the correct extension - head = self._request_webpage( - HEADRequest(video_url), video_id, - 'Checking %s url' % format_id, - '%s is not available' % format_id, - fatal=False) - if head: - video_url = head.geturl() - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int_or_none(video_asset.get('width')), - 'height': 
int_or_none(video_asset.get('height')), - 'tbr': tbr, - 'container': video_asset.get('format'), - }) - self._sort_formats(formats) + info = bootstrap + video_id = info['mpxId'] return { + '_type': 'url_transparent', 'id': video_id, - 'title': title, - 'description': info.get('description'), - 'thumbnail': info.get('thumbnail'), - 'duration': int_or_none(info.get('duration')), - 'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')), - 'formats': formats, - 'subtitles': subtitles, + # http://feed.theplatform.com/f/2E2eJC/nbcnews also works + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id, + 'ie_key': 'ThePlatformFeed', } - - -class MSNBCIE(InfoExtractor): - # https URLs redirect to corresponding http ones - _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', - 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', - 'info_dict': { - 'id': 'n_hayes_Aimm_140801_272214', - 'ext': 'mp4', - 'title': 'The chaotic GOP immigration vote', - 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1406937606, - 'upload_date': '20140802', - 'uploader': 'NBCU-NEWS', - 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - embed_url = self._html_search_meta('embedURL', webpage) - return self.url_result(embed_url) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index e96013791..4935002d0 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -8,7 +8,7 @@ from ..utils import update_url_query class 
NickIE(MTVServicesInfoExtractor): IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _TESTS = [{ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', @@ -52,6 +52,9 @@ class NickIE(MTVServicesInfoExtractor): } }, ], + }, { + 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/', + 'only_matching': True, }] def _get_feed_query(self, uri): diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py new file mode 100644 index 000000000..d889245ad --- /dev/null +++ b/youtube_dl/extractor/ninecninemedia.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + parse_duration, + ExtractorError +) + + +class NineCNineMediaIE(InfoExtractor): + _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)' + + def _real_extract(self, url): + destination_code, video_id = re.match(self._VALID_URL, url).groups() + api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id) + content = self._download_json(api_base_url, video_id, query={ + '$include': '[contentpackages]', + }) + title = content['Name'] + if len(content['ContentPackages']) > 1: + raise ExtractorError('multiple content packages') + content_package = content['ContentPackages'][0] + stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id'] + stacks = self._download_json(stacks_base_url, video_id)['Items'] + if len(stacks) > 1: + raise ExtractorError('multiple stacks') + stack = stacks[0] + stack_base_url = '%s%s/manifest.' 
% (stacks_base_url, stack['Id']) + formats = [] + formats.extend(self._extract_m3u8_formats( + stack_base_url + 'm3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + stack_base_url + 'f4m', video_id, + f4m_id='hds', fatal=False)) + mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False) + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), + 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), + 'duration': parse_duration(content.get('BroadcastTime')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py new file mode 100644 index 000000000..402d3a9f7 --- /dev/null +++ b/youtube_dl/extractor/onet.py @@ -0,0 +1,172 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + get_element_by_class, + int_or_none, + js_to_json, + parse_iso8601, + remove_start, + strip_or_none, + url_basename, +) + + +class OnetBaseIE(InfoExtractor): + def _search_mvp_id(self, webpage): + return self._search_regex( + r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + + def _extract_from_id(self, video_id, webpage): + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] 
+ for _, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + video_url = f.get('url') + if not video_url: + continue + ext = determine_ext(video_url) + if format_id == 'ism': + # TODO: Support Microsoft Smooth Streaming + continue + elif ext == 'mpd': + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + # formats.extend(self._extract_mpd_formats( + # video_url, video_id, mpd_id='dash', fatal=False)) + continue + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'abr': float_or_none(f.get('audio_bitrate')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = self._og_search_title(webpage, default=None) or meta['title'] + description = self._og_search_description(webpage, default=None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } + + +class OnetIE(OnetBaseIE): + _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)' + IE_NAME = 'onet.tv' + + _TEST = { + 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', + 'md5': 'e3ffbf47590032ac3f27249204173d50', + 'info_dict': { + 'id': 'qbpyqc', + 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', + 'ext': 'mp4', + 'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd', + 'description': 'Trzy samochody, których nigdy nie użyto, 
prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...', + 'upload_date': '20160705', + 'timestamp': 1467721580, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + mvp_id = self._search_mvp_id(webpage) + + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + }) + + return info_dict + + +class OnetChannelIE(OnetBaseIE): + _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)' + IE_NAME = 'onet.tv:channel' + + _TEST = { + 'url': 'http://onet.tv/k/openerfestival', + 'info_dict': { + 'id': 'openerfestival', + 'title': 'Open\'er Festival Live', + 'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.', + }, + 'playlist_mincount': 46, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + + webpage = self._download_webpage(url, channel_id) + + current_clip_info = self._parse_json(self._search_regex( + r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id, + transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s))) + video_id = remove_start(current_clip_info['ckmId'], 'mvp:') + video_name = url_basename(current_clip_info['url']) + + if self._downloader.params.get('noplaylist'): + self.to_screen( + 'Downloading just video %s because of --no-playlist' % video_name) + return self._extract_from_id(video_id, webpage) + + self.to_screen( + 'Downloading channel %s - add --no-playlist to just download video %s' % ( + channel_id, video_name)) + matches = re.findall( + r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)', + webpage) + entries = [ + self.url_result(video_link, OnetIE.ie_key()) + for video_link in 
matches] + + channel_title = strip_or_none(get_element_by_class('o_channelName', webpage)) + channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage)) + return self.playlist_result(entries, channel_id, channel_title, channel_description) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index d7b13a0f1..6fb1a3fcc 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + float_or_none, + mimetype2ext, ) @@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', - 'md5': 'd4851405d31adfadf71cd7a487b765bb', + 'md5': 'e49f947c105b8a78a675a0ee1bddedfe', 'info_dict': { 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', - 'description': 'md5:e786add7f280b7f0fe237b64cc73df76', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'The A.V. 
Club', - 'uploader_id': 'TheAVClub', + 'uploader_id': 'the-av-club', }, }, { 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', @@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://www.onionstudios.com/embed?id=%s' % video_id, video_id) + video_data = self._download_json( + 'http://www.onionstudios.com/video/%s.json' % video_id, video_id) + + title = video_data['title'] formats = [] - for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage): - ext = determine_ext(src) + for source in video_data.get('sources', []): + source_url = source.get('url') + if not source_url: + continue + ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) else: - height = int_or_none(self._search_regex( - r'/(\d+)\.%s' % ext, src, 'height', default=None)) + tbr = int_or_none(source.get('bitrate')) formats.append({ - 'format_id': ext + ('-%sp' % height if height else ''), - 'url': src, - 'height': height, + 'format_id': ext + ('-%d' % tbr if tbr else ''), + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'tbr': tbr, 'ext': ext, - 'preference': 1, }) self._sort_formats(formats) - title = self._search_regex( - r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1', - webpage, 'title', group='title') - description = self._search_regex( - r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1', - webpage, 'description', default=None, group='description') - thumbnail = self._search_regex( - r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1', - webpage, 'thumbnail', default=False, group='thumbnail') - - uploader_id = self._search_regex( - r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1', - webpage, 'uploader id', 
fatal=False, group='uploader_id') - uploader = self._search_regex( - r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1', - webpage, 'uploader', default=False, group='uploader') - return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'thumbnail': video_data.get('poster_url'), + 'uploader': video_data.get('channel_name'), + 'uploader_id': video_data.get('channel_slug'), + 'duration': float_or_none(video_data.get('duration', 1000)), + 'tags': video_data.get('tags'), 'formats': formats, } diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 81918ac6e..f6f423597 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -516,9 +516,14 @@ class PBSIE(InfoExtractor): # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'): continue + f_url = re.sub(r'\d+k|baseline', bitrate, http_url) + # This may produce invalid links sometimes (e.g. 
+ # http://www.pbs.org/wgbh/frontline/film/suicide-plan) + if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): + continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k|baseline', bitrate, http_url), + 'url': f_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index c23b314e7..75f5884a9 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor): title = user.get('display_name') or user.get('username') description = user.get('description') + broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or + data_store.get('BroadcastCache', {}).get('broadcastIds', [])) + entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) - for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])] + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id)) + for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index bc559d1df..77e1211d6 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -49,7 +49,7 @@ class PladformIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py new file mode 100644 index 000000000..f559b899f --- /dev/null +++ b/youtube_dl/extractor/polskieradio.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import 
InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + strip_or_none, + unified_timestamp, +) + + +class PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + 'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { + 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'info_dict': { + 'id': '1635803', + 'title': 'Euro 2016: nie ma miejsca na błąd. 
Polacy grają ze Szwajcarią o ćwierćfinał', + 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + content = self._search_regex( + r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>', + webpage, 'content') + + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', fatal=False)) + + thumbnail_url = self._og_search_thumbnail(webpage) + + entries = [] + + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): + media = self._parse_json(data_media, playlist_id, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file'], 'http:') + if media_url in media_urls: + continue + media_urls.add(media_url) + entries.append({ + 'id': compat_str(media['id']), + 'url': media_url, + 'title': compat_urllib_parse_unquote(media['desc']), + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url + }) + + title = self._og_search_title(webpage).strip() + description = strip_or_none(self._og_search_description(webpage)) + + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 6d57e1d35..d2c92531b 100644 --- a/youtube_dl/extractor/pornhub.py 
+++ b/youtube_dl/extractor/pornhub.py @@ -25,7 +25,15 @@ from ..aes import ( class PornHubIE(InfoExtractor): - _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)' + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + (?P<id>[0-9a-z]+) + ''' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '1e19b41231a02eba417839222ac9d58e', @@ -63,8 +71,24 @@ class PornHubIE(InfoExtractor): 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { + # removed at the request of cam4.com 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, + }, { + # private video + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', + 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, }] @classmethod @@ -87,8 +111,8 @@ class PornHubIE(InfoExtractor): webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>', - webpage, 'error message', default=None) + r'(?s)<div[^>]+class=(["\']).*?\b(?:removed|userMessageSection)\b.*?\1[^>]*>(?P<error>.+?)</div>', + webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) raise ExtractorError( diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 07d49d489..c6eee3b72 100644 --- 
a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -5,7 +5,7 @@ import re from hashlib import sha1 from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -71,6 +71,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', @@ -86,6 +87,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', @@ -101,6 +103,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', @@ -116,6 +119,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', @@ -131,6 +135,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', @@ -227,70 +232,42 @@ class ProSiebenSat1IE(InfoExtractor): ] def _extract_clip(self, url, webpage): - clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'prosieben' client_name = 'kolibri-2.0.19-splec4' client_location = url - videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_location': 
client_location, - 'client_name': client_name, - 'ids': clip_id, - }) - - video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0] + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': access_token, + 'client_location': client_location, + 'client_name': client_name, + 'ids': clip_id, + })[0] if video.get('is_protected') is True: raise ExtractorError('This video is DRM protected.', expected=True) duration = float_or_none(video.get('duration')) - source_ids = [source['id'] for source in video['sources']] - source_ids_str = ','.join(map(str, source_ids)) + source_ids = [compat_str(source['id']) for source in video['sources']] g = '01!8d8F_)r9]4s[qeuXfP%' + client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest() - client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) - .encode('utf-8')).hexdigest() - - sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - })) - - sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON') + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + }) server_id = sources['server_id'] - client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, - client_location, source_ids_str, g, client_name]) - .encode('utf-8')).hexdigest() - - url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({ - 'access_token': 
access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - 'server_id': server_id, - 'source_ids': source_ids_str, - })) - - urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - - upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - - formats = [] - - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() def fix_bitrate(bitrate): bitrate = int_or_none(bitrate) @@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor): return None return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - for source in urls_sources: - protocol = source['protocol'] - source_url = source['url'] - if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) - if not mobj: - continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'vbr': fix_bitrate(source['bitrate']), - 'ext': 'mp4', - 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), - }) - elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats(source_url, clip_id)) - else: - formats.append({ - 'url': source_url, - 'vbr': fix_bitrate(source['bitrate']), + formats = [] + for source_id in source_ids: + client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, 
client_location, source_id, g, client_name]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + 'server_id': server_id, + 'source_ids': source_id, }) - + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) self._sort_formats(formats) + 
description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._html_search_regex( + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) + return { 'id': clip_id, 'title': title, diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 4f05bbddc..8ec402646 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -12,6 +12,7 @@ from ..utils import ( unified_strdate, xpath_element, ExtractorError, + determine_protocol, ) @@ -22,13 +23,13 @@ class RadioCanadaIE(InfoExtractor): 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', 'info_dict': { 'id': '7184272', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Le parcours du tireur capté sur vidéo', 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', 'upload_date': '20141023', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, } @@ -36,11 +37,14 @@ class RadioCanadaIE(InfoExtractor): def _real_extract(self, url): app_code, video_id = re.match(self._VALID_URL, url).groups() + device_types = ['ipad', 'android'] + if app_code != 'toutv': + device_types.append('flash') + formats = [] - # TODO: extract m3u8 and f4m formats - # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements + # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file - for device_type in ('flash',): + for device_type in device_types: v_data = self._download_xml( 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx', video_id, note='Downloading %s XML' % device_type, query={ @@ -52,7 +56,7 @@ class RadioCanadaIE(InfoExtractor): # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction 
'paysJ391wsHjbOJwvCs26toz': 'CA', 'bypasslock': 'NZt5K62gRqfc', - }) + }, fatal=False) v_url = xpath_text(v_data, 'url') if not v_url: continue @@ -64,7 +68,8 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( v_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) elif ext == 'f4m': - formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_f4m_formats( + v_url, video_id, f4m_id='hds', fatal=False)) else: ext = determine_ext(v_url) bitrates = xpath_element(v_data, 'bitrates') @@ -72,15 +77,28 @@ class RadioCanadaIE(InfoExtractor): tbr = int_or_none(url_e.get('bitrate')) if not tbr: continue + f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) + protocol = determine_protocol({'url': f_url}) formats.append({ - 'format_id': 'rtmp-%d' % tbr, - 'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url), - 'ext': 'flv', - 'protocol': 'rtmp', + 'format_id': '%s-%d' % (protocol, tbr), + 'url': f_url, + 'ext': 'flv' if protocol == 'rtmp' else ext, + 'protocol': protocol, 'width': int_or_none(url_e.get('width')), 'height': int_or_none(url_e.get('height')), 'tbr': tbr, }) + if protocol == 'rtsp': + base_url = self._search_regex( + r'rtsp://([^?]+)', f_url, 'base url', default=None) + if base_url: + base_url = 'http://' + base_url + formats.extend(self._extract_m3u8_formats( + base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + base_url + '/manifest.f4m', video_id, + f4m_id='hds', fatal=False)) self._sort_formats(formats) metadata = self._download_xml( @@ -115,13 +133,13 @@ class RadioCanadaAudioVideoIE(InfoExtractor): 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', 'info_dict': { 'id': '7527184', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Barack Obama au Vietnam', 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la 
guerre du Vietnam', 'upload_date': '20160523', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, } diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e36ce1aa1..dc640b1bc 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,47 +1,141 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + int_or_none, parse_duration, unified_strdate, - int_or_none, + update_url_query, xpath_text, ) -class RaiTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +class RaiBaseIE(InfoExtractor): + def _extract_relinker_formats(self, relinker_url, video_id): + formats = [] + + for platform in ('mon', 'flash', 'native'): + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, + headers=self.geo_verification_headers()) + + media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if media_url == 'http://download.rai.it/video_no_available.mp4': + self.raise_geo_restricted() + + ext = determine_ext(media_url) + if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): + continue + + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + 
manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + + return formats + + def _extract_from_content_id(self, content_id, base_url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(base_url, thumbnail_url), + }) + + formats = [] + media_type = media['type'] + if 'Audio' in media_type: + formats.append({ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }) + elif 'Video' in media_type: + formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) + self._sort_formats(formats) + else: + raise ExtractorError('not a media file') + + subtitles = {} + captions = media.get('subtitlesUrl') + if captions: + STL_EXT = '.stl' + SRT_EXT = '.srt' + if captions.endswith(STL_EXT): + captions = captions[:-len(STL_EXT)] + SRT_EXT + subtitles['it'] = [{ + 'ext': 'srt', + 'url': captions, + }] + + return { + 'id': content_id, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class RaiTVIE(RaiBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 
'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '96382709b61dd64a6b88e0f791e6df4c', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Report del 07/04/2014', 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', 'duration': 6160, + 'thumbnail': 're:^https?://.*\.jpg$', } }, { + # no m3u8 stream 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': 'd9751b78eac9710d62c2447b224dea39', + # HDS download, MD5 is unstable 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'flv', 'title': 'TG PRIMO TEMPO', 'upload_date': '20140612', 'duration': 1758, + 'thumbnail': 're:^https?://.*\.jpg$', }, + 'skip': 'Geo-restricted to Italy', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor): }, { 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': '496ab63e420574447f70d02578333437', + 'md5': 'e57493e1cb8bc7c564663f363b171847', 'info_dict': { 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il Candidato - Primo episodio: "Le Primarie"', 'description': 'md5:364b604f7db50594678f483353164fb8', 'upload_date': '20140923', 'duration': 386, + 'thumbnail': 're:^https?://.*\.jpg$', } }, ] def _real_extract(self, url): video_id = self._match_id(url) - media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, - video_id, 'Downloading video JSON') - - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - 
thumbnails.append({ - 'url': thumbnail_url, - }) - - subtitles = [] - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - def fix_xml(xml): - return xml.replace(' tag elementi', '').replace('>/', '</') - - relinker = self._download_xml( - media['mediaUri'] + '&output=43', - video_id, transform_source=fix_xml) - - has_subtitle = False - - for element in relinker.findall('element'): - media_url = xpath_text(element, 'url') - ext = determine_ext(media_url) - content_type = xpath_text(element, 'content-type') - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'stl': - has_subtitle = True - elif content_type.startswith('video/'): - bitrate = int_or_none(xpath_text(element, 'bitrate')) - formats.append({ - 'url': media_url, - 'tbr': bitrate if bitrate > 0 else None, - 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', - }) - elif content_type.startswith('image/'): - thumbnails.append({ - 'url': media_url, - }) - - self._sort_formats(formats) - if has_subtitle: - webpage = self._download_webpage(url, video_id) - subtitles = self._get_subtitles(video_id, webpage) - else: - raise ExtractorError('not a media file') + return self._extract_from_content_id(video_id, url) - return { - 'id': video_id, - 'title': media['name'], - 'description': media.get('desc'), - 'thumbnails': thumbnails, - 'uploader': media.get('author'), - 'upload_date': unified_strdate(media.get('date')), - 'duration': parse_duration(media.get('length')), - 'formats': formats, - 'subtitles': subtitles, - } - def _get_subtitles(self, video_id, webpage): - subtitles 
= {} - m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) - if m: - captions = m.group('captions') - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), - }] - return subtitles - - -class RaiIE(InfoExtractor): +class RaiIE(RaiBaseIE): _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', 'info_dict': { 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il pacco', 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', 'upload_date': '20141221', }, - } + }, + { + # Direct relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'skip': 'Geo-restricted to Italy', + }, + { + # Embedded content item ID + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'md5': '84c1135ce960e8822ae63cec34441d63', + 'info_dict': { + 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 02/07/2016', + 'upload_date': '20160702', + }, + }, + { + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'flv', + 'title': 'La diretta di Rainews24', + }, + }, ] 
@classmethod @@ -201,7 +238,30 @@ class RaiIE(InfoExtractor): iframe_url = self._search_regex( [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe') - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) + webpage, 'iframe', default=None) + if iframe_url: + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) + + content_item_id = self._search_regex( + r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', + webpage, 'content item ID', group='content_id', default=None) + if content_item_id: + return self._extract_from_content_id(content_item_id, url) + + relinker_url = compat_urlparse.urljoin(url, self._search_regex( + r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', + webpage, 'relinker URL', group='url')) + formats = self._extract_relinker_formats(relinker_url, video_id) + self._sort_formats(formats) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 796adfdf9..bf200ea4d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -1,23 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, + js_to_json, ) +from ..compat import compat_str class RDSIE(InfoExtractor): IE_DESC = 'RDS.ca' - _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' + _VALID_URL = 
r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+' _TESTS = [{ 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', 'info_dict': { - 'id': '3.1132799', + 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', 'ext': 'mp4', 'title': 'Fowler Jr. prend la direction de Jacksonville', @@ -33,22 +33,17 @@ class RDSIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - # TODO: extract f4m from 9c9media.com - video_url = self._search_regex( - r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"', - webpage, 'video url') - - title = self._og_search_title(webpage) or self._html_search_meta( + item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) + video_id = compat_str(item['id']) + title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta( 'title', webpage, 'title', fatal=True) description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( + thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], webpage, 'thumbnail', fatal=False) @@ -61,13 +56,15 @@ class RDSIE(InfoExtractor): age_limit = self._family_friendly_search(webpage) return { + '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'url': '9c9media:rds_web:%s' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'age_limit': age_limit, 
+ 'ie_key': 'NineCNineMedia', } diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py new file mode 100644 index 000000000..f5b2f560c --- /dev/null +++ b/youtube_dl/extractor/roosterteeth.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + strip_or_none, + unescapeHTML, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)' + _LOGIN_URL = 'https://roosterteeth.com/login' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '26576', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', + 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', + 'thumbnail': 're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... 
The Game Announcement', + 'comment_count': int, + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, + note='Downloading login page', + errnote='Unable to download login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + login_request = self._download_webpage( + self._LOGIN_URL, None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._LOGIN_URL, + }) + + if not any(re.search(p, login_request) for p in ( + r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', + r'>Sign Out<')): + error = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', + login_request, 'alert', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + + 
webpage = self._download_webpage(url, display_id) + + episode = strip_or_none(unescapeHTML(self._search_regex( + (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<title>(?P<title>[^<]+)</title>'), webpage, 'title', + default=None, group='title'))) + + title = strip_or_none(self._og_search_title( + webpage, default=None)) or episode + + m3u8_url = self._search_regex( + r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not m3u8_url: + if re.search(r'<div[^>]+class=["\']non-sponsor', webpage): + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + + if re.search(r'<div[^>]+class=["\']golive-gate', webpage): + self.raise_login_required('%s is not available yet' % display_id) + + raise ExtractorError('Unable to extract m3u8 URL') + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = strip_or_none(self._og_search_description(webpage)) + thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage)) + + series = self._search_regex( + (r'<h2>More ([^<]+)</h2>', r'<a[^>]+>See All ([^<]+) Videos<'), + webpage, 'series', fatal=False) + + comment_count = int_or_none(self._search_regex( + r'>Comments \((\d+)\)<', webpage, + 'comment count', fatal=False)) + + video_id = self._search_regex( + (r'containerId\s*=\s*["\']episode-(\d+)\1', + r'<div[^<]+id=["\']episode-(\d+)'), webpage, + 'video id', default=display_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'comment_count': comment_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 4896d09d6..f6454c6b0 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -9,7 +9,7 @@ class 
RTVNHIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.rtvnh.nl/video/131946', - 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', + 'md5': 'cdbec9f44550763c8afc96050fa747dc', 'info_dict': { 'id': '131946', 'ext': 'mp4', @@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor): raise ExtractorError( '%s returned error code %d' % (self.IE_NAME, status), expected=True) - formats = self._extract_smil_formats( - 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) - - for item in meta['source']['fb']: - if item.get('type') == 'hls': - formats.extend(self._extract_m3u8_formats( - item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) - elif item.get('type') == '': - formats.append({'url': item['file']}) + formats = [] + rtmp_formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + formats.extend(rtmp_formats) + + for rtmp_format in rtmp_formats: + rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + rtsp_format = rtmp_format.copy() + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'url': rtmp_url.replace('rtmp://', 'rtsp://'), + 'protocol': 'rtsp', + }) + formats.append(rtsp_format) + http_base_url = rtmp_url.replace('rtmp://', 'http://') + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py index 759898a49..96e43af84 100644 --- a/youtube_dl/extractor/sandia.py +++ b/youtube_dl/extractor/sandia.py @@ -1,18 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools import json -import re from .common 
import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, - js_to_json, mimetype2ext, - sanitized_Request, - unified_strdate, ) @@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Xyce Software Training - Section 1', 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120904', + 'upload_date': '20120409', + 'timestamp': 1333983600, 'duration': 7794, } } @@ -35,81 +30,36 @@ class SandiaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') - webpage = self._download_webpage(req, video_id) + presentation_data = self._download_json( + 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', + video_id, data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': video_id, + 'QueryString': '', + } + }), headers={ + 'Content-Type': 'application/json; charset=utf-8', + })['d']['Presentation'] - js_path = self._search_regex( - r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"', - webpage, 'JS code URL') - js_url = compat_urlparse.urljoin(url, js_path) - - js_code = self._download_webpage( - js_url, video_id, note='Downloading player') - - def extract_str(key, **args): - return self._search_regex( - r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key), - js_code, key, **args) - - def extract_data(key, **args): - data_json = extract_str(key, **args) - if data_json is None: - return data_json - return self._parse_json( - data_json, video_id, transform_source=js_to_json) + title = presentation_data['Title'] formats = [] - for i in itertools.count(): - fd = extract_data('VideoUrls[%d]' % i, default=None) - if fd is None: - break - formats.append({ - 'format_id': '%s' % i, - 'format_note': fd['MimeType'].partition('/')[2], - 'ext': mimetype2ext(fd['MimeType']), - 'url': 
fd['Location'], - 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, - }) + for stream in presentation_data.get('Streams', []): + for fd in stream.get('VideoUrls', []): + formats.append({ + 'format_id': fd['MediaType'], + 'format_note': fd['MimeType'].partition('/')[2], + 'ext': mimetype2ext(fd['MimeType']), + 'url': fd['Location'], + 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, + }) self._sort_formats(formats) - slide_baseurl = compat_urlparse.urljoin( - url, extract_data('SlideBaseUrl')) - slide_template = slide_baseurl + re.sub( - r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate')) - slides = [] - last_slide_time = 0 - for i in itertools.count(1): - sd = extract_str('Slides[%d]' % i, default=None) - if sd is None: - break - timestamp = int_or_none(self._search_regex( - r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),', - sd, 'slide %s timestamp' % i, fatal=False)) - slides.append({ - 'url': slide_template % i, - 'duration': timestamp - last_slide_time, - }) - last_slide_time = timestamp - formats.append({ - 'format_id': 'slides', - 'protocol': 'slideshow', - 'url': json.dumps(slides), - 'preference': -10000, # Downloader not yet written - }) - self._sort_formats(formats) - - title = extract_data('Title') - description = extract_data('Description', fatal=False) - duration = int_or_none(extract_data( - 'Duration', fatal=False), scale=1000) - upload_date = unified_strdate(extract_data('AirDate', fatal=False)) - return { 'id': video_id, 'title': title, - 'description': description, + 'description': presentation_data.get('Description'), 'formats': formats, - 'upload_date': upload_date, - 'duration': duration, + 'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000), + 'duration': int_or_none(presentation_data.get('Duration'), 1000), } diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py new file mode 100644 index 000000000..d3aba58a2 
--- /dev/null +++ b/youtube_dl/extractor/sixplay.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + qualities, + int_or_none, + mimetype2ext, + determine_ext, +) + + +class SixPlayIE(InfoExtractor): + _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320', + 'md5': '42310bffe4ba3982db112b9cd3467328', + 'info_dict': { + 'id': '11495320', + 'ext': 'mp4', + 'title': 'Jamel et ses amis au Marrakech du rire 2015', + 'description': 'md5:ba2149d5c321d5201b78070ee839d872', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + clip_data = self._download_json( + 'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id, + video_id) + video_data = clip_data['videoInfo'] + + quality_key = qualities(['lq', 'sd', 'hq', 'hd']) + formats = [] + for source in clip_data['sources']: + source_type, source_url = source.get('type'), source.get('src') + if not source_url or source_type == 'hls/primetime': + continue + ext = mimetype2ext(source_type) or determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + source_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + elif ext == 'mp4': + quality = source.get('quality') + formats.append({ + 'url': source_url, + 'format_id': quality, + 'quality': quality_key(quality), + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'].strip(), + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'series': video_data.get('titlePgm'), + 'formats': formats, + } diff --git 
a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py index 05e1b02ad..fffc9aa22 100644 --- a/youtube_dl/extractor/skynewsarabia.py +++ b/youtube_dl/extractor/skynewsarabia.py @@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): - IE_NAME = 'skynewsarabia:video' + IE_NAME = 'skynewsarabia:article' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py new file mode 100644 index 000000000..9dc78c7d2 --- /dev/null +++ b/youtube_dl/extractor/skysports.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SkySportsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', + 'md5': 'c44a1db29f27daf9a0003e010af82100', + 'info_dict': { + 'id': '10328419', + 'ext': 'flv', + 'title': 'Bale: Its our time to shine', + 'description': 'md5:9fd1de3614d525f5addda32ac3c482c9', + }, + 'add_ie': ['Ooyala'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'ooyala:%s' % self._search_regex( + r'data-video-id="([^"]+)"', webpage, 'ooyala id'), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'ie_key': 'Ooyala', + } diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 
0b717a1e4..4967c1b77 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -9,6 +9,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + get_element_by_id, ) @@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor): bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) - description = self._html_search_regex( + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, 'description', fatal=False) @@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor): 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': description, + 'description': description.strip() if description else None, } diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 49e5d09ae..72fe66142 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,10 +8,7 @@ from ..compat import ( compat_str, compat_urllib_parse_urlencode, ) -from ..utils import ( - ExtractorError, - sanitized_Request, -) +from ..utils import ExtractorError class SohuIE(InfoExtractor): @@ -96,15 +93,10 @@ class SohuIE(InfoExtractor): else: base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - req = sanitized_Request(base_data_url + vid_id) - - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - req.add_header('Ytdl-request-proxy', cn_verification_proxy) - return self._download_json( - req, video_id, - 'Downloading JSON data for %s' % vid_id) + base_data_url + vid_id, video_id, + 'Downloading JSON data for %s' % vid_id, + headers=self.geo_verification_headers()) mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 39a7aaf9d..3c552807e 100644 
--- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,8 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from .spiegeltv import SpiegeltvIE +from ..compat import compat_urlparse +from ..utils import ( + extract_attributes, + unified_strdate, + get_element_by_attribute, +) class SpiegelIE(InfoExtractor): @@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, + 'upload_date': '20130311', }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', @@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, + 'upload_date': '20131115', }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', @@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor): 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. 
Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', + 'upload_date': '20140904', } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor): if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - title = re.sub(r'\s+', ' ', self._html_search_regex( - r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>', - webpage, 'title')) - description = self._html_search_meta('description', webpage, 'description') + video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) + + title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) + description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], @@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': description, + 'description': description.strip() if description else None, 'duration': duration, + 'upload_date': unified_strdate(video_data.get('data-video-date')), 'formats': formats, } @@ -104,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', 'description': 're:^Patrick Kämnitz gehört.{100,}', + 'upload_date': '20140825', }, }, { 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index 74d01183f..409d50304 100644 --- a/youtube_dl/extractor/srmediathek.py +++ 
b/youtube_dl/extractor/srmediathek.py @@ -9,8 +9,9 @@ from ..utils import ( class SRMediathekIE(ARDMediathekIE): + IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' - _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', @@ -34,7 +35,9 @@ class SRMediathekIE(ARDMediathekIE): # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'] + }, { + 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index d5c852f52..0f8782d03 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor): episode = self._parse_json( js_to_json(self._search_regex( - r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), display_id)['config']['episode'] title = unescapeHTML(episode['title']) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 6526a6345..1c04dfb7b 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -7,13 +7,13 @@ from .common import InfoExtractor from ..utils import ( determine_ext, dict_get, + int_or_none, + try_get, ) class SVTBaseIE(InfoExtractor): - def _extract_video(self, info, video_id): - video_info = self._get_video_info(info) - + def _extract_video(self, video_info, video_id): formats = [] for vr in video_info['videoReferences']: player_type = vr.get('playerType') @@ -37,6 +37,8 @@ class SVTBaseIE(InfoExtractor): 'format_id': player_type, 'url': vurl, }) + if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + 
self.raise_geo_restricted('This video is only available in Sweden') self._sort_formats(formats) subtitles = {} @@ -52,15 +54,32 @@ class SVTBaseIE(InfoExtractor): subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) - duration = video_info.get('materialLength') - age_limit = 18 if video_info.get('inappropriateForChildren') else 0 + title = video_info.get('title') + + series = video_info.get('programTitle') + season_number = int_or_none(video_info.get('season')) + episode = video_info.get('episodeTitle') + episode_number = int_or_none(video_info.get('episodeNumber')) + + duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) + age_limit = None + adult = dict_get( + video_info, ('inappropriateForChildren', 'blockedForChildren'), + skip_false_values=False) + if adult is not None: + age_limit = 18 if adult else 0 return { 'id': video_id, + 'title': title, 'formats': formats, 'subtitles': subtitles, 'duration': duration, 'age_limit': age_limit, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, } @@ -85,9 +104,6 @@ class SVTIE(SVTBaseIE): if mobj: return mobj.group('url') - def _get_video_info(self, info): - return info['video'] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) widget_id = mobj.group('widget_id') @@ -97,15 +113,15 @@ class SVTIE(SVTBaseIE): 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), article_id) - info_dict = self._extract_video(info, article_id) + info_dict = self._extract_video(info['video'], article_id) info_dict['title'] = info['context']['title'] return info_dict class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 
'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', 'info_dict': { @@ -121,25 +137,50 @@ class SVTPlayIE(SVTBaseIE): }] }, }, - } - - def _get_video_info(self, info): - return info['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'] + }, { + # geo restricted to Sweden + 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', + 'only_matching': True, + }, { + 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( - r'root\["__svtplay"\]\s*=\s*([^;]+);', webpage, 'embedded data'), video_id) + data = self._parse_json( + self._search_regex( + r'root\["__svtplay"\]\s*=\s*([^;]+);', + webpage, 'embedded data', default='{}'), + video_id, fatal=False) thumbnail = self._og_search_thumbnail(webpage) - info_dict = self._extract_video(data, video_id) - info_dict.update({ - 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], - 'thumbnail': thumbnail, - }) - - return info_dict + if data: + video_info = try_get( + data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'], + dict) + if video_info: + info_dict = self._extract_video(video_info, video_id) + info_dict.update({ + 'title': data['context']['dispatcher']['stores']['MetaStore']['title'], + 'thumbnail': thumbnail, + }) + return info_dict + + video_id = self._search_regex( + r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', + webpage, 'video id', default=None) + + if video_id: + data = self._download_json( + 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id) + info_dict = self._extract_video(data, video_id) + if not info_dict.get('title'): + info_dict['title'] = re.sub( + r'\s*\|\s*.+?$', '', + info_dict.get('episode') or 
self._og_search_title(webpage)) + return info_dict diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 4b4b740b4..2ecfd0405 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,50 +1,41 @@ # coding: utf-8 from __future__ import unicode_literals -import json +from .mitele import MiTeleBaseIE -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, - compat_urlparse, -) -from ..utils import ( - get_element_by_attribute, - parse_duration, - strip_jsonp, -) - -class TelecincoIE(InfoExtractor): +class TelecincoIE(MiTeleBaseIE): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', - 'md5': '5cbef3ad5ef17bf0d21570332d140729', + 'md5': '8d7b2d5f699ee2709d992a63d5cd1712', 'info_dict': { - 'id': 'MDSVID20141015_0058', + 'id': 'JEA5ijCnF6p5W08A1rNKn7', 'ext': 'mp4', - 'title': 'Con Martín Berasategui, hacer un bacalao al ...', + 'title': 'Bacalao con kokotxas al pil-pil', + 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', 'duration': 662, }, }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '0a5b9f3cc8b074f50a0578f823a12694', + 'md5': '284393e5387b3b947b77c613ef04749a', 'info_dict': { - 'id': 'MDSVID20150916_0128', + 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', - 'title': '¿Quién es este ex futbolista con el que hablan ...', + 'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?', + 'description': 'md5:a62ecb5f1934fc787107d7b9a2262805', 'duration': 79, }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 
'ad1bfaaba922dd4a295724b05b68f86a', + 'md5': '749afab6ea5a136a8806855166ae46a2', 'info_dict': { - 'id': 'MDSVID20150513_0220', + 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', 'title': '#DOYLACARA. Con la trata no hay trato', + 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, }, { @@ -56,40 +47,16 @@ class TelecincoIE(InfoExtractor): }] def _real_extract(self, url): - episode = self._match_id(url) - webpage = self._download_webpage(url, episode) - embed_data_json = self._search_regex( - r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', - ).replace('\'', '"') - embed_data = json.loads(embed_data_json) - - domain = embed_data['mediaUrl'] - if not domain.startswith('http'): - # only happens in telecinco.es videos - domain = 'http://' + domain - info_url = compat_urlparse.urljoin( - domain, - compat_urllib_parse_unquote(embed_data['flashvars']['host']) - ) - info_el = self._download_xml(info_url, episode).find('./video/info') - - video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse_urlencode({'id': video_link}) - token_info = self._download_json( - embed_data['flashvars']['ov_tk'] + '?' 
+ token_query, - episode, - transform_source=strip_jsonp - ) - formats = self._extract_m3u8_formats( - token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') - self._sort_formats(formats) - - return { - 'id': embed_data['videoId'], - 'display_id': episode, - 'title': info_el.find('title').text, - 'formats': formats, - 'description': get_element_by_attribute('class', 'text', webpage), - 'thumbnail': info_el.find('thumb').text, - 'duration': parse_duration(info_el.find('duration').text), - } + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title') + info = self._get_player_info(url, webpage) + info.update({ + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta( + ['og:description', 'twitter:description'], + webpage, 'title', fatal=False), + }) + return info diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 6c848dc6f..e595c4a69 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -48,6 +48,6 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1', + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 07d222ae3..bb3efc4ea 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -6,6 +6,7 @@ import time import hmac import binascii import hashlib +import netrc from .once import OnceIE @@ -24,6 +25,9 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, + unescapeHTML, + urlencode_postdata, + unified_timestamp, ) default_ns = 
'http://www.w3.org/2005/SMIL21/Language' @@ -62,10 +66,11 @@ class ThePlatformBaseIE(OnceIE): return formats, subtitles - def get_metadata(self, path, video_id): + def _download_theplatform_metadata(self, path, video_id): info_url = 'http://link.theplatform.com/s/%s?format=preview' % path - info = self._download_json(info_url, video_id) + return self._download_json(info_url, video_id) + def _parse_theplatform_metadata(self, info): subtitles = {} captions = info.get('captions') if isinstance(captions, list): @@ -86,6 +91,10 @@ class ThePlatformBaseIE(OnceIE): 'uploader': info.get('billingCode'), } + def _extract_theplatform_metadata(self, path, video_id): + info = self._download_theplatform_metadata(path, video_id) + return self._parse_theplatform_metadata(info) + class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) @@ -158,6 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781', 'only_matching': True, }] + _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' @classmethod def _extract_urls(cls, webpage): @@ -192,6 +202,96 @@ class ThePlatformIE(ThePlatformBaseIE): sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) + + mvpd_headers = { + 'ap_42': 'anonymous', + 'ap_11': 'Linux i686', + 'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', + 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', + } + + guid = xml_text(resource, 'guid') + requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} + authn_token = requestor_info.get('authn_token') + if authn_token: + token_expires 
= unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', '')) + if token_expires and token_expires >= time.time(): + authn_token = None + if not authn_token: + # TODO add support for other TV Providers + mso_id = 'DTV' + login_info = netrc.netrc().authenticators(mso_id) + if not login_info: + return None + + def post_form(form_page, note, data={}): + post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') + return self._download_webpage( + post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + provider_redirect_page = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + provider_login_page = post_form( + provider_redirect_page, 'Downloading Provider Login Page') + mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { + 'username': login_info[0], + 'password': login_info[2], + }) + post_form(mvpd_confirm_page, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + authn_token = unescapeHTML(xml_text(session, 'authnToken')) + requestor_info['authn_token'] = authn_token + self._downloader.cache.store('mvpd', requestor_id, requestor_info) + + authz_token = requestor_info.get(guid) + if not authz_token: + authorize = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, + 'Retrieving Authorization Token', data=urlencode_postdata({ + 'resource_id': resource, + 'requestor_id': requestor_id, + 'authentication_token': 
authn_token, + 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), + 'userMeta': '1', + }), headers=mvpd_headers) + authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) + requestor_info[guid] = authz_token + self._downloader.cache.store('mvpd', requestor_id, requestor_info) + + mvpd_headers.update({ + 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), + 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), + }) + + return self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', + video_id, 'Retrieving Media Token', data=urlencode_postdata({ + 'authz_token': authz_token, + 'requestor_id': requestor_id, + 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), + 'hashed_guid': 'false', + }), headers=mvpd_headers) + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -265,7 +365,7 @@ class ThePlatformIE(ThePlatformBaseIE): formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) self._sort_formats(formats) - ret = self.get_metadata(path, video_id) + ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ 'id': video_id, @@ -339,7 +439,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): timestamp = int_or_none(entry.get('media$availableDate'), scale=1000) categories = [item['media$name'] for item in entry.get('media$categories', [])] - ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) + ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id) subtitles = self._merge_subtitles(subtitles, ret['subtitles']) ret.update({ 'id': video_id, diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py index c77a07989..a0bc12c81 100644 --- a/youtube_dl/extractor/threeqsdn.py +++ b/youtube_dl/extractor/threeqsdn.py @@ -92,12 +92,11 @@ class ThreeQSDNIE(InfoExtractor): if not item_url or item_url in urls: return 
urls.add(item_url) - type_ = item.get('type') - ext = determine_ext(item_url, default_ext=None) - if type_ == 'application/dash+xml' or ext == 'mpd': + ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None) + if ext == 'mpd': formats.extend(self._extract_mpd_formats( item_url, video_id, mpd_id='mpd', fatal=False)) - elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8': + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( item_url, video_id, 'mp4', entry_protocol='m3u8' if live else 'm3u8_native', @@ -111,7 +110,7 @@ class ThreeQSDNIE(InfoExtractor): formats.append({ 'url': item_url, 'format_id': item.get('quality'), - 'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext, + 'ext': 'mp4' if item_url.startswith('rtsp') else ext, 'vcodec': 'none' if stream_type == 'audio' else None, }) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 4797d1310..54c2d0aa6 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -1,74 +1,41 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unified_strdate, -) +from ..utils import int_or_none class TouTvIE(InfoExtractor): IE_NAME = 'tou.tv' - _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))' + _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+/S[0-9]+E[0-9]+)' _TEST = { - 'url': 'http://www.tou.tv/30-vies/S04E41', + 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', 'info_dict': { - 'id': '30-vies_S04E41', + 'id': '122017', 'ext': 'mp4', - 'title': '30 vies Saison 4 / Épisode 41', - 'description': 'md5:da363002db82ccbe4dafeb9cab039b09', - 'age_limit': 8, - 'uploader': 'Groupe des Nouveaux Médias', - 'duration': 1296, - 'upload_date': '20131118', - 'thumbnail': 
'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', + 'title': 'Saison 2015 Épisode 17', + 'description': 'La photo de famille 2', + 'upload_date': '20100717', }, 'params': { - 'skip_download': True, # Requires rtmpdump + # m3u8 download + 'skip_download': True, }, - 'skip': 'Only available in Canada' } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - mediaId = self._search_regex( - r'"idMedia":\s*"([^"]+)"', webpage, 'media ID') - - streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId - streams_doc = self._download_xml( - streams_url, video_id, note='Downloading stream list') - - video_url = next(n.text - for n in streams_doc.findall('.//choice/url') - if '//ad.doubleclick' not in n.text) - if video_url.endswith('/Unavailable.flv'): - raise ExtractorError( - 'Access to this video is blocked from outside of Canada', - expected=True) - - duration_str = self._html_search_meta( - 'video:duration', webpage, 'duration') - duration = int(duration_str) if duration_str else None - upload_date_str = self._html_search_meta( - 'video:release_date', webpage, 'upload date') - upload_date = unified_strdate(upload_date_str) if upload_date_str else None + path = self._match_id(url) + metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) + video_id = metadata['IdMedia'] + details = metadata['Details'] + title = details['OriginalTitle'] return { + '_type': 'url_transparent', + 'url': 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id), 'id': video_id, - 'title': self._og_search_title(webpage), - 'url': video_url, - 'description': self._og_search_description(webpage), - 'uploader': self._dc_search_uploader(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'age_limit': self._media_rating_search(webpage), - 'duration': duration, - 'upload_date': upload_date, - 'ext': 'mp4', + 
'title': title, + 'thumbnail': details.get('ImageUrl'), + 'duration': int_or_none(details.get('LengthInSeconds')), } diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index a4997cb89..5070082da 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -4,6 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + determine_ext, + clean_html, + get_element_by_attribute, + ExtractorError, +) class TVPIE(InfoExtractor): @@ -21,7 +27,7 @@ class TVPIE(InfoExtractor): }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'c3b15ed1af288131115ff17a17c19dda', + 'md5': 'b0005b542e5b4de643a9690326ab1257', 'info_dict': { 'id': '17916176', 'ext': 'mp4', @@ -53,6 +59,11 @@ class TVPIE(InfoExtractor): webpage = self._download_webpage( 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) + error_massage = get_element_by_attribute('class', 'msg error', webpage) + if error_massage: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error_massage)), expected=True) + title = self._search_regex( r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', webpage, 'title', group='title') @@ -66,24 +77,50 @@ class TVPIE(InfoExtractor): r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None) - if not video_url: + r'0:{src:([\'"])(?P<url>.*?)\1', webpage, + 'formats', group='url', default=None) + if not video_url or 'material_niedostepny.mp4' in video_url: video_url = self._download_json( 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, video_id)['video_url'] - ext = video_url.rsplit('.', 1)[-1] - if ext != 'ism/manifest': - if '/' in ext: - ext = 'mp4' + formats = [] + video_url_base = self._search_regex( + r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', + 
video_url, 'video base url', default=None) + if video_url_base: + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + # formats.extend(self._extract_mpd_formats( + # video_url_base + '.ism/video.mpd', + # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_f4m_formats( + video_url_base + '.ism/video.f4m', + video_id, f4m_id='hds', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + video_url_base + '.ism/video.m3u8', video_id, + 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + formats.extend(m3u8_formats) + for i, m3u8_format in enumerate(m3u8_formats, 2): + http_url = '%s-%d.mp4' % (video_url_base, i) + if self._is_valid_url(http_url, video_id): + f = m3u8_format.copy() + f.update({ + 'url': http_url, + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: formats = [{ 'format_id': 'direct', 'url': video_url, - 'ext': ext, + 'ext': determine_ext(video_url, 'mp4'), }] - else: - m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url) - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') self._sort_formats(formats) diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py index f3198fb85..7a9386cde 100644 --- a/youtube_dl/extractor/tweakers.py +++ b/youtube_dl/extractor/tweakers.py @@ -1,25 +1,62 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_ext, + mimetype2ext, +) class TweakersIE(InfoExtractor): _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)' _TEST = { 'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html', - 'md5': '3147e4ddad366f97476a93863e4557c8', + 'md5': 'fe73e417c093a788e0160c4025f88b15', 
'info_dict': { 'id': '9926', 'ext': 'mp4', 'title': 'New Nintendo 3DS XL - Op alle fronten beter', - 'description': 'md5:f97324cc71e86e11c853f0763820e3ba', + 'description': 'md5:3789b21fed9c0219e9bcaacd43fab280', 'thumbnail': 're:^https?://.*\.jpe?g$', 'duration': 386, + 'uploader_id': 's7JeEm', } } def _real_extract(self, url): - playlist_id = self._match_id(url) - entries = self._extract_xspf_playlist( - 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id) - return self.playlist_result(entries, playlist_id) + video_id = self._match_id(url) + video_data = self._download_json( + 'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id, + video_id)['items'][0] + + title = video_data['title'] + + formats = [] + for location in video_data.get('locations', {}).get('progressive', []): + format_id = location.get('label') + width = int_or_none(location.get('width')) + height = int_or_none(location.get('height')) + for source in location.get('sources', []): + source_url = source.get('src') + if not source_url: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_url) + formats.append({ + 'format_id': format_id, + 'url': source_url, + 'width': width, + 'height': height, + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('poster'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': video_data.get('account'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 20919774d..67b1277cc 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -29,7 +29,7 @@ class TwitchBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' - _USHER_BASE = 'http://usher.twitch.tv' + _USHER_BASE = 'https://usher.ttvnw.net' _LOGIN_URL = 
'http://www.twitch.tv/login' _NETRC_MACHINE = 'twitch' diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py new file mode 100644 index 000000000..ce3bf6b02 --- /dev/null +++ b/youtube_dl/extractor/urplay.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class URPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde', + 'md5': '15ca67b63fd8fb320ac2bcd854bad7b6', + 'info_dict': { + 'id': '190031', + 'ext': 'mp4', + 'title': 'Tripp, Trapp, Träd : Sovkudde', + 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + urplayer_data = self._parse_json(self._search_regex( + r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) + host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + + formats = [] + for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): + file_rtmp = urplayer_data.get('file_rtmp' + quality_attr) + if file_rtmp: + formats.append({ + 'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp), + 'format_id': quality + '-rtmp', + 'ext': 'flv', + 'preference': preference, + }) + file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + if file_http: + file_http_base_url = 'http://%s/%s' % (host, file_http) + formats.extend(self._extract_f4m_formats( + file_http_base_url + 'manifest.f4m', video_id, + preference, '%s-hds' % quality, fatal=False)) + formats.extend(self._extract_m3u8_formats( + file_http_base_url + 'playlist.m3u8', video_id, 'mp4', + 'm3u8_native', preference, '%s-hls' % quality, fatal=False)) + self._sort_formats(formats) + + subtitles = {} + for subtitle in 
urplayer_data.get('subtitles', []): + subtitle_url = subtitle.get('file') + kind = subtitle.get('kind') + if subtitle_url or kind and kind != 'captions': + continue + subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ + 'url': subtitle_url, + }) + + return { + 'id': video_id, + 'title': urplayer_data['title'], + 'description': self._og_search_description(webpage), + 'thumbnail': urplayer_data.get('image'), + 'series': urplayer_data.get('series_title'), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py new file mode 100644 index 000000000..e7ac5a842 --- /dev/null +++ b/youtube_dl/extractor/vidbit.py @@ -0,0 +1,84 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + remove_end, + unified_strdate, +) + + +class VidbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2', + 'md5': '1a34b7f14defe3b8fafca9796892924d', + 'info_dict': { + 'id': 'jkL2yDOEq2', + 'ext': 'mp4', + 'title': 'Intro to VidBit', + 'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7', + 'thumbnail': 're:https?://.*\.jpg$', + 'upload_date': '20160618', + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id) + + video_url, title = [None] * 2 + + config = self._parse_json(self._search_regex( + r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'), + video_id, transform_source=js_to_json) + if config: + if config.get('file'): + video_url = compat_urlparse.urljoin(url, config['file']) + title = 
config.get('title') + + if not video_url: + video_url = compat_urlparse.urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video URL', group='url')) + + if not title: + title = remove_end( + self._html_search_regex( + (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'), + webpage, 'title', default=None) or self._og_search_title(webpage), + ' - VidBit') + + description = self._html_search_meta( + ('description', 'og:description', 'twitter:description'), + webpage, 'description') + + upload_date = unified_strdate(self._html_search_meta( + 'datePublished', webpage, 'upload date')) + + view_count = int_or_none(self._search_regex( + r'<strong>(\d+)</strong> views', + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'id=["\']cmt_num["\'][^>]*>\((\d+)\)', + webpage, 'comment count', fatal=False)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c52986af6..7e854f326 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,6 +16,7 @@ from ..utils import ( ExtractorError, InAdvancePagedList, int_or_none, + NO_DEFAULT, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -56,6 +57,26 @@ class VimeoBaseInfoExtractor(InfoExtractor): self._set_vimeo_cookie('vuid', vuid) self._download_webpage(login_request, None, False, 'Wrong login info') + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + token, vuid = self._extract_xsrft_and_vuid(webpage) + data = urlencode_postdata({ + 'password': password, + 
'token': token, + }) + if url.startswith('http://'): + # vimeo only supports https now, but the user can give an http url + url = url.replace('http://', 'https://') + password_request = sanitized_Request(url + '/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Referer', url) + self._set_vimeo_cookie('vuid', vuid) + return self._download_webpage( + password_request, video_id, + 'Verifying the password', 'Wrong password') + def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', @@ -146,7 +167,7 @@ class VimeoIE(VimeoBaseInfoExtractor): \. )? vimeo(?P<pro>pro)?\.com/ - (?!channels/[^/?#]+/?(?:$|[?#])|[^/]+/review/|(?:album|ondemand)/) + (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:.*?/)? (?: (?: @@ -227,8 +248,6 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'http://vimeo.com/channels/keypeele/75629013', 'md5': '2f86a05afe9d7abc0b9126d229bbe15d', - 'note': 'Video is freely available via original URL ' - 'and protected with password when accessed via http://vimeo.com/75629013', 'info_dict': { 'id': '75629013', 'ext': 'mp4', @@ -272,7 +291,7 @@ class VimeoIE(VimeoBaseInfoExtractor): { # contains original format 'url': 'https://vimeo.com/33951933', - 'md5': '53c688fa95a55bf4b7293d37a89c5c53', + 'md5': '2d9f5475e0537f013d0073e812ab89e6', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -285,6 +304,29 @@ class VimeoIE(VimeoBaseInfoExtractor): }, }, { + # only available via https://vimeo.com/channels/tributes/6213729 and + # not via https://vimeo.com/6213729 + 'url': 'https://vimeo.com/channels/tributes/6213729', + 'info_dict': { + 'id': '6213729', + 'ext': 'mp4', + 'title': 'Vimeo Tribute: The Shining', + 'uploader': 'Casey Donahue', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue', + 'uploader_id': 'caseydonahue', + 'upload_date': '20090821', + 
'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'], + }, + { + 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741', + 'only_matching': True, + }, + { 'url': 'https://vimeo.com/109815029', 'note': 'Video not completely processed, "failed" seed status', 'only_matching': True, @@ -294,6 +336,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'only_matching': True, }, { + 'url': 'https://vimeo.com/album/2632481/video/79010983', + 'only_matching': True, + }, + { # source file returns 403: Forbidden 'url': 'https://vimeo.com/7809605', 'only_matching': True, @@ -318,26 +364,11 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: return mobj.group(1) - - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + # Look more for non-standard embedded Vimeo player + mobj = re.search( + r'<video[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage) + if mobj: + return mobj.group('url') def _verify_player_video_password(self, url, video_id): password = 
self._downloader.params.get('videopassword') @@ -369,7 +400,7 @@ class VimeoIE(VimeoBaseInfoExtractor): orig_url = url if mobj.group('pro') or mobj.group('player'): url = 'https://player.vimeo.com/video/' + video_id - else: + elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information @@ -630,8 +661,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): webpage = self._login_list_password(page_url, list_id, webpage) yield self._extract_list_title(webpage) - for video_id in re.findall(r'id="clip_(\d+?)"', webpage): - yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo') + # Try extracting href first since not all videos are available via + # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) + clips = re.findall( + r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage) + if clips: + for video_id, video_url in clips: + yield self.url_result( + compat_urlparse.urljoin(base_url, video_url), + VimeoIE.ie_key(), video_id=video_id) + # More relaxed fallback + else: + for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): + yield self.url_result( + 'https://vimeo.com/%s' % video_id, + VimeoIE.ie_key(), video_id=video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: break @@ -668,7 +712,7 @@ class VimeoUserIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE): IE_NAME = 'vimeo:album' - _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)' + _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' _TESTS = [{ 'url': 'https://vimeo.com/album/2632481', @@ -688,6 +732,13 @@ class VimeoAlbumIE(VimeoChannelIE): 'params': { 'videopassword': 'youtube-dl', } + }, { + 'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail', + 'only_matching': True, + }, { + # TODO: respect page number + 'url': 
'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail', + 'only_matching': True, }] def _page_url(self, base_url, pagenum): @@ -746,12 +797,39 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'uploader_id': 'user22258446', } + }, { + 'note': 'Password protected', + 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde', + 'info_dict': { + 'id': '138823582', + 'ext': 'mp4', + 'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1', + 'uploader': 'TMB', + 'uploader_id': 'user37284429', + }, + 'params': { + 'videopassword': 'holygrail', + }, }] + def _real_initialize(self): + self._login() + + def _get_config_url(self, webpage_url, video_id, video_password_verified=False): + webpage = self._download_webpage(webpage_url, video_id) + config_url = self._html_search_regex( + r'data-config-url="([^"]+)"', webpage, 'config URL', + default=NO_DEFAULT if video_password_verified else None) + if config_url is None: + self._verify_video_password(webpage_url, video_id, webpage) + config_url = self._get_config_url( + webpage_url, video_id, video_password_verified=True) + return config_url + def _real_extract(self, url): video_id = self._match_id(url) - config = self._download_json( - 'https://player.vimeo.com/video/%s/config' % video_id, video_id) + config_url = self._get_config_url(url, video_id) + config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) self._vimeo_sort_formats(info_dict['formats']) info_dict['id'] = video_id diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index a6a6cc479..0183f052a 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -24,6 +24,7 @@ class VineIE(InfoExtractor): 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -39,6 +40,7 @@ class VineIE(InfoExtractor): 'upload_date': '20140815', 
'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -54,6 +56,7 @@ class VineIE(InfoExtractor): 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -71,6 +74,7 @@ class VineIE(InfoExtractor): 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', + 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, @@ -86,10 +90,12 @@ class VineIE(InfoExtractor): data = self._parse_json( self._search_regex( - r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id, + r'window\.POST_DATA\s*=\s*({.+?});\s*</script>', webpage, 'vine data'), video_id) + data = data[list(data.keys())[0]] + formats = [{ 'format_id': '%(format)s-%(rate)s' % f, 'vcodec': f.get('format'), @@ -109,6 +115,7 @@ class VineIE(InfoExtractor): 'upload_date': unified_strdate(data.get('created')), 'uploader': username, 'uploader_id': data.get('userIdStr'), + 'view_count': int_or_none(data.get('loops', {}).get('count')), 'like_count': int_or_none(data.get('likes', {}).get('count')), 'comment_count': int_or_none(data.get('comments', {}).get('count')), 'repost_count': int_or_none(data.get('reposts', {}).get('count')), diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 79c819bc3..758d9c86b 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re import json +import sys from .common import InfoExtractor from ..compat import compat_str @@ -10,7 +11,6 @@ from ..utils import ( ExtractorError, int_or_none, orderedSet, - sanitized_Request, str_to_int, unescapeHTML, unified_strdate, @@ -27,12 +27,12 @@ class VKIE(InfoExtractor): https?:// (?: (?: - (?:m\.)?vk\.com/video_| + (?:(?:m|new)\.)?vk\.com/video_| (?:www\.)?daxab.com/ ) 
ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| (?: - (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| + (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video| (?:www\.)?daxab.com/embed/ ) (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? @@ -182,6 +182,10 @@ class VKIE(InfoExtractor): # pladform embed 'url': 'https://vk.com/video-76116461_171554880', 'only_matching': True, + }, + { + 'url': 'http://new.vk.com/video205387401_165548505', + 'only_matching': True, } ] @@ -190,7 +194,7 @@ class VKIE(InfoExtractor): if username is None: return - login_page = self._download_webpage( + login_page, url_handle = self._download_webpage_handle( 'https://vk.com', None, 'Downloading login page') login_form = self._hidden_inputs(login_page) @@ -200,11 +204,26 @@ class VKIE(InfoExtractor): 'pass': password.encode('cp1251'), }) - request = sanitized_Request( - 'https://login.vk.com/?act=login', - urlencode_postdata(login_form)) + # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header + # and expects the first one to be set rather than second (see + # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). + # As of RFC6265 the newer one cookie should be set into cookie store + # what actually happens. + # We will workaround this VK issue by resetting the remixlhk cookie to + # the first one manually. 
+ cookies = url_handle.headers.get('Set-Cookie') + if sys.version_info[0] >= 3: + cookies = cookies.encode('iso-8859-1') + cookies = cookies.decode('utf-8') + remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) + if remixlhk: + value, domain = remixlhk.groups() + self._set_cookie(domain, 'remixlhk', value) + login_page = self._download_webpage( - request, None, note='Logging in as %s' % username) + 'https://login.vk.com/?act=login', None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form)) if re.search(r'onLoginFailed', login_page): raise ExtractorError( @@ -339,7 +358,7 @@ class VKIE(InfoExtractor): class VKUserVideosIE(InfoExtractor): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 'url': 'http://vk.com/videos205387401', @@ -354,6 +373,12 @@ class VKUserVideosIE(InfoExtractor): }, { 'url': 'http://vk.com/videos-97664626?section=all', 'only_matching': True, + }, { + 'url': 'http://m.vk.com/videos205387401', + 'only_matching': True, + }, { + 'url': 'http://new.vk.com/videos205387401', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 8e35f24e8..bec7ab327 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -25,7 +25,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1414271750.949, 'upload_date': '20141025', 'duration': 929, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # sporza.be { @@ -39,7 +40,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1413835980.560, 'upload_date': '20141020', 'duration': 3238, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # cobra.be { @@ -53,16 +55,39 @@ class VRTIE(InfoExtractor): 'timestamp': 1413967500.494, 'upload_date': 
'20141022', 'duration': 661, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { # YouTube video 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', - 'only_matching': True, + 'md5': 'b8b93da1df1cea6c8556255a796b7d61', + 'info_dict': { + 'id': 'Wji-BZ0oCwg', + 'ext': 'mp4', + 'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer', + 'description': 'md5:8e468944dce15567a786a67f74262583', + 'uploader': 'Star Wars', + 'uploader_id': 'starwars', + 'upload_date': '20160407', + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'only_matching': True, + 'md5': '', + 'info_dict': { + 'id': '2377055', + 'ext': 'mp4', + 'title': 'Cafe Derby', + 'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.', + 'upload_date': '20150626', + 'timestamp': 1435305240.769, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } } ] @@ -98,6 +123,32 @@ class VRTIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + src.replace('playlist.m3u8', 'manifest.f4m'), + video_id, f4m_id='hds', fatal=False)) + if 'data-video-geoblocking="true"' not in webpage: + rtmp_formats = self._extract_smil_formats( + src.replace('playlist.m3u8', 'jwplayer.smil'), + video_id, fatal=False) + formats.extend(rtmp_formats) + for rtmp_format in rtmp_formats: + rtmp_format_c = rtmp_format.copy() + rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtmp_format_c['play_path'] + del rtmp_format_c['ext'] + http_format = rtmp_format_c.copy() + http_format.update({ + 'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'http'), + 
'protocol': 'http', + }) + rtsp_format = rtmp_format_c.copy() + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([http_format, rtsp_format]) else: formats.extend(self._extract_f4m_formats( '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False)) diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 5a41f8ffa..bcb140305 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -6,17 +6,23 @@ from ..compat import compat_urllib_parse_unquote class XNXXIE(InfoExtractor): - _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)' - _TEST = { - 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', - 'md5': '0831677e2b4761795f68d417e0b7b445', + _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' + _TESTS = [{ + 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', + 'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0', 'info_dict': { - 'id': '1135332', + 'id': '55awb78', 'ext': 'flv', - 'title': 'lida » Naked Funny Actress (5)', + 'title': 'Skyrim Test Video', 'age_limit': 18, - } - } + }, + }, { + 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', + 'only_matching': True, + }, { + 'url': 'http://www.xnxx.com/video-55awb78/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 4075b8a4f..83bc1fef2 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -4,17 +4,23 @@ import itertools import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, orderedSet, + parse_duration, sanitized_Request, str_to_int, ) class XTubeIE(InfoExtractor): - _VALID_URL = 
r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)' + _VALID_URL = r'''(?x) + (?: + xtube:| + https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-) + ) + (?P<id>[^/?&#]+) + ''' _TESTS = [{ # old URL schema @@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor): 'description': 'contains:an ET kind of thing', 'uploader': 'greenshowers', 'duration': 450, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, } }, { @@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1') webpage = self._download_webpage(req, display_id) - flashvars = self._parse_json( - self._search_regex( - r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'), - video_id)['flashvars'] - - title = flashvars.get('title') or self._search_regex( - r'<h1>([^<]+)</h1>', webpage, 'title') - video_url = compat_urllib_parse_unquote(flashvars['video_url']) - duration = int_or_none(flashvars.get('video_duration')) - - uploader = self._search_regex( - r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', - webpage, 'uploader', fatal=False) + sources = self._parse_json(self._search_regex( + r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id) + + formats = [] + for format_id, format_url in sources.items(): + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + self._sort_formats(formats) + + title = self._search_regex( + (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') description = self._search_regex( r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) + uploader = self._search_regex( + (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', + r'<span[^>]+class="nickname"[^>]*>([^<]+)'), + webpage, 'uploader', fatal=False) + duration = parse_duration(self._search_regex( + r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>', + webpage, 
'duration', fatal=False)) view_count = str_to_int(self._search_regex( r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>', webpage, 'view count', fatal=False)) @@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, 'description': description, 'uploader': uploader, @@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor): 'view_count': view_count, 'comment_count': comment_count, 'age_limit': 18, + 'formats': formats, } diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 0be8932ad..a66daee46 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -68,6 +68,20 @@ class XuiteIE(InfoExtractor): }, 'skip': 'Video removed', }, { + # Video with encoded media id + # from http://forgetfulbc.blogspot.com/2016/06/date.html + 'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0', + 'info_dict': { + 'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==', + 'ext': 'mp4', + 'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)', + 'description': 'md5:f0abdcb69df300f522a5442ef3146f2a', + 'timestamp': 1466160960, + 'upload_date': '20160617', + 'uploader': 'B.C. 
& Lowy', + 'uploader_id': '232279340', + }, + }, { 'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9', 'only_matching': True, }] @@ -80,10 +94,9 @@ class XuiteIE(InfoExtractor): def base64_encode_utf8(data): return base64.b64encode(data.encode('utf-8')).decode('utf-8') - def _extract_flv_config(self, media_id): - base64_media_id = self.base64_encode_utf8(media_id) + def _extract_flv_config(self, encoded_media_id): flv_config = self._download_xml( - 'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id, + 'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id, 'flv config') prop_dict = {} for prop in flv_config.findall('./property'): @@ -108,9 +121,14 @@ class XuiteIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error_msg), expected=True) - video_id = self._html_search_regex( - r'data-mediaid="(\d+)"', webpage, 'media id') - flv_config = self._extract_flv_config(video_id) + encoded_media_id = self._search_regex( + r'attributes\.name\s*=\s*"([^"]+)"', webpage, + 'encoded media id', default=None) + if encoded_media_id is None: + video_id = self._html_search_regex( + r'data-mediaid="(\d+)"', webpage, 'media id') + encoded_media_id = self.base64_encode_utf8(video_id) + flv_config = self._extract_flv_config(encoded_media_id) FORMATS = { 'audio': 'mp3', diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 927a964a4..b0679dfb7 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -19,6 +19,7 @@ from ..utils import ( mimetype2ext, ) +from .brightcove import BrightcoveNewIE from .nbc import NBCSportsVPlayerIE @@ -227,7 +228,12 @@ class YahooIE(InfoExtractor): # Look for NBCSports iframes nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) + + # Look for 
Brightcove New Studio embeds + bc_url = BrightcoveNewIE._extract_url(webpage) + if bc_url: + return self.url_result(bc_url, BrightcoveNewIE.ie_key()) # Query result is often embedded in webpage as JSON. Sometimes explicit requests # to video API results in a failure with geo restriction reason therefore using diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 147608ebe..e37f237c7 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -16,7 +16,6 @@ from ..compat import ( from ..utils import ( ExtractorError, get_element_by_attribute, - sanitized_Request, ) @@ -218,14 +217,10 @@ class YoukuIE(InfoExtractor): headers = { 'Referer': req_url, } + headers.update(self.geo_verification_headers()) self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') - req = sanitized_Request(req_url, headers=headers) - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - req.add_header('Ytdl-request-proxy', cn_verification_proxy) - - raw_data = self._download_json(req, video_id, note=note) + raw_data = self._download_json(req_url, video_id, note=note, headers=headers) return raw_data['data'] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 00dd602ff..8aa7dfc41 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -501,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'youtube_include_dash_manifest': True, 'format': '141', }, + 'skip': 'format 141 not served anymore', }, # DASH manifest with encrypted signature { @@ -517,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': '141/bestaudio[ext=m4a]', }, }, # JS player signature function name containing $ @@ -537,7 +538,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'params': { 'youtube_include_dash_manifest': True, - 'format': '141', + 'format': 
'141/bestaudio[ext=m4a]', }, }, # Controversy video @@ -618,7 +619,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic', 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympics', + 'uploader': 'Olympic', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', }, 'params': { @@ -671,7 +672,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', 'license': 'Standard YouTube License', - 'formats': 'mincount:33', + 'formats': 'mincount:32', }, }, # DASH manifest with segment_list @@ -691,7 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'youtube_include_dash_manifest': True, 'format': '135', # bestvideo - } + }, + 'skip': 'This live event has ended.', }, { # Multifeed videos (multiple cameras), URL is for Main Camera @@ -762,6 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', }, 'playlist_count': 2, + 'skip': 'Not multifeed anymore', }, { 'url': 'http://vid.plus/FlRa-iH7PGw', @@ -814,6 +817,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video does not exist.', }, { # Video licensed under Creative Commons @@ -1331,7 +1335,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? 
- class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> + class="[^"]*"[^>]*> [^<]+\.{3}\s* </a> ''', r'\1', video_description) @@ -1726,6 +1730,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } +class YoutubeSharedVideoIE(InfoExtractor): + _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?ci=(?P<id>[0-9A-Za-z_-]{11})' + IE_NAME = 'youtube:shared' + + _TEST = { + 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', + 'info_dict': { + 'id': 'uPDB5I9wfp8', + 'ext': 'webm', + 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', + 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', + 'upload_date': '20160219', + 'uploader': 'Pocoyo - Português (BR)', + 'uploader_id': 'PocoyoBrazil', + }, + 'add_ie': ['Youtube'], + 'params': { + # There are already too many Youtube downloads + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + real_video_id = self._html_search_meta( + 'videoId', webpage, 'YouTube video id', fatal=True) + + return self.url_result(real_video_id, YoutubeIE.ie_key()) + + class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: @@ -1941,10 +1978,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url)) + def _build_template_url(self, url, channel_id): + return self._TEMPLATE_URL % channel_id + def _real_extract(self, url): channel_id = self._match_id(url) - url = self._TEMPLATE_URL % channel_id + url = self._build_template_url(url, channel_id) # Channel by page listing is restricted to 35 pages of 30 items, i.e. 
1050 videos total (see #5778) # Workaround by extracting as a playlist if managed to obtain channel playlist URL @@ -1958,9 +1998,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): channel_playlist_id = self._html_search_meta( 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: - channel_playlist_id = self._search_regex( - r'data-(?:channel-external-|yt)id="([^"]+)"', - channel_page, 'channel id', default=None) + channel_url = self._html_search_meta( + ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), + channel_page, 'channel url', default=None) + if channel_url: + channel_playlist_id = self._search_regex( + r'vnd\.youtube://user/([0-9A-Za-z_-]+)', + channel_url, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( @@ -1983,20 +2027,39 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) + try: + next(self._entries(channel_page, channel_id)) + except StopIteration: + alert_message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', + channel_page, 'alert', default=None, group='alert') + if alert_message: + raise ExtractorError('Youtube said: %s' % alert_message, expected=True) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' + _VALID_URL = 
r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' _TESTS = [{ 'url': 'https://www.youtube.com/user/TheLinuxFoundation', 'playlist_mincount': 320, 'info_dict': { - 'title': 'TheLinuxFoundation', + 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', + 'title': 'Uploads from The Linux Foundation', + } + }, { + # Only available via https://www.youtube.com/c/12minuteathlete/videos + # but not https://www.youtube.com/user/12minuteathlete/videos + 'url': 'https://www.youtube.com/c/12minuteathlete/videos', + 'playlist_mincount': 249, + 'info_dict': { + 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', + 'title': 'Uploads from 12 Minute Athlete', } }, { 'url': 'ytuser:phihag', @@ -2004,6 +2067,13 @@ class YoutubeUserIE(YoutubeChannelIE): }, { 'url': 'https://www.youtube.com/c/gametrailers', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/gametrailers', + 'only_matching': True, + }, { + # This channel is not available. 
+ 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', + 'only_matching': True, }] @classmethod @@ -2016,6 +2086,10 @@ class YoutubeUserIE(YoutubeChannelIE): else: return super(YoutubeUserIE, cls).suitable(url) + def _build_template_url(self, url, channel_id): + mobj = re.match(self._VALID_URL, url) + return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) + class YoutubeLiveIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com live streams' diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a7440c582..9737f7002 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -232,7 +232,7 @@ class JSInterpreter(object): def extract_function(self, funcname): func_m = re.search( r'''(?x) - (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* \((?P<args>[^)]*)\)\s* \{(?P<code>[^}]+)\}''' % ( re.escape(funcname), re.escape(funcname), re.escape(funcname)), diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 99ce4131f..c4a85b2c0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -26,9 +26,11 @@ def parseOpts(overrideArguments=None): except IOError: return default # silently skip if file is not present try: - res = [] - for l in optionf: - res += compat_shlex_split(l, comments=True) + # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 + contents = optionf.read() + if sys.version_info < (3,): + contents = contents.decode(preferredencoding()) + res = compat_shlex_split(contents, comments=True) finally: optionf.close() return res @@ -212,10 +214,15 @@ def parseOpts(overrideArguments=None): help='Make all connections via IPv6 (experimental)', ) network.add_option( + '--geo-verification-proxy', + dest='geo_verification_proxy', default=None, metavar='URL', + help='Use this proxy to verify the IP address for some geo-restricted sites. 
' + 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' + ) + network.add_option( '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', - help='Use this proxy to verify the IP address for some Chinese sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' + help=optparse.SUPPRESS_HELP, ) selection = optparse.OptionGroup(parser, 'Video Selection') @@ -809,11 +816,11 @@ def parseOpts(overrideArguments=None): system_conf = [] user_conf = [] else: - system_conf = compat_conf(_readOptions('/etc/youtube-dl.conf')) + system_conf = _readOptions('/etc/youtube-dl.conf') if '--ignore-config' in system_conf: user_conf = [] else: - user_conf = compat_conf(_readUserConf()) + user_conf = _readUserConf() argv = system_conf + user_conf + command_line_conf opts, args = parser.parse_args(argv) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index fa99b0c2a..c1e9eb159 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -363,8 +363,10 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): input_files = [filename] + sub_filenames opts = [ - '-map', '0', - '-c', 'copy', + '-map', '0:v', + '-c:v', 'copy', + '-map', '0:a', + '-c:a', 'copy', # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index fd49d7435..104807242 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -76,7 +76,7 @@ class Socks4Error(ProxyError): CODES = { 91: 'request rejected or failed', - 92: 'request rejected becasue SOCKS server cannot connect to identd on the client', + 92: 'request rejected because SOCKS server cannot connect to identd on the client', 93: 'request rejected because the client program and identd report different 
user-ids' } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index fe175e82c..4c1d0d526 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -110,6 +110,49 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐ،٠itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) +DATE_FORMATS = ( + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%b %d %Y', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %dth %Y %I:%M', + '%Y %m %d', + '%Y-%m-%d', + '%Y/%m/%d', + '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ + '%d-%m-%Y', + '%d.%m.%Y', + '%d.%m.%y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y %H:%M:%S', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ + '%m-%d-%Y', + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', +]) + def preferredencoding(): """Get preferred encoding. @@ -267,9 +310,17 @@ def get_element_by_id(id, html): return get_element_by_attribute('id', id, html) -def get_element_by_attribute(attribute, value, html): +def get_element_by_class(class_name, html): + return get_element_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, escape_value=False) + + +def get_element_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" + value = re.escape(value) if escape_value else value + m = re.search(r'''(?xs) <([a-zA-Z0-9:._-]+) (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? @@ -278,7 +329,7 @@ def get_element_by_attribute(attribute, value, html): \s*> (?P<content>.*?) 
</\1> - ''' % (re.escape(attribute), re.escape(value)), html) + ''' % (re.escape(attribute), value), html) if not m: return None @@ -975,6 +1026,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): https_response = http_response +def extract_timezone(date_str): + m = re.search( + r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -984,20 +1053,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): date_str = re.sub(r'\.[0-9]+', '', date_str) if timezone is None: - m = re.search( - r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + timezone, date_str = extract_timezone(date_str) + try: date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone @@ -1006,6 +1063,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): pass +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" @@ -1014,53 +1075,11 @@ def unified_strdate(date_str, day_first=True): 
upload_date = None # Replace commas date_str = date_str.replace(',', ' ') - # %z (UTC offset) is only supported in python>=3.2 - if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): - date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + _, date_str = extract_timezone(date_str) - format_expressions = [ - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%b %d %Y', - '%b %dst %Y %I:%M', - '%b %dnd %Y %I:%M', - '%b %dth %Y %I:%M', - '%Y %m %d', - '%Y-%m-%d', - '%Y/%m/%d', - '%Y/%m/%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - ] - if day_first: - format_expressions.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d.%m.%y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', - ]) - else: - format_expressions.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', - ]) - for expression in format_expressions: + for expression in date_formats(day_first): try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') except ValueError: @@ -1076,6 +1095,29 @@ def unified_strdate(date_str, day_first=True): return compat_str(upload_date) +def unified_timestamp(date_str, day_first=True): + if date_str is None: + return None + + date_str = date_str.replace(',', ' ') + + pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0) + timezone, date_str = extract_timezone(date_str) + + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + + for expression in date_formats(day_first): + try: + dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta + return calendar.timegm(dt.timetuple()) + except ValueError: + pass + timetuple = 
email.utils.parsedate_tz(date_str) + if timetuple: + return calendar.timegm(timetuple.timetuple()) + + def determine_ext(url, default_ext='unknown_video'): if url is None: return default_ext @@ -1410,6 +1452,8 @@ def shell_quote(args): def smuggle_url(url, data): """ Pass additional data in a URL for internal use. """ + url, idata = unsmuggle_url(url, {}) + data.update(idata) sdata = compat_urllib_parse_urlencode( {'__youtubedl_smuggle': json.dumps(data)}) return url + '#' + sdata @@ -1591,6 +1635,11 @@ class HEADRequest(compat_urllib_request.Request): return 'HEAD' +class PUTRequest(compat_urllib_request.Request): + def get_method(self): + return 'PUT' + + def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: @@ -1626,6 +1675,10 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default +def strip_or_none(v): + return None if v is None else v.strip() + + def parse_duration(s): if not isinstance(s, compat_basestring): return None @@ -1882,7 +1935,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}): req_headers.update(headers) req_data = data or req.data req_url = update_url_query(url or req.get_full_url(), query) - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req_get_method = req.get_method() + if req_get_method == 'HEAD': + req_type = HEADRequest + elif req_get_method == 'PUT': + req_type = PUTRequest + else: + req_type = compat_urllib_request.Request new_req = req_type( req_url, data=req_data, headers=req_headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) @@ -2046,6 +2105,7 @@ def mimetype2ext(mt): return ext _, _, res = mt.rpartition('/') + res = res.lower() return { '3gpp': '3gp', @@ -2057,6 +2117,12 @@ def mimetype2ext(mt): 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', 'x-ms-wmv': 'wmv', + 'mpegurl': 'm3u8', + 'x-mpegurl': 'm3u8', + 'vnd.apple.mpegurl': 'm3u8', + 'dash+xml': 'mpd', + 'f4m': 'f4m', + 'f4m+xml': 
'f4m', }.get(res, res) @@ -2897,3 +2963,7 @@ def parse_m3u8_attributes(attrib): val = val[1:-1] info[key] = val return info + + +def urshift(val, n): + return val >> n if val >= 0 else (val + 0x100000000) >> n diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4a9f162c1..728ad2d50 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.20' +__version__ = '2016.07.09.2' |