diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/animeondemand.py | 160 | ||||
-rw-r--r-- | youtube_dl/extractor/cbc.py | 113 | ||||
-rw-r--r-- | youtube_dl/extractor/comcarcoff.py | 16 | ||||
-rw-r--r-- | youtube_dl/extractor/comedycentral.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 18 | ||||
-rw-r--r-- | youtube_dl/extractor/crackle.py | 95 | ||||
-rw-r--r-- | youtube_dl/extractor/fox.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 19 | ||||
-rw-r--r-- | youtube_dl/extractor/hotstar.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/nbc.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/pbs.py | 61 | ||||
-rw-r--r-- | youtube_dl/extractor/plays.py | 51 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/viddler.py | 50 | ||||
-rw-r--r-- | youtube_dl/extractor/vimeo.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/youku.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 20 |
18 files changed, 581 insertions, 67 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 18951c287..6937f28d3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anysex import AnySexIE from .aol import AolIE @@ -89,6 +90,10 @@ from .camdemy import ( from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE +from .cbc import ( + CBCIE, + CBCPlayerIE, +) from .cbs import CBSIE from .cbsnews import ( CBSNewsIE, @@ -126,6 +131,7 @@ from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .condenast import CondeNastIE from .cracked import CrackedIE +from .crackle import CrackleIE from .criterion import CriterionIE from .crooksandliars import CrooksAndLiarsIE from .crunchyroll import ( @@ -533,6 +539,7 @@ from .planetaplay import PlanetaPlayIE from .pladform import PladformIE from .played import PlayedIE from .playfm import PlayFMIE +from .plays import PlaysTVIE from .playtvak import PlaytvakIE from .playvid import PlayvidIE from .playwire import PlaywireIE diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py new file mode 100644 index 000000000..a7d8daf7b --- /dev/null +++ b/youtube_dl/extractor/animeondemand.py @@ -0,0 +1,160 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + determine_ext, + encode_dict, + ExtractorError, + sanitized_Request, + urlencode_postdata, +) + + +class AnimeOnDemandIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)' + _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' + _APPLY_HTML5_URL = 
'https://www.anime-on-demand.de/html5apply' + _NETRC_MACHINE = 'animeondemand' + _TEST = { + 'url': 'https://www.anime-on-demand.de/anime/161', + 'info_dict': { + 'id': '161', + 'title': 'Grimgar, Ashes and Illusions (OmU)', + 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', + }, + 'playlist_mincount': 4, + } + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._form_hidden_inputs('new_user', login_page) + + login_form.update({ + 'user[login]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + request = sanitized_Request( + post_url, urlencode_postdata(encode_dict(login_form))) + request.add_header('Referer', self._LOGIN_URL) + + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): + error = self._search_regex( + r'<p class="alert alert-danger">(.+?)</p>', + response, 'error', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + anime_id = self._match_id(url) + + webpage = self._download_webpage(url, anime_id) + + if 'data-playlist=' not in webpage: + self._download_webpage( + self._APPLY_HTML5_URL, anime_id, + 'Activating HTML5 beta', 'Unable to apply HTML5 beta') + webpage = self._download_webpage(url, anime_id) + + csrf_token = self._html_search_meta( + 'csrf-token', webpage, 'csrf token', fatal=True) + + anime_title = self._html_search_regex( + 
r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>', + webpage, 'anime name') + anime_description = self._html_search_regex( + r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', + webpage, 'anime description', default=None) + + entries = [] + + for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage): + m = re.search( + r'class="episodebox-title"[^>]+title="Episode (?P<number>\d+) - (?P<title>.+?)"', episode_html) + if not m: + continue + + episode_number = int(m.group('number')) + episode_title = m.group('title') + video_id = 'episode-%d' % episode_number + + common_info = { + 'id': video_id, + 'series': anime_title, + 'episode': episode_title, + 'episode_number': episode_number, + } + + formats = [] + + playlist_url = self._search_regex( + r'data-playlist=(["\'])(?P<url>.+?)\1', + episode_html, 'data playlist', default=None, group='url') + if playlist_url: + request = sanitized_Request( + compat_urlparse.urljoin(url, playlist_url), + headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRF-Token': csrf_token, + 'Referer': url, + 'Accept': 'application/json, text/javascript, */*; q=0.01', + }) + + playlist = self._download_json( + request, video_id, 'Downloading playlist JSON', fatal=False) + if playlist: + playlist = playlist['playlist'][0] + title = playlist['title'] + description = playlist.get('description') + for source in playlist.get('sources', []): + file_ = source.get('file') + if file_ and determine_ext(file_) == 'm3u8': + formats = self._extract_m3u8_formats( + file_, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + if formats: + f = common_info.copy() + f.update({ + 'title': title, + 'description': description, + 'formats': formats, + }) + entries.append(f) + + m = re.search( + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', + episode_html) + if m: + f = common_info.copy() + f.update({ + 'id': '%s-teaser' % f['id'], + 'title': m.group('title'), 
+ 'url': compat_urlparse.urljoin(url, m.group('href')), + }) + entries.append(f) + + return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py new file mode 100644 index 000000000..d8aa31038 --- /dev/null +++ b/youtube_dl/extractor/cbc.py @@ -0,0 +1,113 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class CBCIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)' + _TESTS = [{ + # with mediaId + 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', + 'info_dict': { + 'id': '2682904050', + 'ext': 'flv', + 'title': 'Don Cherry – All-Stars', + 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.', + 'timestamp': 1454475540, + 'upload_date': '20160203', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # with clipId + 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', + 'info_dict': { + 'id': '2487345465', + 'ext': 'flv', + 'title': 'Robin Williams freestyles on 90 Minutes Live', + 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', + 'upload_date': '19700101', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + # multiple iframes + 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', + 'playlist': [{ + 'info_dict': { + 'id': '2680832926', + 'ext': 'flv', + 'title': 'An Eagle\'s-Eye View Off Burrard Bridge', + 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', + 'upload_date': '19700101', + }, + }, { + 'info_dict': 
{ + 'id': '2658915080', + 'ext': 'flv', + 'title': 'Fly like an eagle!', + 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', + 'upload_date': '19700101', + }, + }], + 'params': { + # rtmp download + 'skip_download': True, + }, + }] + + @classmethod + def suitable(cls, url): + return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_init = self._search_regex( + r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage, 'player init', + default=None) + if player_init: + player_info = self._parse_json(player_init, display_id, js_to_json) + media_id = player_info.get('mediaId') + if not media_id: + clip_id = player_info['clipId'] + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] + return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + else: + entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)] + return self.playlist_result(entries) + + +class CBCPlayerIE(InfoExtractor): + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' + _TEST = { + 'url': 'http://www.cbc.ca/player/play/2683190193', + 'info_dict': { + 'id': '2683190193', + 'ext': 'flv', + 'title': 'Gerry Runs a Sweat Shop', + 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', + 'timestamp': 1455067800, + 'upload_date': '20160210', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id, + 'ThePlatformFeed', video_id) 
diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 2efa200b5..7dff68492 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, parse_duration, @@ -14,14 +15,13 @@ class ComCarCoffIE(InfoExtractor): _TESTS = [{ 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', 'info_dict': { - 'id': 'miranda-sings-happy-thanksgiving-miranda', + 'id': '2494164', 'ext': 'mp4', 'upload_date': '20141127', 'timestamp': 1417107600, 'duration': 1232, 'title': 'Happy Thanksgiving Miranda', 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', - 'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg', }, 'params': { 'skip_download': 'requires ffmpeg', @@ -39,15 +39,14 @@ class ComCarCoffIE(InfoExtractor): r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), display_id)['videoData'] - video_id = full_data['activeVideo']['video'] - video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] + display_id = full_data['activeVideo']['video'] + video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id] + video_id = compat_str(video_data['mediaId']) thumbnails = [{ 'url': video_data['images']['thumb'], }, { 'url': video_data['images']['poster'], }] - formats = self._extract_m3u8_formats( - video_data['mediaUrl'], video_id, ext='mp4') timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( video_data.get('pubDate')) @@ -55,6 +54,8 @@ class ComCarCoffIE(InfoExtractor): video_data.get('duration')) return { + '_type': 'url_transparent', + 'url': 'crackle:%s' % video_id, 'id': video_id, 'display_id': display_id, 'title': 
video_data['title'], @@ -62,6 +63,7 @@ class ComCarCoffIE(InfoExtractor): 'timestamp': timestamp, 'duration': duration, 'thumbnails': thumbnails, - 'formats': formats, + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 3e4bd10b6..055c9eec5 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -16,11 +16,11 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (video-clips|episodes|cc-studios|video-collections|full-episodes) + (video-clips|episodes|cc-studios|video-collections|full-episodes|shows) /(?P<title>.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - _TEST = { + _TESTS = [{ 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', 'info_dict': { @@ -29,7 +29,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor): 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', 'description': 'After a certain point, breastfeeding becomes c**kblocking.', }, - } + }, { + 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'only_matching': True, + }] class ComedyCentralShowsIE(MTVServicesInfoExtractor): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1143f6dbb..444d412d9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1186,12 +1186,13 @@ class InfoExtractor(object): http_count = 0 m3u8_count = 0 - src_urls = [] + srcs = [] videos = smil.findall(self._xpath_ns('.//video', namespace)) for video in videos: src 
= video.get('src') - if not src: + if not src or src in srcs: continue + srcs.append(src) bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) filesize = int_or_none(video.get('size') or video.get('fileSize')) @@ -1223,9 +1224,7 @@ class InfoExtractor(object): continue src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) - if src_url in src_urls: - continue - src_urls.append(src_url) + src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( @@ -1436,12 +1435,16 @@ class InfoExtractor(object): base_url = base_url_e.text + base_url if re.match(r'^https?://', base_url): break - if not re.match(r'^https?://', base_url): + if mpd_base_url and not re.match(r'^https?://', base_url): + if not mpd_base_url.endswith('/') and not base_url.startswith('/'): + mpd_base_url += '/' base_url = mpd_base_url + base_url representation_id = representation_attrib.get('id') lang = representation_attrib.get('lang') + url_el = representation.find(_add_ns('BaseURL')) + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) f = { - 'format_id': mpd_id or representation_id, + 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), @@ -1452,6 +1455,7 @@ class InfoExtractor(object): 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, + 'filesize': filesize, } representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: diff --git a/youtube_dl/extractor/crackle.py 
b/youtube_dl/extractor/crackle.py new file mode 100644 index 000000000..79238cce7 --- /dev/null +++ b/youtube_dl/extractor/crackle.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class CrackleIE(InfoExtractor): + _VALID_URL = r'(?:crackle:|https?://(?:www\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' + _TEST = { + 'url': 'http://www.crackle.com/the-art-of-more/2496419', + 'info_dict': { + 'id': '2496419', + 'ext': 'mp4', + 'title': 'Heavy Lies the Head', + 'description': 'md5:bb56aa0708fe7b9a4861535f15c3abca', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + # extracted from http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx + _SUBTITLE_SERVER = 'http://web-us-az.crackle.com' + _UPLYNK_OWNER_ID = 'e8773f7770a44dbd886eee4fca16a66b' + _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' + + # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx + _MEDIA_FILE_SLOTS = { + 'c544.flv': { + 'width': 544, + 'height': 306, + }, + '360p.mp4': { + 'width': 640, + 'height': 360, + }, + '480p.mp4': { + 'width': 852, + 'height': 478, + }, + '480p_1mbps.mp4': { + 'width': 852, + 'height': 478, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + item = self._download_xml( + 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, + video_id).find('i') + title = item.attrib['t'] + + thumbnail = None + subtitles = {} + formats = self._extract_m3u8_formats( + 'http://content.uplynk.com/ext/%s/%s.m3u8' % (self._UPLYNK_OWNER_ID, video_id), + video_id, 'mp4', m3u8_id='hls', fatal=None) + path = item.attrib.get('p') + if path: + thumbnail = self._THUMBNAIL_TEMPLATE % path + http_base_url = 'http://ahttp.crackle.com/' + path + for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): + formats.append({ 
+ 'url': http_base_url + mfs_path, + 'format_id': 'http-' + mfs_path.split('.')[0], + 'width': mfs_info['width'], + 'height': mfs_info['height'], + }) + for cc in item.findall('cc'): + locale = cc.attrib.get('l') + v = cc.attrib.get('v') + if locale and v: + if locale not in subtitles: + subtitles[locale] = [] + subtitles[locale] = [{ + 'url': '%s/%s%s_%s.xml' % (self._SUBTITLE_SERVER, path, locale, v), + 'ext': 'ttml', + }] + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': title, + 'description': item.attrib.get('d'), + 'duration': int(item.attrib.get('r'), 16) if item.attrib.get('r') else None, + 'series': item.attrib.get('sn'), + 'season_number': int_or_none(item.attrib.get('se')), + 'episode_number': int_or_none(item.attrib.get('ep')), + 'thumbnail': thumbnail, + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 4a8acd53d..fa05af50d 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -9,6 +9,7 @@ class FOXIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.fox.com/watch/255180355939/7684182528', + 'md5': 'ebd296fcc41dd4b19f8115d8461a3165', 'info_dict': { 'id': '255180355939', 'ext': 'mp4', @@ -17,10 +18,6 @@ class FOXIE(InfoExtractor): 'duration': 129, }, 'add_ie': ['ThePlatform'], - 'params': { - # m3u8 download - 'skip_download': True, - }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bf61ab2e7..45adbb7a3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -224,6 +224,20 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # MPD from http://dash-mse-test.appspot.com/media.html + { + 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', + 'md5': 
'4b57baab2e30d6eb3a6a09f0ba57ef53', + 'info_dict': { + 'id': 'car-20120827-manifest', + 'ext': 'mp4', + 'title': 'car-20120827-manifest', + 'formats': 'mincount:9', + }, + 'params': { + 'format': 'bestvideo', + }, + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -1302,7 +1316,8 @@ class GenericIE(InfoExtractor): return { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), - 'formats': self._parse_mpd_formats(doc, video_id), + 'formats': self._parse_mpd_formats( + doc, video_id, mpd_base_url=url.rpartition('/')[0]), } except compat_xml_parse_error: pass @@ -1413,7 +1428,7 @@ class GenericIE(InfoExtractor): # Look for embedded Dailymotion player matches = re.findall( - r'<(?:embed|iframe)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) if matches: return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index a7c3ce4ab..f05d765d6 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -10,8 +10,8 @@ from ..utils import ( class HotStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/.*?[/-](?P<id>\d{10})' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' + _TESTS = [{ 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', 'info_dict': { 'id': '1000076273', @@ -26,7 +26,13 @@ class HotStarIE(InfoExtractor): # m3u8 download 'skip_download': True, } - } + }, { + 'url': 
'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583', + 'only_matching': True, + }, { + 'url': 'http://www.hotstar.com/1000000515', + 'only_matching': True, + }] _GET_CONTENT_TEMPLATE = 'http://account.hotstar.com/AVS/besc?action=GetAggregatedContentDetails&channel=PCTV&contentId=%s' _GET_CDN_TEMPLATE = 'http://getcdn.hotstar.com/AVS/besc?action=GetCDN&asJson=Y&channel=%s&id=%s&type=%s' diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 18d01f423..2202cfa33 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -57,7 +57,7 @@ class NBCIE(InfoExtractor): { # This video has expired but with an escaped embedURL 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', - 'skip': 'Expired' + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 97e8ffc97..cca012953 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, determine_ext, int_or_none, + js_to_json, strip_jsonp, unified_strdate, US_RATINGS, @@ -199,7 +201,7 @@ class PBSIE(InfoExtractor): 'id': '2365006249', 'ext': 'mp4', 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', - 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', + 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', 'duration': 3190, }, 'params': { @@ -213,7 +215,7 @@ class PBSIE(InfoExtractor): 'id': '2365297690', 'ext': 'mp4', 'title': 'FRONTLINE - Losing Iraq', - 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', 'duration': 5050, }, 'params': { @@ -227,7 +229,7 @@ class PBSIE(InfoExtractor): 'id': '2201174722', 'ext': 'mp4', 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', 
- 'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28', + 'description': 'md5:95a19f568689d09a166dff9edada3301', 'duration': 801, }, }, @@ -237,8 +239,8 @@ class PBSIE(InfoExtractor): 'info_dict': { 'id': '2365297708', 'ext': 'mp4', - 'description': 'md5:68d87ef760660eb564455eb30ca464fe', 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', + 'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b', 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -278,7 +280,7 @@ class PBSIE(InfoExtractor): 'display_id': 'player', 'ext': 'mp4', 'title': 'American Experience - Death and the Civil War, Chapter 1', - 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.', + 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d', 'duration': 682, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -287,20 +289,19 @@ class PBSIE(InfoExtractor): }, }, { - 'url': 'http://video.pbs.org/video/2365367186/', + 'url': 'http://www.pbs.org/video/2365245528/', 'info_dict': { - 'id': '2365367186', - 'display_id': '2365367186', + 'id': '2365245528', + 'display_id': '2365245528', 'ext': 'mp4', - 'title': 'To Catch A Comet - Full Episode', - 'description': 'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. 
The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.', - 'duration': 3342, + 'title': 'FRONTLINE - United States of Secrets (Part One)', + 'description': 'md5:55756bd5c551519cc4b7703e373e217e', + 'duration': 6851, 'thumbnail': 're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, # requires ffmpeg }, - 'skip': 'Expired', }, { # Video embedded in iframe containing angle brackets as attribute's value (e.g. @@ -312,7 +313,7 @@ class PBSIE(InfoExtractor): 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business', 'ext': 'mp4', 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business", - 'description': 'md5:61db2ddf27c9912f09c241014b118ed1', + 'description': 'md5:54033c6baa1f9623607c6e2ed245888b', 'duration': 1480, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -328,7 +329,7 @@ class PBSIE(InfoExtractor): 'display_id': 'the-atomic-artists', 'ext': 'mp4', 'title': 'FRONTLINE - The Atomic Artists', - 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'description': 'md5:1a2481e86b32b2e12ec1905dd473e2c1', 'duration': 723, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -365,10 +366,14 @@ class PBSIE(InfoExtractor): webpage, 'upload date', default=None)) # tabbed frontline videos - tabbed_videos = re.findall( - r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage) - if tabbed_videos: - return tabbed_videos, presumptive_id, upload_date + MULTI_PART_REGEXES = ( + r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', + r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', + ) + for p in MULTI_PART_REGEXES: + tabbed_videos = re.findall(p, webpage) + if tabbed_videos: + return tabbed_videos, presumptive_id, upload_date MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed @@ -432,9 +437,21 @@ class PBSIE(InfoExtractor): for vid_id in video_id] return self.playlist_result(entries, display_id) - info = self._download_json( - 
'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, - display_id) + try: + info = self._download_json( + 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, + display_id, 'Downloading video info JSON') + except ExtractorError as e: + if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404: + raise + # videoInfo API may not work for some videos, fallback to portalplayer API + player = self._download_webpage( + 'http://player.pbs.org/portalplayer/%s' % video_id, display_id) + info = self._parse_json( + self._search_regex( + r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + player, 'video data', default='{}'), + display_id, transform_source=js_to_json, fatal=False) formats = [] for encoding_name in ('recommended_encoding', 'alternate_encoding'): @@ -493,7 +510,7 @@ class PBSIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': info['title'], - 'description': info['program'].get('description'), + 'description': info.get('description') or info.get('program', {}).get('description'), 'thumbnail': info.get('image_url'), 'duration': int_or_none(info.get('duration')), 'age_limit': age_limit, diff --git a/youtube_dl/extractor/plays.py b/youtube_dl/extractor/plays.py new file mode 100644 index 000000000..c3c38cf4a --- /dev/null +++ b/youtube_dl/extractor/plays.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class PlaysTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?plays\.tv/video/(?P<id>[0-9a-f]{18})' + _TEST = { + 'url': 'http://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall', + 'md5': 'dfeac1198506652b5257a62762cec7bc', + 'info_dict': { + 'id': '56af17f56c95335490', + 'ext': 'mp4', + 'title': 'When you outplay the Azir wall', + 'description': 'Posted by Bjergsen', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + content = self._parse_json( + self._search_regex( + r'R\.bindContent\(({.+?})\);', webpage, + 'content'), video_id)['content'] + mpd_url, sources = re.search( + r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>', + content).groups() + formats = self._extract_mpd_formats( + self._proto_relative_url(mpd_url), video_id, mpd_id='DASH') + for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources): + formats.append({ + 'url': self._proto_relative_url(format_url), + 'format_id': 'http-' + format_id, + 'height': int_or_none(height), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + } diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 10f2cad55..755f816ff 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -20,7 +20,6 @@ from ..utils import ( int_or_none, sanitized_Request, unsmuggle_url, - url_basename, xpath_with_ns, ) @@ -283,8 +282,8 @@ class ThePlatformFeedIE(ThePlatformBaseIE): first_video_id = None duration = None for item in entry['media$content']: - smil_url = item['plfile$url'] + '&format=SMIL&Tracking=true&Embedded=true&formats=MPEG4,F4M' - cur_video_id = url_basename(smil_url) + smil_url = item['plfile$url'] + '&format=SMIL&mbr=true' + cur_video_id = ThePlatformIE._match_id(smil_url) if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 40ffbad2a..6bfbd4d85 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -1,6 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import 
( + compat_urllib_parse, + compat_urlparse, +) from ..utils import ( float_or_none, int_or_none, @@ -12,10 +16,10 @@ class ViddlerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)' _TESTS = [{ 'url': 'http://www.viddler.com/v/43903784', - 'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4', + 'md5': '9eee21161d2c7f5b39690c3e325fab2f', 'info_dict': { 'id': '43903784', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Video Made Easy', 'description': 'md5:6a697ebd844ff3093bd2e82c37b409cd', 'uploader': 'viddler', @@ -29,10 +33,10 @@ class ViddlerIE(InfoExtractor): } }, { 'url': 'http://www.viddler.com/v/4d03aad9/', - 'md5': 'faa71fbf70c0bee7ab93076fd007f4b0', + 'md5': 'f12c5a7fa839c47a79363bfdf69404fb', 'info_dict': { 'id': '4d03aad9', - 'ext': 'mp4', + 'ext': 'ts', 'title': 'WALL-TO-GORTAT', 'upload_date': '20150126', 'uploader': 'deadspin', @@ -42,10 +46,10 @@ class ViddlerIE(InfoExtractor): } }, { 'url': 'http://www.viddler.com/player/221ebbbd/0/', - 'md5': '0defa2bd0ea613d14a6e9bd1db6be326', + 'md5': '740511f61d3d1bb71dc14a0fe01a1c10', 'info_dict': { 'id': '221ebbbd', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'LETeens-Grammar-snack-third-conditional', 'description': ' ', 'upload_date': '20140929', @@ -54,16 +58,42 @@ class ViddlerIE(InfoExtractor): 'view_count': int, 'comment_count': int, } + }, { + # secret protected + 'url': 'http://www.viddler.com/v/890c0985?secret=34051570', + 'info_dict': { + 'id': '890c0985', + 'ext': 'mp4', + 'title': 'Complete Property Training - Traineeships', + 'description': ' ', + 'upload_date': '20130606', + 'uploader': 'TiffanyBowtell', + 'timestamp': 1370496993, + 'view_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) - json_url = ( - 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' % - video_id) + query = { + 'video_id': video_id, + 'key': 
'v0vhrt7bg2xq1vyxhkct', + } + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + secret = qs.get('secret', [None])[0] + if secret: + query['secret'] = secret + headers = {'Referer': 'http://static.cdn-ec.viddler.com/js/arpeggio/v2/embed.html'} - request = sanitized_Request(json_url, None, headers) + request = sanitized_Request( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?%s' + % compat_urllib_parse.urlencode(query), None, headers) data = self._download_json(request, video_id)['video'] formats = [] diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 2389e7f0f..6a8f9b49d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -57,7 +57,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( - r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', + r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)', webpage, 'login token', group='xsrft') vuid = self._search_regex( r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1', diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index f767fa15f..49687371a 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -229,6 +229,9 @@ class YoukuIE(InfoExtractor): if error_note is not None and '因版权原因无法观看此视频' in error_note: raise ExtractorError( 'Youku said: Sorry, this video is available in China only', expected=True) + elif error_note and '该视频被设为私密' in error_note: + raise ExtractorError( + 'Youku said: Sorry, this video is private', expected=True) else: msg = 'Youku server reported error %i' % error.get('code') if error_note is not None: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b9a91dea2..18f7d37f4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -375,7 +375,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): IE_NAME = 
'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9', + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', @@ -441,7 +441,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } }, { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY', + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -705,6 +705,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { + # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536) + 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', + 'info_dict': { + 'id': 'gVfLd0zydlo', + 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', + }, + 'playlist_count': 2, + }, + { 'url': 'http://vid.plus/FlRa-iH7PGw', 'only_matching': True, }, @@ -1196,9 +1205,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not self._downloader.params.get('noplaylist'): entries = [] feed_ids = [] - multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0]) + multifeed_metadata_list = video_info['multifeed_metadata_list'][0] for feed in multifeed_metadata_list.split(','): - feed_data = compat_parse_qs(feed) + # Unquote should take place after split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/rg3/youtube-dl/issues/8536) + feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', |