diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/crunchyroll.py | 28 | ||||
-rw-r--r-- | youtube_dl/extractor/musicvault.py | 63 | ||||
-rw-r--r-- | youtube_dl/extractor/openfilm.py | 70 | ||||
-rw-r--r-- | youtube_dl/extractor/vidme.py | 126 | ||||
-rw-r--r-- | youtube_dl/extractor/vier.py | 13 | ||||
-rw-r--r-- | youtube_dl/extractor/washingtonpost.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/wimp.py | 23 | ||||
-rw-r--r-- | youtube_dl/extractor/xuite.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/yahoo.py | 60 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 6 |
12 files changed, 184 insertions, 222 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84830c885..44ab7ce3c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -347,7 +347,6 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .musicvault import MusicVaultIE from .muzu import MuzuTVIE from .mwave import MwaveIE from .myspace import MySpaceIE, MySpaceAlbumIE @@ -435,7 +434,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5eeeda08d..d694e818e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -731,8 +731,9 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): + html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) hidden_inputs = {} - for input in re.findall(r'<input([^>]+)>', html): + for input in re.findall(r'(?i)<input([^>]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue name = re.search(r'name=(["\'])(?P<value>.+?)\1', input) @@ -746,7 +747,7 @@ class InfoExtractor(object): def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, + r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index ce123482e..95952bc29 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -31,7 +31,23 @@ from ..aes import ( ) -class CrunchyrollIE(InfoExtractor): +class CrunchyrollBaseIE(InfoExtractor): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) + else compat_urllib_request.Request(url_or_request)) + # Accept-Language must be set explicitly to accept any language to avoid issues + # similar to https://github.com/rg3/youtube-dl/issues/6797. + # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction + # should be imposed or not (from what I can see it just takes the first language + # ignoring the priority and requires it to correspond the IP). By the way this causes + # Crunchyroll to not work in georestriction cases in some browsers that don't place + # the locale lang first in header. However allowing any language seems to workaround the issue. + request.add_header('Accept-Language', '*') + return super(CrunchyrollBaseIE, self)._download_webpage( + request, video_id, note, errnote, fatal, tries, timeout, encoding) + + +class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ @@ -259,10 +275,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: video_description = None - video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) + video_upload_date = self._html_search_regex( + [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], + webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) if video_upload_date: video_upload_date = unified_strdate(video_upload_date) - video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL) + video_uploader = self._html_search_regex( + r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage, + 'video_uploader', fatal=False) playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = compat_urllib_request.Request(playerdata_url) @@ -330,7 +350,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text } -class CrunchyrollShowPlaylistIE(InfoExtractor): +class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = "crunchyroll:playlist" _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$' diff --git a/youtube_dl/extractor/musicvault.py b/youtube_dl/extractor/musicvault.py deleted file mode 100644 index 0e46ac7c1..000000000 --- a/youtube_dl/extractor/musicvault.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class MusicVaultIE(InfoExtractor): - _VALID_URL = r'https?://www\.musicvault\.com/(?P<uploader_id>[^/?#]*)/video/(?P<display_id>[^/?#]*)_(?P<id>[0-9]+)\.html' - _TEST = { - 'url': 'http://www.musicvault.com/the-allman-brothers-band/video/straight-from-the-heart_1010863.html', - 'md5': '3adcbdb3dcc02d647539e53f284ba171', - 'info_dict': { - 'id': '1010863', - 'ext': 'mp4', - 'uploader_id': 'the-allman-brothers-band', - 'title': 'Straight from the Heart', - 'duration': 244, - 'uploader': 'The Allman Brothers Band', - 'thumbnail': 're:^https?://.*/thumbnail/.*', - 'upload_date': '20131219', - 'location': 'Capitol Theatre (Passaic, NJ)', - 'description': 'Listen to The Allman Brothers Band perform Straight from the Heart at Capitol Theatre (Passaic, NJ) on Dec 16, 1981', - 'timestamp': int, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) - - thumbnail = self._search_regex( - r'<meta itemprop="thumbnail" content="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - data_div = self._search_regex( - r'(?s)<div class="data">(.*?)</div>', webpage, 'data fields') - uploader = self._html_search_regex( - r'<h1.*?>(.*?)</h1>', data_div, 'uploader', fatal=False) - title = self._html_search_regex( - r'<h2.*?>(.*?)</h2>', data_div, 'title') - location = self._html_search_regex( - r'<h4.*?>(.*?)</h4>', data_div, 'location', fatal=False) - - kaltura_id = self._search_regex( - r'<div id="video-detail-player" data-kaltura-id="([^"]+)"', - webpage, 'kaltura ID') - wid = self._search_regex(r'/wid/_([0-9]+)/', webpage, 'wid') - - return { - 'id': mobj.group('id'), - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (wid, kaltura_id), - 'ie_key': 'Kaltura', - 'display_id': display_id, - 'uploader_id': mobj.group('uploader_id'), - 'thumbnail': thumbnail, - 'description': self._html_search_meta('description', webpage), - 'location': location, - 'title': title, - 'uploader': uploader, - } diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py deleted file mode 100644 index d2ceedd01..000000000 --- a/youtube_dl/extractor/openfilm.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - parse_iso8601, - parse_age_limit, - int_or_none, -) - - -class OpenFilmIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P<id>.+)' - _TEST = { - 'url': 'http://www.openfilm.com/videos/human-resources-remastered', - 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37', - 'info_dict': { - 'id': '32736', - 'display_id': 'human-resources-remastered', - 'ext': 'mp4', - 'title': 'Human Resources (Remastered)', - 'description': 'Social Engineering in the 20th Century.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 7164, - 'timestamp': 1334756988, - 'upload_date': '20120418', - 'uploader_id': '41117', - 'view_count': int, - 'age_limit': 0, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - player = compat_urllib_parse_unquote_plus( - self._og_search_video_url(webpage)) - - video = json.loads(self._search_regex( - r'\bp=({.+?})(?:&|$)', player, 'video JSON')) - - video_url = '%s1.mp4' % video['location'] - video_id = video.get('video_id') - display_id = video.get('alias') or display_id - title = video.get('title') - description = video.get('description') - thumbnail = video.get('main_thumb') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('dt_published'), ' ') - uploader_id = video.get('user_id') - view_count = int_or_none(video.get('views_count')) - age_limit = parse_age_limit(video.get('age_limit')) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'age_limit': age_limit, - } diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 157bb74fe..9a794e609 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, float_or_none, - str_to_int, + parse_iso8601, ) @@ -12,18 +14,41 @@ class VidmeIE(InfoExtractor): _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://vid.me/QNB', - 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', + 'md5': 'c62f1156138dc3323902188c5b5a8bd6', 'info_dict': { 'id': 'QNB', 'ext': 'mp4', 'title': 'Fishing for piranha - the easy way', 'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', - 'duration': 119.92, + 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1406313244, 'upload_date': '20140725', + 'age_limit': 0, + 'duration': 119.92, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }, { + 'url': 'https://vid.me/Gc6M', + 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', + 'info_dict': { + 'id': 'Gc6M', + 'ext': 'mp4', + 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1441211642, + 'upload_date': '20150902', + 'uploader': 'SunshineM', + 'uploader_id': '3552827', + 'age_limit': 0, + 'duration': 223.72, 'view_count': int, 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, }, }, { # tests uploader field @@ -33,63 +58,94 @@ class VidmeIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Carver', 'description': 'md5:e9c24870018ae8113be936645b93ba3c', - 'duration': 97.859999999999999, + 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1433203629, 'upload_date': '20150602', 'uploader': 'Thomas', - 'thumbnail': 're:^https?://.*\.jpg', + 'uploader_id': '109747', + 'age_limit': 0, + 'duration': 97.859999999999999, 'view_count': int, 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, }, }, { - # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching + # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching 'url': 'https://vid.me/e/Wmur', - 'only_matching': True, + 'info_dict': { + 'id': 'Wmur', + 'ext': 'mp4', + 'title': 'naked smoking & stretching', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1430931613, + 'upload_date': '20150506', + 'uploader': 'naked-yogi', + 'uploader_id': '1638622', + 'age_limit': 18, + 'duration': 653.26999999999998, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): - url = url.replace('vid.me/e/', 'vid.me/') video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'<source src="([^"]+)"', webpage, 'video URL') + try: + response = self._download_json( + 'https://api.vid.me/videoByUrl/%s' % video_id, video_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + response = self._parse_json(e.cause.read(), video_id) + else: + raise + + error = response.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), expected=True) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage, default='') - thumbnail = self._og_search_thumbnail(webpage) - timestamp = int_or_none(self._og_search_property( - 'updated_time', webpage, fatal=False)) - width = int_or_none(self._og_search_property( - 'video:width', webpage, fatal=False)) - height = int_or_none(self._og_search_property( - 'video:height', webpage, fatal=False)) - duration = float_or_none(self._html_search_regex( - r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) - view_count = str_to_int(self._html_search_regex( - r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', - webpage, 'view count', fatal=False)) - like_count = str_to_int(self._html_search_regex( - r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', - webpage, 'like count', fatal=False)) - uploader = self._html_search_regex( - 'class="video_author_username"[^>]*>([^<]+)', - webpage, 'uploader', default=None) + video = response['video'] + + formats = [{ + 'format_id': f.get('type'), + 'url': f['uri'], + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + } for f in video.get('formats', []) if f.get('uri')] + self._sort_formats(formats) + + title = video['title'] + description = video.get('description') + thumbnail = video.get('thumbnail_url') + timestamp = parse_iso8601(video.get('date_created'), ' ') + uploader = video.get('user', {}).get('username') + uploader_id = video.get('user', {}).get('user_id') + age_limit = 18 if video.get('nsfw') is True else 0 + duration = float_or_none(video.get('duration')) + view_count = int_or_none(video.get('view_count')) + like_count = int_or_none(video.get('likes_count')) + comment_count = int_or_none(video.get('comment_count')) return { 'id': video_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'age_limit': age_limit, 'timestamp': timestamp, - 'width': width, - 'height': height, 'duration': duration, 'view_count': view_count, 'like_count': like_count, - 'uploader': uploader, + 'comment_count': comment_count, + 'formats': formats, } diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 15377097e..c76c20614 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor @@ -91,31 +92,27 @@ class VierVideosIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) program = mobj.group('program') - webpage = self._download_webpage(url, program) - page_id = mobj.group('page') if page_id: page_id = int(page_id) start_page = page_id - last_page = start_page + 1 playlist_id = '%s-page%d' % (program, page_id) else: start_page = 0 - last_page = int(self._search_regex( - r'videos\?page=(\d+)">laatste</a>', - webpage, 'last page', default=0)) + 1 playlist_id = program entries = [] - for current_page_id in range(start_page, last_page): + for current_page_id in itertools.count(start_page): current_page = self._download_webpage( 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), program, - 'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage + 'Downloading page %d' % (current_page_id + 1)) page_entries = [ self.url_result('http://www.vier.be' + video_url, 'Vier') for video_url in re.findall( r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] entries.extend(page_entries) + if page_id or '>Meer<' not in current_page: + break return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 72eb010f8..ec8b99998 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -19,25 +19,25 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Sinkhole of bureaucracy', }, 'playlist': [{ - 'md5': '79132cc09ec5309fa590ae46e4cc31bc', + 'md5': 'b9be794ceb56c7267d410a13f99d801a', 'info_dict': { 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', 'title': 'Breaking Points: The Paper Mine', - 'duration': 1287, + 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', 'uploader': 'The Washington Post', 'timestamp': 1395527908, 'upload_date': '20140322', }, }, { - 'md5': 'e1d5734c06865cc504ad99dc2de0d443', + 'md5': '1fff6a689d8770966df78c8cb6c8c17c', 'info_dict': { 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.', - 'duration': 2217, + 'duration': 2220, 'timestamp': 1395528005, 'upload_date': '20140322', 'uploader': 'The Washington Post', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index f69d46a28..e4f50e64c 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,40 +1,33 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/' + _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', - 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1', + 'md5': 'ee21217ffd66d058e8b16be340b74883', 'info_dict': { 'id': 'maruexhausted', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Maru is exhausted.', 'description': 'md5:57e099e857c0a4ea312542b684a869b8', } }, { - # youtube video 'url': 'http://www.wimp.com/clowncar/', + 'md5': '4e2986c793694b55b37cf92521d12bb4', 'info_dict': { - 'id': 'cG4CEr2aiSg', + 'id': 'clowncar', 'ext': 'mp4', - 'title': 'Basset hound clown car...incredible!', - 'description': 'md5:8d228485e0719898c017203f900b3a35', - 'uploader': 'Gretchen Hoey', - 'uploader_id': 'gretchenandjeff1', - 'upload_date': '20140303', + 'title': 'It\'s like a clown car.', + 'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2', }, - 'add_ie': ['Youtube'], }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 5aac8adb3..8bbac54e2 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -19,7 +19,7 @@ class XuiteIE(InfoExtractor): _TESTS = [{ # Audio 'url': 'http://vlog.xuite.net/play/RGkzc1ZULTM4NjA5MTQuZmx2', - 'md5': '63a42c705772aa53fd4c1a0027f86adf', + 'md5': 'e79284c87b371424885448d11f6398c8', 'info_dict': { 'id': '3860914', 'ext': 'mp3', diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index b8579b573..fca5ddc69 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -144,6 +144,17 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://tw.news.yahoo.com/-100120367.html', 'only_matching': True, + }, { + # Query result is embedded in webpage, but explicit request to video API fails with geo restriction + 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'info_dict': { + 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', + 'ext': 'mp4', + 'title': 'Communitary - Community Episode 1: Ladders', + 'description': 'md5:8fc39608213295748e1e289807838c97', + 'duration': 1646, + }, } ] @@ -171,6 +182,19 @@ class YahooIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Query result is often embedded in webpage as JSON. Sometimes explicit requests + # to video API results in a failure with geo restriction reason therefore using + # embedded query result when present sounds reasonable. + config_json = self._search_regex( + r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)', + webpage, 'videoplayer applet', default=None) + if config_json: + config = self._parse_json(config_json, display_id, fatal=False) + if config: + sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') + if sapi: + return self._extract_info(display_id, sapi, webpage) + items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, default=None) @@ -190,22 +214,10 @@ class YahooIE(InfoExtractor): video_id = info['id'] return self._get_info(video_id, display_id, webpage) - def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ - 'protocol': 'http', - 'region': region, - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - - info = query_result['query']['results']['mediaObj'][0] + def _extract_info(self, display_id, query, webpage): + info = query['query']['results']['mediaObj'][0] meta = info.get('meta') + video_id = info.get('id') if not meta: msg = info['status'].get('msg') @@ -231,6 +243,9 @@ class YahooIE(InfoExtractor): 'ext': 'flv', }) else: + if s.get('format') == 'm3u8_playlist': + format_info['protocol'] = 'm3u8_native' + format_info['ext'] = 'mp4' format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url formats.append(format_info) @@ -264,6 +279,21 @@ class YahooIE(InfoExtractor): 'subtitles': subtitles, } + def _get_info(self, video_id, display_id, webpage): + region = self._search_regex( + r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', + webpage, 'region', fatal=False, default='US') + data = compat_urllib_parse.urlencode({ + 'protocol': 'http', + 'region': region, + }) + query_url = ( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + '{id}?{data}'.format(id=video_id, data=data)) + query_result = self._download_json( + query_url, display_id, 'Downloading video info') + return self._extract_info(display_id, query_result, webpage) + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 231c86c37..97ce36550 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1835,8 +1835,8 @@ class YoutubeShowIE(InfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ - 'url': 'http://www.youtube.com/show/airdisasters', - 'playlist_mincount': 3, + 'url': 'https://www.youtube.com/show/airdisasters', + 'playlist_mincount': 5, 'info_dict': { 'id': 'airdisasters', 'title': 'Air Disasters', @@ -1847,7 +1847,7 @@ class YoutubeShowIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') webpage = self._download_webpage( - url, playlist_id, 'Downloading show webpage') + 'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) |