diff options
Diffstat (limited to 'youtube_dl/extractor')
26 files changed, 681 insertions, 427 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7408f68d6..99da52d96 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -347,7 +347,6 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .musicvault import MusicVaultIE from .muzu import MuzuTVIE from .mwave import MwaveIE from .myspace import MySpaceIE, MySpaceAlbumIE @@ -368,6 +367,9 @@ from .nbc import ( from .ndr import ( NDRIE, NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, ) from .ndtv import NDTVIE from .netzkino import NetzkinoIE @@ -437,7 +439,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index 47313fba8..34095501c 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -15,7 +15,7 @@ class AcademicEarthCourseIE(InfoExtractor): 'title': 'Laws of Nature', 'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.', }, - 'playlist_count': 4, + 'playlist_count': 3, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 39335b827..4327c2f61 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -156,7 +156,7 @@ class AdultSwimIE(InfoExtractor): xpath_text(idoc, './/trt', 'segment duration').strip()) formats = [] - file_els = idoc.findall('.//files/file') + file_els = idoc.findall('.//files/file') or idoc.findall('./files/file') for file_el in file_els: bitrate = file_el.attrib.get('bitrate') diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py index 611ad1e9d..f8e70f4e5 100644 --- a/youtube_dl/extractor/airmozilla.py +++ b/youtube_dl/extractor/airmozilla.py @@ -20,14 +20,14 @@ class AirMozillaIE(InfoExtractor): 'id': '6x4q2w', 'ext': 'mp4', 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', - 'thumbnail': 're:https://\w+\.cloudfront\.net/6x4q2w/poster\.jpg\?t=\d+', + 'thumbnail': 're:https?://vid\.ly/(?P<id>[0-9a-z-]+)/poster', 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', 'timestamp': 1422487800, 'upload_date': '20150128', 'location': 'SFO Commons', 'duration': 3780, 'view_count': int, - 'categories': ['Main'], + 'categories': ['Main', 'Privacy'], } } diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 612708e25..184a14a4f 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -16,6 +16,7 @@ class AlJazeeraIE(InfoExtractor): 'uploader': 'Al Jazeera English', }, 'add_ie': ['Brightcove'], + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index dda583680..e857e66f4 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -17,55 +17,81 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)' - - _TESTS = [ - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P<id>[^/#?]+)/*(?:[#?].*)?$' + _TESTS = [{ + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'info_dict': { + 'id': '61924494876951776', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace', + 'description': 'md5:fe93f6eda372d150759d11644ebbfb4a', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3350, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'info_dict': { + 'id': '61924494876844374', + 'ext': 'mp4', + 'title': 'První republika: Zpěvačka z Dupárny Bobina', + 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 88.4, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ 'info_dict': { - 'id': '214411058091220', + 'id': '61924494876844842', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace', - 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, }, - }, - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + }, { 'info_dict': { - 'id': '14716', + 'id': '61924494877068022', 'ext': 'mp4', - 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'title': 'Queer: Bogotart (Queer)', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 88.4, - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'duration': 1558.3, }, + }], + 'params': { + # m3u8 download + 'skip_download': True, }, - ] + }] def _real_extract(self, url): url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + playlist_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, playlist_id) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s</p>' % NOT_AVAILABLE_STRING in webpage: raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') - episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') + typ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') data = { 'playlist[0][type]': typ, @@ -83,7 +109,7 @@ class CeskaTelevizeIE(InfoExtractor): req.add_header('X-Requested-With', 'XMLHttpRequest') req.add_header('Referer', url) - playlistpage = self._download_json(req, video_id) + playlistpage = self._download_json(req, playlist_id) playlist_url = playlistpage['url'] if playlist_url == 'error_region': @@ -92,33 +118,43 @@ class CeskaTelevizeIE(InfoExtractor): req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist = self._download_json(req, video_id) - - item = playlist['playlist'][0] - formats = [] - for format_id, stream_url in item['streamUrls'].items(): - formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4')) - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - duration = float_or_none(item.get('duration')) - thumbnail = item.get('previewImageUrl') - - subtitles = {} - subs = item.get('subtitles') - if subs: - subtitles = self.extract_subtitles(episode_id, subs) - - return { - 'id': episode_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + playlist_title = self._og_search_title(webpage) + playlist_description = self._og_search_description(webpage) + + playlist = self._download_json(req, playlist_id)['playlist'] + playlist_len = len(playlist) + + entries = [] + for item in playlist: + formats = [] + for format_id, stream_url in item['streamUrls'].items(): + formats.extend(self._extract_m3u8_formats( + stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + self._sort_formats(formats) + + item_id = item.get('id') or item['assetId'] + title = item['title'] + + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + if item.get('type') == 'VOD': + subs = item.get('subtitles') + if subs: + subtitles = self.extract_subtitles(episode_id, subs) + + entries.append({ + 'id': item_id, + 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), + 'description': playlist_description if playlist_len == 1 else None, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + }) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) def _get_subtitles(self, episode_id, subs): original_subtitles = self._download_webpage( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 39cef9c5b..d694e818e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -731,9 +731,10 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): + html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) hidden_inputs = {} - for input in re.findall(r'<input([^>]+)>', html): - if not re.search(r'type=(["\'])hidden\1', input): + for input in re.findall(r'(?i)<input([^>]+)>', html): + if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue name = re.search(r'name=(["\'])(?P<value>.+?)\1', input) if not name: @@ -746,7 +747,7 @@ class InfoExtractor(object): def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, + r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index ce123482e..95952bc29 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -31,7 +31,23 @@ from ..aes import ( ) -class CrunchyrollIE(InfoExtractor): +class CrunchyrollBaseIE(InfoExtractor): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) + else compat_urllib_request.Request(url_or_request)) + # Accept-Language must be set explicitly to accept any language to avoid issues + # similar to https://github.com/rg3/youtube-dl/issues/6797. + # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction + # should be imposed or not (from what I can see it just takes the first language + # ignoring the priority and requires it to correspond the IP). By the way this causes + # Crunchyroll to not work in georestriction cases in some browsers that don't place + # the locale lang first in header. However allowing any language seems to workaround the issue. + request.add_header('Accept-Language', '*') + return super(CrunchyrollBaseIE, self)._download_webpage( + request, video_id, note, errnote, fatal, tries, timeout, encoding) + + +class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ @@ -259,10 +275,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: video_description = None - video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) + video_upload_date = self._html_search_regex( + [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], + webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) if video_upload_date: video_upload_date = unified_strdate(video_upload_date) - video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL) + video_uploader = self._html_search_regex( + r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage, + 'video_uploader', fatal=False) playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = compat_urllib_request.Request(playerdata_url) @@ -330,7 +350,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text } -class CrunchyrollShowPlaylistIE(InfoExtractor): +class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = "crunchyroll:playlist" _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$' diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 688dfc2f7..a1ee51568 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -79,7 +79,7 @@ class EaglePlatformIE(InfoExtractor): age_limit = 0 if age_restriction == 'allow_all' else 18 m3u8_data = self._download_json( - media['sources']['secure_m3u8']['auto'], + self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:'), video_id, 'Downloading m3u8 JSON') formats = self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 5c1137e94..a406945e8 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -10,6 +10,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, ) @@ -55,10 +56,7 @@ class FC2IE(InfoExtractor): 'Submit': ' Login ', } - # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode - # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') request = compat_urllib_request.Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 953ec32c3..ec748ed9f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1797,7 +1797,7 @@ class GenericIE(InfoExtractor): found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)) if not found: # HTML5 video - found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage) + found = re.findall(r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage) if not found: REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index f006f0cb1..d23e3eac1 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -10,15 +10,16 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + encode_dict, int_or_none, ) class GorillaVidIE(InfoExtractor): - IE_DESC = 'GorillaVid.in, daclips.in, movpod.in, fastvideo.in and realvid.net' + IE_DESC = 'GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net and filehoot.com' _VALID_URL = r'''(?x) https?://(?P<host>(?:www\.)? - (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net))/ + (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com))/ (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? ''' @@ -67,13 +68,22 @@ class GorillaVidIE(InfoExtractor): }, { 'url': 'http://movpod.in/0wguyyxi1yca', 'only_matching': True, + }, { + 'url': 'http://filehoot.com/3ivfabn7573c.html', + 'info_dict': { + 'id': '3ivfabn7573c', + 'ext': 'mp4', + 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', + 'thumbnail': 're:http://.*\.jpg', + } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id) + url = 'http://%s/%s' % (mobj.group('host'), video_id) + webpage = self._download_webpage(url, video_id) if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: raise ExtractorError('Video %s does not exist' % video_id, expected=True) @@ -87,7 +97,7 @@ class GorillaVidIE(InfoExtractor): if countdown: self._sleep(countdown, video_id) - post = compat_urllib_parse.urlencode(fields) + post = compat_urllib_parse.urlencode(encode_dict(fields)) req = compat_urllib_request.Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -95,7 +105,7 @@ class GorillaVidIE(InfoExtractor): webpage = self._download_webpage(req, video_id, 'Downloading video page') title = self._search_regex( - [r'style="z-index: [0-9]+;">([^<]+)</span>', r'>Watch (.+) '], + [r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', r'>Watch (.+) '], webpage, 'title', default=None) or self._og_search_title(webpage) video_url = self._search_regex( r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url') diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 1077846f2..fa233377d 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -202,6 +202,7 @@ class KuwoSingerIE(InfoExtractor): 'title': 'Ali', }, 'playlist_mincount': 95, + 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index d7ab6a9ae..f088ab9e2 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -18,12 +18,12 @@ class TechTVMITIE(InfoExtractor): _TEST = { 'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', - 'md5': '1f8cb3e170d41fd74add04d3c9330e5f', + 'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7', 'info_dict': { 'id': '25418', 'ext': 'mp4', - 'title': 'MIT DNA Learning Center Set', - 'description': 'md5:82313335e8a8a3f243351ba55bc1b474', + 'title': 'MIT DNA and Protein Sets', + 'description': 'md5:46f5c69ce434f0a97e7c628cc142802d', }, } @@ -33,8 +33,8 @@ class TechTVMITIE(InfoExtractor): 'http://techtv.mit.edu/videos/%s' % video_id, video_id) clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page) - base_url = self._search_regex( - r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url') + base_url = self._proto_relative_url(self._search_regex( + r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:') formats_json = self._search_regex( r'bitrates: (\[.+?\])', raw_page, 'video formats') formats_mit = json.loads(formats_json) diff --git a/youtube_dl/extractor/musicvault.py b/youtube_dl/extractor/musicvault.py deleted file mode 100644 index 0e46ac7c1..000000000 --- a/youtube_dl/extractor/musicvault.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class MusicVaultIE(InfoExtractor): - _VALID_URL = r'https?://www\.musicvault\.com/(?P<uploader_id>[^/?#]*)/video/(?P<display_id>[^/?#]*)_(?P<id>[0-9]+)\.html' - _TEST = { - 'url': 'http://www.musicvault.com/the-allman-brothers-band/video/straight-from-the-heart_1010863.html', - 'md5': '3adcbdb3dcc02d647539e53f284ba171', - 'info_dict': { - 'id': '1010863', - 'ext': 'mp4', - 'uploader_id': 'the-allman-brothers-band', - 'title': 'Straight from the Heart', - 'duration': 244, - 'uploader': 'The Allman Brothers Band', - 'thumbnail': 're:^https?://.*/thumbnail/.*', - 'upload_date': '20131219', - 'location': 'Capitol Theatre (Passaic, NJ)', - 'description': 'Listen to The Allman Brothers Band perform Straight from the Heart at Capitol Theatre (Passaic, NJ) on Dec 16, 1981', - 'timestamp': int, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) - - thumbnail = self._search_regex( - r'<meta itemprop="thumbnail" content="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - data_div = self._search_regex( - r'(?s)<div class="data">(.*?)</div>', webpage, 'data fields') - uploader = self._html_search_regex( - r'<h1.*?>(.*?)</h1>', data_div, 'uploader', fatal=False) - title = self._html_search_regex( - r'<h2.*?>(.*?)</h2>', data_div, 'title') - location = self._html_search_regex( - r'<h4.*?>(.*?)</h4>', data_div, 'location', fatal=False) - - kaltura_id = self._search_regex( - r'<div id="video-detail-player" data-kaltura-id="([^"]+)"', - webpage, 'kaltura ID') - wid = self._search_regex(r'/wid/_([0-9]+)/', webpage, 'wid') - - return { - 'id': mobj.group('id'), - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (wid, kaltura_id), - 'ie_key': 'Kaltura', - 'display_id': display_id, - 'uploader_id': mobj.group('uploader_id'), - 'thumbnail': thumbnail, - 'description': self._html_search_meta('description', webpage), - 'location': location, - 'title': title, - 'uploader': uploader, - } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 79a13958b..e3cc6fde8 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -1,130 +1,380 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( - ExtractorError, + determine_ext, int_or_none, + parse_iso8601, qualities, - parse_duration, ) class NDRBaseIE(InfoExtractor): def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self._extract_embed(webpage, display_id) + - page = self._download_webpage(url, video_id, 'Downloading page') +class NDRIE(NDRBaseIE): + IE_NAME = 'ndr' + IE_DESC = 'NDR.de - Norddeutscher Rundfunk' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[^/?#]+),[\da-z]+\.html' + _TESTS = [{ + # httpVideo, same content id + 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'md5': '6515bc255dc5c5f8c85bbc38e035a659', + 'info_dict': { + 'id': 'hafengeburtstag988', + 'display_id': 'Party-Poette-und-Parade', + 'ext': 'mp4', + 'title': 'Party, Pötte und Parade', + 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', + 'uploader': 'ndrtv', + 'timestamp': 1431108900, + 'upload_date': '20150510', + 'duration': 3498, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideo, different content id + 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', + 'md5': '1043ff203eab307f0c51702ec49e9a71', + 'info_dict': { + 'id': 'osna272', + 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', + 'ext': 'mp4', + 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', + 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', + 'uploader': 'ndrtv', + 'timestamp': 1442059200, + 'upload_date': '20150912', + 'duration': 510, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpAudio, same content id + 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'info_dict': { + 'id': 'audio51535', + 'display_id': 'La-Valette-entgeht-der-Hinrichtung', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'uploader': 'ndrinfo', + 'timestamp': 1290626100, + 'upload_date': '20140729', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }] - title = self._og_search_title(page).strip() - description = self._og_search_description(page) - if description: - description = description.strip() + def _extract_embed(self, webpage, display_id): + embed_url = self._html_search_meta( + 'embedURL', webpage, 'embed URL', fatal=True) + description = self._search_regex( + r'<p[^>]+itemprop="description">([^<]+)</p>', + webpage, 'description', fatal=False) + timestamp = parse_iso8601( + self._search_regex( + r'<span itemprop="datePublished" content="([^"]+)">', + webpage, 'upload date', fatal=False)) + return { + '_type': 'url_transparent', + 'url': embed_url, + 'display_id': display_id, + 'description': description, + 'timestamp': timestamp, + } - duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', default=None)) - if not duration: - duration = parse_duration(self._html_search_regex( - r'(<span class="min">\d+</span>:<span class="sec">\d+</span>)', - page, 'duration', default=None)) - formats = [] +class NJoyIE(NDRBaseIE): + IE_NAME = 'njoy' + IE_DESC = 'N-JOY' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P<id>[^/?#]+),[\da-z]+\.html' + _TESTS = [{ + # httpVideo, same content id + 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', + 'md5': 'cb63be60cd6f9dd75218803146d8dc67', + 'info_dict': { + 'id': 'comedycontest2480', + 'display_id': 'Benaissa-beim-NDR-Comedy-Contest', + 'ext': 'mp4', + 'title': 'Benaissa beim NDR Comedy Contest', + 'description': 'md5:f057a6c4e1c728b10d33b5ffd36ddc39', + 'uploader': 'ndrtv', + 'upload_date': '20141129', + 'duration': 654, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideo, different content id + 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', + 'md5': '417660fffa90e6df2fda19f1b40a64d8', + 'info_dict': { + 'id': 'dockville882', + 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', + 'ext': 'mp4', + 'title': '"Ich hab noch nie" mit Felix Jaehn', + 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', + 'uploader': 'njoy', + 'upload_date': '20150822', + 'duration': 211, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _extract_embed(self, webpage, display_id): + video_id = self._search_regex( + r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id') + description = self._search_regex( + r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', + webpage, 'description', fatal=False) + return { + '_type': 'url_transparent', + 'ie_key': 'NDREmbedBase', + 'url': 'ndr:%s' % video_id, + 'display_id': display_id, + 'description': description, + } - mp3_url = re.search(r'''\{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page) - if mp3_url: - formats.append({ - 'url': mp3_url.group('audio'), - 'format_id': 'mp3', - }) - thumbnail = None +class NDREmbedBaseIE(InfoExtractor): + IE_NAME = 'ndr:embed:base' + _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)' + _TESTS = [{ + 'url': 'ndr:soundcheck3366', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/soundcheck3366-ppjson.json', + 'only_matching': True, + }] - video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.(lo|hi|hq)\.mp4', type:"video/mp4"},''', page) - if video_url: - thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page) - if thumbnails: - quality_key = qualities(['xs', 's', 'm', 'l', 'xl']) - largest = max(thumbnails, key=lambda thumb: quality_key(thumb[1])) - thumbnail = 'http://www.ndr.de' + largest[0] + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_s') - for format_id in 'lo', 'hi', 'hq': - formats.append({ - 'url': '%s.%s.mp4' % (video_url.group('video'), format_id), - 'format_id': format_id, - }) + ppjson = self._download_json( + 'http://www.ndr.de/%s-ppjson.json' % video_id, video_id) - if not formats: - raise ExtractorError('No media links available for %s' % video_id) + playlist = ppjson['playlist'] + + formats = [] + quality_key = qualities(('xs', 's', 'm', 'l', 'xl')) + + for format_id, f in playlist.items(): + src = f.get('src') + if not src: + continue + ext = determine_ext(src, None) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, m3u8_id='hls', entry_protocol='m3u8_native')) + else: + quality = f.get('quality') + ff = { + 'url': src, + 'format_id': quality or format_id, + 'quality': quality_key(quality), + } + type_ = f.get('type') + if type_ and type_.split('/')[0] == 'audio': + ff['vcodec'] = 'none' + ff['ext'] = ext or 'mp3' + formats.append(ff) + self._sort_formats(formats) + + config = playlist['config'] + + live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive'] + title = config['title'] + if live: + title = self._live_title(title) + uploader = ppjson.get('config', {}).get('branding') + upload_date = ppjson.get('config', {}).get('publicationDate') + duration = int_or_none(config.get('duration')) + + thumbnails = [{ + 'id': thumbnail.get('quality') or thumbnail_id, + 'url': thumbnail['src'], + 'preference': quality_key(thumbnail.get('quality')), + } for thumbnail_id, thumbnail in config.get('poster', {}).items() if thumbnail.get('src')] return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'is_live': live, + 'uploader': uploader if uploader != '-' else None, + 'upload_date': upload_date[0:8] if upload_date else None, 'duration': duration, + 'thumbnails': thumbnails, 'formats': formats, } -class NDRIE(NDRBaseIE): - IE_NAME = 'ndr' - IE_DESC = 'NDR.de - Mediathek' - _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html' - - _TESTS = [ - { - 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', - 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c', - 'note': 'Video file', - 'info_dict': { - 'id': '25866', - 'ext': 'mp4', - 'title': 'Kartoffeltage in der Lewitz', - 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8', - 'duration': 166, - }, - 'skip': '404 Not found', - }, - { - 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', - 'md5': 'dadc003c55ae12a5d2f6bd436cd73f59', - 'info_dict': { - 'id': '988', - 'ext': 'mp4', - 'title': 'Party, Pötte und Parade', - 'description': 'Hunderttausende feiern zwischen Speicherstadt und St. Pauli den 826. Hafengeburtstag. Die NDR Sondersendung zeigt die schönsten und spektakulärsten Bilder vom Auftakt.', - 'duration': 3498, - }, - }, - { - 'url': 'http://www.ndr.de/info/audio51535.html', - 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', - 'note': 'Audio file', - 'info_dict': { - 'id': '51535', - 'ext': 'mp3', - 'title': 'La Valette entgeht der Hinrichtung', - 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', - 'duration': 884, - } - } - ] - +class NDREmbedIE(NDREmbedBaseIE): + IE_NAME = 'ndr:embed' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _TESTS = [{ + 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', + 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', + 'info_dict': { + 'id': 'ndraktuell28488', + 'ext': 'mp4', + 'title': 'Norddeutschland begrüßt Flüchtlinge', + 'is_live': False, + 'uploader': 'ndrtv', + 'upload_date': '20150907', + 'duration': 132, + }, + }, { + 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', + 'md5': '002085c44bae38802d94ae5802a36e78', + 'info_dict': { + 'id': 'soundcheck3366', + 'ext': 'mp4', + 'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen', + 'is_live': False, + 'uploader': 'ndr2', + 'upload_date': '20150912', + 'duration': 3554, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/info/audio51535-player.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'info_dict': { + 'id': 'audio51535', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'is_live': False, + 'uploader': 'ndrinfo', + 'upload_date': '20140729', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html', + 'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c', + 'info_dict': { + 'id': 'visite11010', + 'ext': 'mp4', + 'title': 'Visite - die ganze Sendung', + 'is_live': False, + 'uploader': 'ndrtv', + 'upload_date': '20150902', + 'duration': 3525, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideoLive + 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', + 'info_dict': { + 'id': 'livestream217', + 'ext': 'flv', + 'title': 're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + 'upload_date': '20150910', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.ndr.de/ndrkultur/audio255020-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html', + 'only_matching': True, + }, { + 'url': 'http://www.ndr.de/fernsehen/doku952-player.html', + 'only_matching': True, + }] -class NJoyIE(NDRBaseIE): - IE_NAME = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/.+?(?P<id>\d+)\.html' - _TEST = { - 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', - 'md5': 'cb63be60cd6f9dd75218803146d8dc67', +class NJoyEmbedIE(NDREmbedBaseIE): + IE_NAME = 'njoy:embed' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _TESTS = [{ + # httpVideo + 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', + 'md5': '8483cbfe2320bd4d28a349d62d88bd74', 'info_dict': { - 'id': '2480', + 'id': 'doku948', 'ext': 'mp4', - 'title': 'Benaissa beim NDR Comedy Contest', - 'description': 'Von seinem sehr "behaarten" Leben lässt sich Benaissa trotz aller Schwierigkeiten nicht unterkriegen.', - 'duration': 654, - } - } + 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', + 'is_live': False, + 'upload_date': '20150807', + 'duration': 1011, + }, + }, { + # httpAudio + 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', + 'md5': 'd989f80f28ac954430f7b8a48197188a', + 'info_dict': { + 'id': 'stefanrichter100', + 'ext': 'mp3', + 'title': 'Interview mit einem Augenzeugen', + 'is_live': False, + 'uploader': 'njoy', + 'upload_date': '20150909', + 'duration': 140, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpAudioLive, no explicit ext + 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', + 'info_dict': { + 'id': 'webradioweltweit100', + 'ext': 'mp3', + 'title': 're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + 'uploader': 'njoy', + 'upload_date': '20150810', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html', + 'only_matching': True, + }, { + 'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html', + 'only_matching': True, + }, { + 'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 0f8aa5ada..bda1cff05 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -12,6 +12,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, int_or_none, parse_duration, @@ -100,10 +101,7 @@ class NiconicoIE(InfoExtractor): 'mail': username, 'password': password, } - # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode - # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8') request = compat_urllib_request.Request( 'https://secure.nicovideo.jp/secure/login', login_data) login_results = self._download_webpage( diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py deleted file mode 100644 index d2ceedd01..000000000 --- a/youtube_dl/extractor/openfilm.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - parse_iso8601, - parse_age_limit, - int_or_none, -) - - -class OpenFilmIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P<id>.+)' - _TEST = { - 'url': 'http://www.openfilm.com/videos/human-resources-remastered', - 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37', - 'info_dict': { - 'id': '32736', - 'display_id': 'human-resources-remastered', - 'ext': 'mp4', - 'title': 'Human Resources (Remastered)', - 'description': 'Social Engineering in the 20th Century.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 7164, - 'timestamp': 1334756988, - 'upload_date': '20120418', - 'uploader_id': '41117', - 'view_count': int, - 'age_limit': 0, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - player = compat_urllib_parse_unquote_plus( - self._og_search_video_url(webpage)) - - video = json.loads(self._search_regex( - r'\bp=({.+?})(?:&|$)', player, 'video JSON')) - - video_url = '%s1.mp4' % video['location'] - video_id = video.get('video_id') - display_id = video.get('alias') or display_id - title = video.get('title') - description = video.get('description') - thumbnail = video.get('main_thumb') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('dt_published'), ' ') - uploader_id = video.get('user_id') - view_count = int_or_none(video.get('views_count')) - age_limit = parse_age_limit(video.get('age_limit')) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'age_limit': age_limit, - } diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 84fe71aef..5f7ac4b35 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -2,14 +2,12 @@ from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor +from ..compat import compat_str class TudouIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])' _TESTS = [{ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', 'md5': '140a49ed444bd22f93330985d8475fcb', @@ -27,41 +25,41 @@ class TudouIE(InfoExtractor): 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', 'thumbnail': 're:^https?://.*\.jpg$', } + }, { + 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html', + 'only_matching': True, }] _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - def _url_for_id(self, id, quality=None): - info_url = "http://v2.tudou.com/f?id=" + str(id) + def _url_for_id(self, video_id, quality=None): + info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) if quality: info_url += '&hd' + quality - webpage = self._download_webpage(info_url, id, "Opening the info webpage") - final_url = self._html_search_regex('>(.+?)</f>', webpage, 'video url') + xml_data = self._download_xml(info_url, video_id, "Opening the info XML page") + final_url = xml_data.text return final_url def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) - if m and m.group(1): - return { - '_type': 'url', - 'url': 'youku:' + m.group(1), - 'ie_key': 'Youku' - } + youku_vcode = self._search_regex( + r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None) + if youku_vcode: + return self.url_result('youku:' + youku_vcode, ie='Youku') title = self._search_regex( - r",kw:\s*['\"](.+?)[\"']", webpage, 'title') + r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title') thumbnail_url = self._search_regex( - r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) + r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False) player_url = self._search_regex( - r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", + r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]', webpage, 'player URL', default=self._PLAYER_URL) - segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') - segments = json.loads(segs_json) + segments = self._parse_json(self._search_regex( + r'segs: \'([^\']+)\'', webpage, 'segments'), video_id) # It looks like the keys are the arguments that have to be passed as # the hd field in the request url, we pick the higher # Also, filter non-number qualities (see issue #3643). diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 157bb74fe..9a794e609 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, float_or_none, - str_to_int, + parse_iso8601, ) @@ -12,18 +14,41 @@ class VidmeIE(InfoExtractor): _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'https://vid.me/QNB', - 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', + 'md5': 'c62f1156138dc3323902188c5b5a8bd6', 'info_dict': { 'id': 'QNB', 'ext': 'mp4', 'title': 'Fishing for piranha - the easy way', 'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', - 'duration': 119.92, + 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1406313244, 'upload_date': '20140725', + 'age_limit': 0, + 'duration': 119.92, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }, { + 'url': 'https://vid.me/Gc6M', + 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', + 'info_dict': { + 'id': 'Gc6M', + 'ext': 'mp4', + 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1441211642, + 'upload_date': '20150902', + 'uploader': 'SunshineM', + 'uploader_id': '3552827', + 'age_limit': 0, + 'duration': 223.72, 'view_count': int, 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, }, }, { # tests uploader field @@ -33,63 +58,94 @@ class VidmeIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Carver', 'description': 'md5:e9c24870018ae8113be936645b93ba3c', - 'duration': 97.859999999999999, + 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1433203629, 'upload_date': '20150602', 'uploader': 'Thomas', - 'thumbnail': 're:^https?://.*\.jpg', + 'uploader_id': '109747', + 'age_limit': 0, + 'duration': 97.859999999999999, 'view_count': int, 'like_count': int, + 'comment_count': int, }, 'params': { 'skip_download': True, }, }, { - # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching + # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching 'url': 'https://vid.me/e/Wmur', - 'only_matching': True, + 'info_dict': { + 'id': 'Wmur', + 'ext': 'mp4', + 'title': 'naked smoking & stretching', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1430931613, + 'upload_date': '20150506', + 'uploader': 'naked-yogi', + 'uploader_id': '1638622', + 'age_limit': 18, + 'duration': 653.26999999999998, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): - url = url.replace('vid.me/e/', 'vid.me/') video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'<source src="([^"]+)"', webpage, 'video URL') + try: + response = self._download_json( + 'https://api.vid.me/videoByUrl/%s' % video_id, video_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + response = self._parse_json(e.cause.read(), video_id) + else: + raise + + error = response.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), expected=True) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage, default='') - thumbnail = self._og_search_thumbnail(webpage) - timestamp = int_or_none(self._og_search_property( - 'updated_time', webpage, fatal=False)) - width = int_or_none(self._og_search_property( - 'video:width', webpage, fatal=False)) - height = int_or_none(self._og_search_property( - 'video:height', webpage, fatal=False)) - duration = float_or_none(self._html_search_regex( - r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) - view_count = str_to_int(self._html_search_regex( - r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', - webpage, 'view count', fatal=False)) - like_count = str_to_int(self._html_search_regex( - r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', - webpage, 'like count', fatal=False)) - uploader = self._html_search_regex( - 'class="video_author_username"[^>]*>([^<]+)', - webpage, 'uploader', default=None) + video = response['video'] + + formats = [{ + 'format_id': f.get('type'), + 'url': f['uri'], + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + } for f in video.get('formats', []) if f.get('uri')] + self._sort_formats(formats) + + title = video['title'] + description = video.get('description') + thumbnail = video.get('thumbnail_url') + timestamp = parse_iso8601(video.get('date_created'), ' ') + uploader = video.get('user', {}).get('username') + uploader_id = video.get('user', {}).get('user_id') + age_limit = 18 if video.get('nsfw') is True else 0 + duration = float_or_none(video.get('duration')) + view_count = int_or_none(video.get('view_count')) + like_count = int_or_none(video.get('likes_count')) + comment_count = int_or_none(video.get('comment_count')) return { 'id': video_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'age_limit': age_limit, 'timestamp': timestamp, - 'width': width, - 'height': height, 'duration': duration, 'view_count': view_count, 'like_count': like_count, - 'uploader': uploader, + 'comment_count': comment_count, + 'formats': formats, } diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 15377097e..c76c20614 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor @@ -91,31 +92,27 @@ class VierVideosIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) program = mobj.group('program') - webpage = self._download_webpage(url, program) - page_id = mobj.group('page') if page_id: page_id = int(page_id) start_page = page_id - last_page = start_page + 1 playlist_id = '%s-page%d' % (program, page_id) else: start_page = 0 - last_page = int(self._search_regex( - r'videos\?page=(\d+)">laatste</a>', - webpage, 'last page', default=0)) + 1 playlist_id = program entries = [] - for current_page_id in range(start_page, last_page): + for current_page_id in itertools.count(start_page): current_page = self._download_webpage( 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), program, - 'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage + 'Downloading page %d' % (current_page_id + 1)) page_entries = [ self.url_result('http://www.vier.be' + video_url, 'Vier') for video_url in re.findall( r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] entries.extend(page_entries) + if page_id or '>Meer<' not in current_page: + break return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 72eb010f8..ec8b99998 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -19,25 +19,25 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Sinkhole of bureaucracy', }, 'playlist': [{ - 'md5': '79132cc09ec5309fa590ae46e4cc31bc', + 'md5': 'b9be794ceb56c7267d410a13f99d801a', 'info_dict': { 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', 'title': 'Breaking Points: The Paper Mine', - 'duration': 1287, + 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', 'uploader': 'The Washington Post', 'timestamp': 1395527908, 'upload_date': '20140322', }, }, { - 'md5': 'e1d5734c06865cc504ad99dc2de0d443', + 'md5': '1fff6a689d8770966df78c8cb6c8c17c', 'info_dict': { 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.', - 'duration': 2217, + 'duration': 2220, 'timestamp': 1395528005, 'upload_date': '20140322', 'uploader': 'The Washington Post', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index f69d46a28..e4f50e64c 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,40 +1,33 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/' + _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', - 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1', + 'md5': 'ee21217ffd66d058e8b16be340b74883', 'info_dict': { 'id': 'maruexhausted', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Maru is exhausted.', 'description': 'md5:57e099e857c0a4ea312542b684a869b8', } }, { - # youtube video 'url': 'http://www.wimp.com/clowncar/', + 'md5': '4e2986c793694b55b37cf92521d12bb4', 'info_dict': { - 'id': 'cG4CEr2aiSg', + 'id': 'clowncar', 'ext': 'mp4', - 'title': 'Basset hound clown car...incredible!', - 'description': 'md5:8d228485e0719898c017203f900b3a35', - 'uploader': 'Gretchen Hoey', - 'uploader_id': 'gretchenandjeff1', - 'upload_date': '20140303', + 'title': 'It\'s like a clown car.', + 'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2', }, - 'add_ie': ['Youtube'], }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 5aac8adb3..8bbac54e2 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -19,7 +19,7 @@ class XuiteIE(InfoExtractor): _TESTS = [{ # Audio 'url': 'http://vlog.xuite.net/play/RGkzc1ZULTM4NjA5MTQuZmx2', - 'md5': '63a42c705772aa53fd4c1a0027f86adf', + 'md5': 'e79284c87b371424885448d11f6398c8', 'info_dict': { 'id': '3860914', 'ext': 'mp3', diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index f9afbdbab..fca5ddc69 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -101,7 +101,7 @@ class YahooIE(InfoExtractor): } }, { 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', - 'md5': '67010fdf3a08d290e060a4dd96baa07b', + 'md5': '88e209b417f173d86186bef6e4d1f160', 'info_dict': { 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', 'ext': 'mp4', @@ -144,6 +144,17 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://tw.news.yahoo.com/-100120367.html', 'only_matching': True, + }, { + # Query result is embedded in webpage, but explicit request to video API fails with geo restriction + 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', + 'info_dict': { + 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', + 'ext': 'mp4', + 'title': 'Communitary - Community Episode 1: Ladders', + 'description': 'md5:8fc39608213295748e1e289807838c97', + 'duration': 1646, + }, } ] @@ -171,6 +182,19 @@ class YahooIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Query result is often embedded in webpage as JSON. Sometimes explicit requests + # to video API results in a failure with geo restriction reason therefore using + # embedded query result when present sounds reasonable. + config_json = self._search_regex( + r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)', + webpage, 'videoplayer applet', default=None) + if config_json: + config = self._parse_json(config_json, display_id, fatal=False) + if config: + sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') + if sapi: + return self._extract_info(display_id, sapi, webpage) + items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, default=None) @@ -190,22 +214,10 @@ class YahooIE(InfoExtractor): video_id = info['id'] return self._get_info(video_id, display_id, webpage) - def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ - 'protocol': 'http', - 'region': region, - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - - info = query_result['query']['results']['mediaObj'][0] + def _extract_info(self, display_id, query, webpage): + info = query['query']['results']['mediaObj'][0] meta = info.get('meta') + video_id = info.get('id') if not meta: msg = info['status'].get('msg') @@ -231,6 +243,9 @@ class YahooIE(InfoExtractor): 'ext': 'flv', }) else: + if s.get('format') == 'm3u8_playlist': + format_info['protocol'] = 'm3u8_native' + format_info['ext'] = 'mp4' format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url formats.append(format_info) @@ -264,6 +279,21 @@ class YahooIE(InfoExtractor): 'subtitles': subtitles, } + def _get_info(self, video_id, display_id, webpage): + region = self._search_regex( + r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', + webpage, 'region', fatal=False, default='US') + data = compat_urllib_parse.urlencode({ + 'protocol': 'http', + 'region': region, + }) + query_url = ( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + '{id}?{data}'.format(id=video_id, data=data)) + query_result = self._download_json( + query_url, display_id, 'Downloading video info') + return self._extract_info(display_id, query_result, webpage) + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 030ec70ca..97ce36550 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,6 +26,7 @@ from ..compat import ( ) from ..utils import ( clean_html, + encode_dict, ExtractorError, float_or_none, get_element_by_attribute, @@ -111,10 +112,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode - # chokes on unicode - login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') + login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii') req = compat_urllib_request.Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( @@ -147,8 +145,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'TrustDevice': 'on', }) - tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items()) - tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii') + tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii') tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( @@ -1838,8 +1835,8 @@ class YoutubeShowIE(InfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ - 'url': 'http://www.youtube.com/show/airdisasters', - 'playlist_mincount': 3, + 'url': 'https://www.youtube.com/show/airdisasters', + 'playlist_mincount': 5, 'info_dict': { 'id': 'airdisasters', 'title': 'Air Disasters', @@ -1850,7 +1847,7 @@ class YoutubeShowIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') webpage = self._download_webpage( - url, playlist_id, 'Downloading show webpage') + 'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) |