diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/brightcove.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/drbonanza.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 22 | ||||
-rw-r--r-- | youtube_dl/extractor/infoq.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/lynda.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/noco.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/onionstudios.py | 74 | ||||
-rw-r--r-- | youtube_dl/extractor/soundcloud.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/thesixtyone.py | 18 | ||||
-rw-r--r-- | youtube_dl/extractor/vk.py | 25 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 7 |
12 files changed, 154 insertions, 32 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3b906b880..59068a8b8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -388,6 +388,7 @@ from .nytimes import ( from .nuvid import NuvidIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .onionstudios import OnionStudiosIE from .ooyala import ( OoyalaIE, OoyalaExternalIE, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d768f99e6..4721c2293 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -13,6 +13,7 @@ from ..compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..utils import ( determine_ext, @@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor): try: object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) - except xml.etree.ElementTree.ParseError: + except compat_xml_parse_error: return fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py index 7626219ba..8b98b013a 100644 --- a/youtube_dl/extractor/drbonanza.py +++ b/youtube_dl/extractor/drbonanza.py @@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517', - 'md5': 'fe330252ddea607635cf2eb2c99a0af3', 'info_dict': { 'id': '65517', 'ext': 'mp4', @@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor): 'upload_date': '20110120', 'duration': 3664, }, + 'params': { + 'skip_download': True, # requires rtmp + }, }, { 'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410', 'md5': '6dfe039417e76795fb783c52da3de11d', @@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor): 'format_id': file['Type'].replace('Video', ''), 'preference': preferencemap.get(file['Type'], -10), }) + if format['url'].startswith('rtmp'): + rtmp_url = format['url'] + format['rtmp_live'] = True # --resume does not work + if '/bonanza/' in rtmp_url: + format['play_path'] = rtmp_url.split('/bonanza/')[1] formats.append(format) elif file['Type'] == "Thumb": thumbnail = file['Location'] @@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor): description = '%s\n%s\n%s\n' % ( info['Description'], info['Actors'], info['Colophon']) - for f in formats: - f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/') - f['url'] = f['url'].replace('mp4:bonanza', 'bonanza') self._sort_formats(formats) display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5c03fddc6..42e4e7035 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -46,6 +46,7 @@ from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE +from .onionstudios import OnionStudiosIE class GenericIE(InfoExtractor): @@ -836,6 +837,18 @@ class GenericIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpe?g$', } }, + # OnionStudios embed + { + 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', + 'info_dict': { + 'id': '2855', + 'ext': 'mp4', + 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', + 'thumbnail': 're:^https?://.*\.jpe?g$', + 'uploader': 'ClickHole', + 'uploader_id': 'clickhole', + } + }, # AdobeTVVideo embed { 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', @@ -1014,7 +1027,9 @@ class GenericIE(InfoExtractor): } if not self._downloader.params.get('test', False) and not is_intentional: - self._downloader.report_warning('Falling back on generic information extractor.') + force = self._downloader.params.get('force_generic_extractor', False) + self._downloader.report_warning( + '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) if not full_response: request = compat_urllib_request.Request(url) @@ -1530,6 +1545,11 @@ class GenericIE(InfoExtractor): if dmcloud_url: return self.url_result(dmcloud_url, 'DailymotionCloud') + # Look for OnionStudios embeds + onionstudios_url = OnionStudiosIE._extract_url(webpage) + if onionstudios_url: + return self.url_result(onionstudios_url) + # Look for AdobeTVVideo embeds mobj = re.search( r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index f25f43664..117a7faf6 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -9,9 +9,9 @@ from ..compat import ( class InfoQIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', 'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2', 'info_dict': { @@ -20,7 +20,10 @@ class InfoQIE(InfoExtractor): 'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', 'title': 'A Few of My Favorite [Python] Things', }, - } + }, { + 'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index cfd3b14f4..a00f6e5e5 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -30,13 +30,13 @@ class LyndaBaseIE(InfoExtractor): return login_form = { - 'username': username, - 'password': password, + 'username': username.encode('utf-8'), + 'password': password.encode('utf-8'), 'remember': 'false', 'stayPut': 'false' } request = compat_urllib_request.Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) login_page = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -65,7 +65,7 @@ class LyndaBaseIE(InfoExtractor): 'stayPut': 'false', } request = compat_urllib_request.Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form)) + self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8')) login_page = self._download_webpage( request, None, 'Confirming log in and log out from another device') diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 5bbd2dcf6..a53e27b27 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -195,7 +195,7 @@ class NocoIE(InfoExtractor): if episode_number: title += ' #' + compat_str(episode_number) if episode: - title += ' - ' + episode + title += ' - ' + compat_str(episode) description = show.get('show_resume') or show.get('family_resume') diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py new file mode 100644 index 000000000..8fa507dec --- /dev/null +++ b/youtube_dl/extractor/onionstudios.py @@ -0,0 +1,74 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class OnionStudiosIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P<id>\d+)(?!-)' + + _TESTS = [{ + 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', + 'md5': 'd4851405d31adfadf71cd7a487b765bb', + 'info_dict': { + 'id': '2937', + 'ext': 'mp4', + 'title': 'Hannibal charges forward, stops for a cocktail', + 'description': 'md5:545299bda6abf87e5ec666548c6a9448', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'The A.V. Club', + 'uploader_id': 'TheAVClub', + }, + }, { + 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/embed.+?)\1', webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://www.onionstudios.com/embed?id=%s' % video_id, video_id) + + formats = [] + for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage): + if determine_ext(src) != 'm3u8': # m3u8 always results in 403 + formats.append({ + 'url': src, + }) + self._sort_formats(formats) + + title = self._search_regex( + r'share_title\s*=\s*"([^"]+)"', webpage, 'title') + description = self._search_regex( + r'share_description\s*=\s*"([^"]+)"', webpage, + 'description', default=None) + thumbnail = self._search_regex( + r'poster="([^"]+)"', webpage, 'thumbnail', default=False) + + uploader_id = self._search_regex( + r'twitter_handle\s*=\s*"([^"]+)"', + webpage, 'uploader id', fatal=False) + uploader = self._search_regex( + r'window\.channelName\s*=\s*"Embedded:([^"]+)"', + webpage, 'uploader', default=False) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index c23c5ee0f..118ca4832 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ - (?!sets/|likes/?(?:$|[?#])) + (?!sets/|(?:likes|tracks)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -307,6 +307,9 @@ class SoundcloudUserIE(SoundcloudIE): 'title': 'The Royal Concept', }, 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py index a77c6a2fc..5d09eb9a8 100644 --- a/youtube_dl/extractor/thesixtyone.py +++ b/youtube_dl/extractor/thesixtyone.py @@ -1,9 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor from ..utils import unified_strdate @@ -17,7 +14,7 @@ class TheSixtyOneIE(InfoExtractor): song )/(?P<id>[A-Za-z0-9]+)/?$''' _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}' - _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream' + _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream' _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop' _TESTS = [ { @@ -70,14 +67,19 @@ class TheSixtyOneIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - song_id = mobj.group('id') + song_id = self._match_id(url) webpage = self._download_webpage( self._SONG_URL_TEMPLATE.format(song_id), song_id) - song_data = json.loads(self._search_regex( - r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data')) + song_data = self._parse_json(self._search_regex( + r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id) + + if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None): + song_data['audio_server'] = 's3.amazonaws.com' + else: + song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com' + keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']] url = self._SONG_FILE_URL_TEMPLATE.format( "".join(reversed(keys)), **song_data) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 38ff3c1a9..f2ae109f9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -121,20 +121,27 @@ class VKIE(InfoExtractor): if username is None: return - login_form = { - 'act': 'login', - 'role': 'al_frame', - 'expire': '1', + login_page = self._download_webpage( + 'https://vk.com', None, 'Downloading login page') + + login_form = dict(re.findall( + r'<input\s+type="hidden"\s+name="([^"]+)"\s+(?:id="[^"]+"\s+)?value="([^"]*)"', + login_page)) + + login_form.update({ 'email': username.encode('cp1251'), 'pass': password.encode('cp1251'), - } + }) - request = compat_urllib_request.Request('https://login.vk.com/?act=login', - compat_urllib_parse.urlencode(login_form).encode('utf-8')) - login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + request = compat_urllib_request.Request( + 'https://login.vk.com/?act=login', + compat_urllib_parse.urlencode(login_form).encode('utf-8')) + login_page = self._download_webpage( + request, None, note='Logging in as %s' % username) if re.search(r'onLoginFailed', login_page): - raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + raise ExtractorError( + 'Unable to login, incorrect username and/or password', expected=True) def _real_initialize(self): self._login() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3da56c14..d9240ff02 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -32,6 +32,7 @@ from ..utils import ( unescapeHTML, unified_strdate, uppercase_escape, + ISO3166Utils, ) @@ -903,6 +904,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break if 'token' not in video_info: if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) + if regions_allowed is not None: + raise ExtractorError('YouTube said: This video is available in %s only' % ( + ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), + expected=True) raise ExtractorError( 'YouTube said: %s' % video_info['reason'][0], expected=True, video_id=video_id) |