diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 15 | ||||
-rw-r--r-- | youtube_dl/extractor/infoq.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/lynda.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/newstube.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/snagfilms.py | 171 | ||||
-rw-r--r-- | youtube_dl/extractor/vk.py | 25 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 7 |
8 files changed, 224 insertions, 17 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 46cc4cd06..7e74a971d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -493,6 +493,10 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .snagfilms import ( + SnagFilmsIE, + SnagFilmsEmbedIE, +) from .snotr import SnotrIE from .sohu import SohuIE from .soompi import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 42e4e7035..32e41d13e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,6 +47,7 @@ from .xhamster import XHamsterEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE +from .snagfilms import SnagFilmsEmbedIE class GenericIE(InfoExtractor): @@ -849,6 +850,15 @@ class GenericIE(InfoExtractor): 'uploader_id': 'clickhole', } }, + # SnagFilms embed + { + 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + } + }, # AdobeTVVideo embed { 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', @@ -1550,6 +1560,11 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) + # Look for SnagFilms embeds + snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) + if snagfilms_url: + return self.url_result(snagfilms_url) + # Look for AdobeTVVideo embeds mobj = re.search( r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index f25f43664..117a7faf6 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -9,9 +9,9 @@ from ..compat import ( class InfoQIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things', 'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2', 'info_dict': { @@ -20,7 +20,10 @@ class InfoQIE(InfoExtractor): 'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.', 'title': 'A Few of My Favorite [Python] Things', }, - } + }, { + 'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index cfd3b14f4..a00f6e5e5 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -30,13 +30,13 @@ class LyndaBaseIE(InfoExtractor): return login_form = { - 'username': username, - 'password': password, + 'username': username.encode('utf-8'), + 'password': password.encode('utf-8'), 'remember': 'false', 'stayPut': 'false' } request = compat_urllib_request.Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(login_form)) + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) login_page = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -65,7 +65,7 @@ class LyndaBaseIE(InfoExtractor): 'stayPut': 'false', } request = compat_urllib_request.Request( - self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form)) + self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8')) login_page = self._download_webpage( request, None, 'Confirming log in and log out from another device') diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index 85fcad06b..5a9e73cd6 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -31,7 +31,7 @@ class NewstubeIE(InfoExtractor): page = self._download_webpage(url, video_id, 'Downloading page') video_guid = self._html_search_regex( - r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', page, 'video GUID') player = self._download_xml( diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py new file mode 100644 index 000000000..cf495f310 --- /dev/null +++ b/youtube_dl/extractor/snagfilms.py @@ -0,0 +1,171 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + int_or_none, + js_to_json, + parse_duration, +) + + +class SnagFilmsEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' + _TESTS = [{ + 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', + 'md5': '2924e9215c6eff7a55ed35b72276bd93', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + } + }, { + 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>This film is not playable in your area.<' in webpage: + raise ExtractorError( + 'Film %s is not playable in your area.' % video_id, expected=True) + + formats = [] + for source in self._parse_json(js_to_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + file_ = source.get('file') + if not file_: + continue + type_ = source.get('type') + format_id = source.get('label') + ext = determine_ext(file_) + if any(_ == 'm3u8' for _ in (type_, ext)): + formats.extend(self._extract_m3u8_formats( + file_, video_id, 'mp4', m3u8_id='hls')) + else: + bitrate = int_or_none(self._search_regex( + r'(\d+)kbps', file_, 'bitrate', default=None)) + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': file_, + 'format_id': format_id, + 'tbr': bitrate, + 'height': height, + }) + self._sort_formats(formats) + + title = self._search_regex( + [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'], + webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } + + +class SnagFilmsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)' + _TESTS = [{ + 'url': 'http://www.snagfilms.com/films/title/lost_for_life', + 'md5': '19844f897b35af219773fd63bdec2942', + 'info_dict': { + 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', + 'display_id': 'lost_for_life', + 'ext': 'mp4', + 'title': 'Lost for Life', + 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 4489, + 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + } + }, { + 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', + 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', + 'info_dict': { + 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', + 'display_id': 'the_world_cut_project/india', + 'ext': 'mp4', + 'title': 'India', + 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 979, + 'categories': ['Documentary', 'Sports', 'Politics'] + } + }, { + # Film is not playable in your area. + 'url': 'http://www.snagfilms.com/films/title/inside_mecca', + 'only_matching': True, + }, { + # Film is not available. + 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + if ">Sorry, the Film you're looking for is not available.<" in webpage: + raise ExtractorError( + 'Film %s is not available.' % display_id, expected=True) + + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + + snag = self._parse_json( + self._search_regex( + 'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'<span itemprop="duration" class="film-duration strong">([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + } diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 38ff3c1a9..f2ae109f9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -121,20 +121,27 @@ class VKIE(InfoExtractor): if username is None: return - login_form = { - 'act': 'login', - 'role': 'al_frame', - 'expire': '1', + login_page = self._download_webpage( + 'https://vk.com', None, 'Downloading login page') + + login_form = dict(re.findall( + r'<input\s+type="hidden"\s+name="([^"]+)"\s+(?:id="[^"]+"\s+)?value="([^"]*)"', + login_page)) + + login_form.update({ 'email': username.encode('cp1251'), 'pass': password.encode('cp1251'), - } + }) - request = compat_urllib_request.Request('https://login.vk.com/?act=login', - compat_urllib_parse.urlencode(login_form).encode('utf-8')) - login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + request = compat_urllib_request.Request( + 'https://login.vk.com/?act=login', + compat_urllib_parse.urlencode(login_form).encode('utf-8')) + login_page = self._download_webpage( + request, None, note='Logging in as %s' % username) if re.search(r'onLoginFailed', login_page): - raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + raise ExtractorError( + 'Unable to login, incorrect username and/or password', expected=True) def _real_initialize(self): self._login() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3da56c14..d9240ff02 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -32,6 +32,7 @@ from ..utils import ( unescapeHTML, unified_strdate, uppercase_escape, + ISO3166Utils, ) @@ -903,6 +904,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break if 'token' not in video_info: if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) + if regions_allowed is not None: + raise ExtractorError('YouTube said: This video is available in %s only' % ( + ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), + expected=True) raise ExtractorError( 'YouTube said: %s' % video_info['reason'][0], expected=True, video_id=video_id) |