From faab1d3836ca6c2a3c28ee02efe25d211282f45f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Sep 2013 14:38:41 +0200 Subject: [youtube] Fix detection of feeds urls (fixes #1294) Urls like https://www.youtube.com/feed/watch_later were being detected as users (before the last changes to YoutubeUserIE, as videos) --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 98a44f333..62aecea02 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1015,14 +1015,14 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = u'youtube:user' def suitable(cls, url): - if YoutubeIE.suitable(url): return False + if YoutubeIE.suitable(url) or YoutubeFavouritesIE.suitable(url): return False else: return super(YoutubeUserIE, cls).suitable(url) def _real_extract(self, url): -- cgit v1.2.3 From e3ea47908747bff4b46b4000fb1de944b400c21a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Sep 2013 16:24:24 +0200 Subject: [youtube] Fix some issues with the detection of playlist/channel urls (reported in #1374) They were being caught by YoutubeUserIE, now it only extracts a url if the rest of extractors aren't suitable. Now the url tests check that the urls can only be extracted with a specific extractor. 
--- youtube_dl/extractor/youtube.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 62aecea02..423a5e973 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -386,7 +386,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False + if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_video_webpage_download(self, video_id): @@ -1021,8 +1021,12 @@ class YoutubeUserIE(InfoExtractor): _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = u'youtube:user' + @classmethod def suitable(cls, url): - if YoutubeIE.suitable(url) or YoutubeFavouritesIE.suitable(url): return False + # Don't return True if the url can be extracted with another youtube + # extractor; the regex is too permissive and it would match. 
+ other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_ies): return False else: return super(YoutubeUserIE, cls).suitable(url) def _real_extract(self, url): -- cgit v1.2.3 From 7e77275293bac0514253c1d38b8d19f926a69d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Sep 2013 18:08:07 +0200 Subject: Add an extractor for Metacritic --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/metacritic.py | 55 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/metacritic.py (limited to 'youtube_dl') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 70ebd29e2..fbe0b8cb7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -52,6 +52,7 @@ from .keek import KeekIE from .liveleak import LiveLeakIE from .livestream import LivestreamIE from .metacafe import MetacafeIE +from .metacritic import MetacriticIE from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mtv import MTVIE diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py new file mode 100644 index 000000000..449138b56 --- /dev/null +++ b/youtube_dl/extractor/metacritic.py @@ -0,0 +1,55 @@ +import re +import xml.etree.ElementTree +import operator + +from .common import InfoExtractor + + +class MetacriticIE(InfoExtractor): + _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P<id>\d+)' + + _TEST = { + u'url': u'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', + u'file': u'3698222.mp4', + u'info_dict': { + u'title': u'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', + u'description': u'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', + u'duration': 221, + }, + } + + 
def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + # The xml is not well formatted, there are raw '&' + info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, + video_id, u'Downloading info xml').replace('&', '&amp;') + info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + + clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) + formats = [] + for videoFile in clip.findall('httpURI/videoFile'): + rate_str = videoFile.find('rate').text + video_url = videoFile.find('filePath').text + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': rate_str, + 'rate': int(rate_str), + }) + formats.sort(key=operator.itemgetter('rate')) + + description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>', + webpage, u'description', flags=re.DOTALL) + + info = { + 'id': video_id, + 'title': clip.find('title').text, + 'formats': formats, + 'description': description, + 'duration': int(clip.find('duration').text), + } + # TODO: Remove when #980 has been merged + info.update(formats[-1]) + return info -- cgit v1.2.3 From a490fda7464a3cb9d7b5938305241740bae69efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Sep 2013 18:36:07 +0200 Subject: [dailymotion] accept embed urls (fixes #1386) --- youtube_dl/extractor/dailymotion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 439033d23..3c616e089 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -14,7 +14,7 @@ from ..utils import ( class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' IE_NAME = u'dailymotion' _TEST = { u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech', @@ -33,6 +33,7 @@ class DailymotionIE(InfoExtractor): video_id = mobj.group(1).split('_')[0].split('?')[0] video_extension = 'mp4' + url = 'http://www.dailymotion.com/video/%s' % video_id # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url) -- cgit v1.2.3 From a7130543fa0368175740f5fa173ef920671db866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 6 Sep 2013 18:39:35 +0200 Subject: [generic] If the url doesn't specify the protocol, then try to extract prepending 'http://' --- youtube_dl/extractor/generic.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'youtube_dl') diff --git 
a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index de7379a92..f92e61fea 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -109,6 +109,11 @@ class GenericIE(InfoExtractor): return new_url def _real_extract(self, url): + parsed_url = compat_urlparse.urlparse(url) + if not parsed_url.scheme: + self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') + return self.url_result('http://' + url) + try: new_url = self._test_redirect(url) if new_url: -- cgit v1.2.3