diff options
-rwxr-xr-x | youtube_dl/InfoExtractors.py | 69 | ||||
-rw-r--r-- | youtube_dl/extractor/dailymotion.py | 77 |
2 files changed, 78 insertions, 68 deletions
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index fcc5d02cf..b32bd3d94 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -24,80 +24,13 @@ from .utils import * from .extractor.common import InfoExtractor, SearchInfoExtractor +from .extractor.dailymotion import DailymotionIE from .extractor.metacafe import MetacafeIE from .extractor.statigram import StatigramIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE -class DailymotionIE(InfoExtractor): - """Information Extractor for Dailymotion""" - - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' - IE_NAME = u'dailymotion' - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1).split('_')[0].split('?')[0] - - video_extension = 'mp4' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - webpage = self._download_webpage(request, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'\s*var flashvars = (.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - flashvars = compat_urllib_parse.unquote(mobj.group(1)) - - for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: - if key in flashvars: - max_quality = key - self.to_screen(u'Using %s' % key) - break - else: - raise ExtractorError(u'Unable to extract video URL') - - mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - - video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - - # TODO: support choosing qualities - - mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) - - video_uploader = None - video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', - # Looking for official user - r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], - webpage, 'video uploader') - - video_upload_date = None - mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage) - if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - }] class PhotobucketIE(InfoExtractor): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py new file mode 100644 index 000000000..34306b073 --- /dev/null +++ b/youtube_dl/extractor/dailymotion.py @@ -0,0 +1,77 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + compat_urllib_parse, + + ExtractorError, + unescapeHTML, +) + +class DailymotionIE(InfoExtractor): + """Information Extractor for Dailymotion""" + + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' + IE_NAME = u'dailymotion' + + def _real_extract(self, url): + # Extract id and simplified title from URL + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group(1).split('_')[0].split('?')[0] + + video_extension = 'mp4' + + # Retrieve video webpage to extract further information + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'family_filter=off') + webpage = self._download_webpage(request, video_id) + + # Extract URL, uploader and title from webpage + self.report_extraction(video_id) + mobj = re.search(r'\s*var flashvars = (.*)', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + flashvars = compat_urllib_parse.unquote(mobj.group(1)) + + for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: + if key in flashvars: + max_quality = key + self.to_screen(u'Using %s' % key) + break + else: + raise ExtractorError(u'Unable to extract video URL') + + mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) + if mobj is None: + raise ExtractorError(u'Unable to extract video URL') + + video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') + + # TODO: support choosing qualities + + mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = unescapeHTML(mobj.group('title')) + + video_uploader = None + video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', + # Looking for official user + r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], + webpage, 'video uploader') + + video_upload_date = None + mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage) + if mobj is not None: + video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) + + return [{ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'title': video_title, + 'ext': video_extension, + }] |