diff options
-rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/canalplus.py | 46 | ||||
-rw-r--r-- | youtube_dl/extractor/criterion.py | 40 | ||||
-rw-r--r-- | youtube_dl/extractor/youjizz.py | 16 |
4 files changed, 102 insertions, 2 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index c00d5a352..cdbd880c7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -6,8 +6,10 @@ from .bandcamp import BandcampIE
 from .bliptv import BlipTVIE, BlipTVUserIE
 from .breakcom import BreakIE
 from .brightcove import BrightcoveIE
+from .canalplus import CanalplusIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
+from .criterion import CriterionIE
 from .cspan import CSpanIE
 from .dailymotion import DailymotionIE
 from .depositfiles import DepositFilesIE
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
new file mode 100644
index 000000000..3b1c88876
--- /dev/null
+++ b/youtube_dl/extractor/canalplus.py
@@ -0,0 +1,46 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+class CanalplusIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
+    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
+    IE_NAME = u'canalplus.fr'
+
+    _TEST = {
+        u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861',
+        u'file': u'889861.flv',
+        u'md5': u'590a888158b5f0d6832f84001fbf3e99',
+        u'info_dict': {
+            u'title': u'Le Petit Journal 20/06/13 - La guerre des drone',
+            u'upload_date': u'20130620',
+        },
+        u'skip': u'Requires rtmpdump'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_url = self._VIDEO_INFO_TEMPLATE % video_id
+        info_page = self._download_webpage(info_url,video_id,
+                                           u'Downloading video info')
+
+        self.report_extraction(video_id)
+        doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
+        video_info = [video for video in doc if video.find('ID').text == video_id][0]
+        infos = video_info.find('INFOS')
+        media = video_info.find('MEDIA')
+        formats = [media.find('VIDEOS/%s' % format)
+                   for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']]
+        video_url = [format.text for format in formats if format is not None][-1]
+
+        return {'id': video_id,
+                'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text,
+                                       infos.find('TITRAGE/SOUS_TITRE').text),
+                'url': video_url,
+                'ext': 'flv',
+                'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
+                'thumbnail': media.find('IMAGES/GRAND').text,
+                }
diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py
new file mode 100644
index 000000000..31fe3d57b
--- /dev/null
+++ b/youtube_dl/extractor/criterion.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class CriterionIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+'
+    _TEST = {
+        u'url': u'http://www.criterion.com/films/184-le-samourai',
+        u'file': u'184.mp4',
+        u'md5': u'bc51beba55685509883a9a7830919ec3',
+        u'info_dict': {
+            u"title": u"Le Samouraï",
+            u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        webpage = self._download_webpage(url, video_id)
+
+        final_url = self._search_regex(r'so.addVariable\("videoURL", "(.+?)"\)\;',
+                                       webpage, 'video url')
+        title = self._html_search_regex(r'<meta content="(.+?)" property="og:title" />',
+                                        webpage, 'video title')
+        description = self._html_search_regex(r'<meta name="description" content="(.+?)" />',
+                                              webpage, 'video description')
+        thumbnail = self._search_regex(r'so.addVariable\("thumbnailURL", "(.+?)"\)\;',
+                                       webpage, 'thumbnail url')
+
+        return {'id': video_id,
+                'url' : final_url,
+                'title': title,
+                'ext': determine_ext(final_url),
+                'description': description,
+                'thumbnail': thumbnail,
+                }
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index 6f022670c..1265639e8 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -40,8 +40,20 @@ class YouJizzIE(InfoExtractor):
         webpage = self._download_webpage(embed_page_url, video_id)
 
         # Get the video URL
-        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
-                                       webpage, u'video URL')
+        m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
+        if m_playlist is not None:
+            playlist_url = m_playlist.group('playlist')
+            playlist_page = self._download_webpage(playlist_url, video_id,
+                                                   u'Downloading playlist page')
+            m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page))
+            if len(m_levels) == 0:
+                raise ExtractorError(u'Unable to extract video url')
+            videos = [(int(m.group(1)), m.group(2)) for m in m_levels]
+            (_, video_url) = sorted(videos)[0]
+            video_url = video_url.replace('%252F', '%2F')
+        else:
+            video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+                                           webpage, u'video URL')
 
         info = {'id': video_id,
                 'url': video_url,
|