diff options
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/canalplus.py | 46 | ||||
| -rw-r--r-- | youtube_dl/extractor/criterion.py | 40 | ||||
| -rw-r--r-- | youtube_dl/extractor/youjizz.py | 16 | 
4 files changed, 102 insertions, 2 deletions
| diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c00d5a352..cdbd880c7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,8 +6,10 @@ from .bandcamp import BandcampIE  from .bliptv import BlipTVIE, BlipTVUserIE  from .breakcom import BreakIE  from .brightcove import BrightcoveIE +from .canalplus import CanalplusIE  from .collegehumor import CollegeHumorIE  from .comedycentral import ComedyCentralIE +from .criterion import CriterionIE  from .cspan import CSpanIE  from .dailymotion import DailymotionIE  from .depositfiles import DepositFilesIE diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py new file mode 100644 index 000000000..3b1c88876 --- /dev/null +++ b/youtube_dl/extractor/canalplus.py @@ -0,0 +1,46 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import unified_strdate + +class CanalplusIE(InfoExtractor): +    _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)' +    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' +    IE_NAME = u'canalplus.fr' + +    _TEST = { +        u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', +        u'file': u'889861.flv', +        u'md5': u'590a888158b5f0d6832f84001fbf3e99', +        u'info_dict': { +            u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', +            u'upload_date': u'20130620', +        }, +        u'skip': u'Requires rtmpdump' +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        info_url = self._VIDEO_INFO_TEMPLATE % video_id +        info_page = self._download_webpage(info_url,video_id,  +                                           u'Downloading video info') + +        self.report_extraction(video_id) +        doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) +        video_info = [video for video in doc if video.find('ID').text == video_id][0] +        infos = video_info.find('INFOS') +        media = video_info.find('MEDIA') +        formats = [media.find('VIDEOS/%s' % format) +            for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] +        video_url = [format.text for format in formats if format is not None][-1] + +        return {'id': video_id, +                'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, +                                       infos.find('TITRAGE/SOUS_TITRE').text), +                'url': video_url, +                'ext': 'flv', +                'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), +                'thumbnail': media.find('IMAGES/GRAND').text, +                } diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py new file mode 100644 index 000000000..31fe3d57b --- /dev/null +++ b/youtube_dl/extractor/criterion.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class CriterionIE(InfoExtractor): +    _VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+' +    _TEST = { +        u'url': u'http://www.criterion.com/films/184-le-samourai', +        u'file': u'184.mp4', +        u'md5': u'bc51beba55685509883a9a7830919ec3', +        u'info_dict': { +            u"title": u"Le Samouraï", +            u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group(1) +        webpage = self._download_webpage(url, video_id) + +        final_url = self._search_regex(r'so.addVariable\("videoURL", "(.+?)"\)\;', +                                webpage, 'video url') +        title = self._html_search_regex(r'<meta content="(.+?)" property="og:title" />', +                                webpage, 'video title') +        description = self._html_search_regex(r'<meta name="description" content="(.+?)" />', +                                webpage, 'video description') +        thumbnail = self._search_regex(r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', +                                webpage, 'thumbnail url') + +        return {'id': video_id, +                'url' : final_url, +                'title': title, +                'ext': determine_ext(final_url), +                'description': description, +                'thumbnail': thumbnail, +                } diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 6f022670c..1265639e8 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -40,8 +40,20 @@ class YouJizzIE(InfoExtractor):          webpage = self._download_webpage(embed_page_url, video_id)          # Get the video URL -        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', -            webpage, u'video URL') +        m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage) +        if m_playlist is not None: +            playlist_url = m_playlist.group('playlist') +            playlist_page = self._download_webpage(playlist_url, video_id, +                                                   u'Downloading playlist page') +            m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page)) +            if len(m_levels) == 0: +                raise ExtractorError(u'Unable to extract video url') +            videos = [(int(m.group(1)), m.group(2)) for m in m_levels] +            (_, video_url) = sorted(videos)[0] +            video_url = video_url.replace('%252F', '%2F') +        else: +            video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', +                                           webpage, u'video URL')          info = {'id': video_id,                  'url': video_url, | 
