diff options
-rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 28 | ||||
-rw-r--r-- | youtube_dl/extractor/toutv.py | 75 | ||||
-rw-r--r-- | youtube_dl/utils.py | 2 |
4 files changed, 106 insertions, 0 deletions
# Helper methods added to InfoExtractor (youtube_dl/extractor/common.py).
# Each one receives the extractor instance as ``self``.

def _html_search_meta(self, name, html, display_name=None):
    """Look up the ``content`` attribute of a ``<meta>`` tag in *html*.

    The tag is selected by a ``name`` or ``property`` attribute equal to
    *name* (matched case-insensitively, regex-escaped).

    name         -- value of the tag's name/property attribute
    html         -- page source to search
    display_name -- label handed to ``_html_search_regex``; defaults to
                    *name*

    Returns whatever ``self._html_search_regex`` returns for the single
    capture group. The search is requested with ``fatal=False``.
    NOTE(review): presumably that means a falsy value (None) on a miss,
    which is how ``_media_rating_search`` treats it -- confirm against
    ``_html_search_regex``.
    """
    if display_name is None:
        display_name = name
    # The value must be closed by the same quote character that opened it.
    # Stopping at the first quote of either kind would truncate values such
    # as content="L'Equipe".  The look-behinds pick the branch that matches
    # the opening quote while keeping exactly one capture group.
    return self._html_search_regex(
        r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
                [^>]+content=["\']
                ((?<=")[^"]+(?=")|(?<=\')[^\']+(?=\'))
                ["\']''' % re.escape(name),
        html, display_name, fatal=False)


def _dc_search_uploader(self, html):
    """Return the ``dc.creator`` meta value of *html*, labelled 'uploader'."""
    return self._html_search_meta('dc.creator', html, 'uploader')


def _media_rating_search(self, html):
    """Map the page's ``rating`` meta value to a numeric age limit.

    Returns the integer from RATING_TABLE for a known rating (compared
    lower-cased), or None when the tag is missing, empty or unknown.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)

    if not rating:
        return None

    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return RATING_TABLE.get(rating.lower())
# coding: utf-8
import re
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    unified_strdate,
)


class TouTvIE(InfoExtractor):
    """Extractor for episode pages on www.tou.tv.

    Matches URLs of the form ``http(s)://www.tou.tv/<show>/S<nn>E<nn>``; the
    ``id`` group spans both the show slug and the episode part.
    """
    IE_NAME = u'tou.tv'
    _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'

    _TEST = {
        u'url': u'http://www.tou.tv/30-vies/S04E41',
        u'file': u'30-vies_S04E41.mp4',
        u'info_dict': {
            u'title': u'30 vies Saison 4 / Épisode 41',
            u'description': u'md5:da363002db82ccbe4dafeb9cab039b09',
            u'age_limit': 8,
            u'uploader': u'Groupe des Nouveaux Médias',
            u'duration': 1296,
            u'upload_date': u'20131118',
            u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
        },
        u'params': {
            u'skip_download': True,  # Requires rtmpdump
        },
        # NOTE(review): the key is 'xskip', not 'skip' -- presumably a
        # deliberately disabled skip marker; confirm before renaming.
        u'xskip': 'Only available in Canada'
    }

    def _real_extract(self, url):
        """Build the info dict for the tou.tv episode at *url*.

        Downloads the episode page, reads its ``idMedia`` value, fetches the
        stream list from release.theplatform.com and picks the first
        ``choice/url`` entry that is not a doubleclick ad.

        Raises ExtractorError when no usable stream URL is listed, or when
        the chosen URL ends in ``/Unavailable.flv``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        media_id = self._search_regex(
            r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')

        # TODO test from de
        # NOTE(review): "de" presumably means Germany (geo-block) -- confirm.
        streams_url = u'http://release.theplatform.com/content.select?pid=' + media_id
        streams_webpage = self._download_webpage(
            streams_url, video_id, note=u'Downloading stream list')

        streams_doc = xml.etree.ElementTree.fromstring(
            streams_webpage.encode('utf-8'))
        # Give next() a default so an empty or ad-only stream list surfaces
        # as an ExtractorError instead of a bare StopIteration, and skip
        # <url> elements without text (Element.text is None for those).
        video_url = next(
            (node.text
             for node in streams_doc.findall('.//choice/url')
             if node.text and u'//ad.doubleclick' not in node.text),
            None)
        if video_url is None:
            raise ExtractorError(u'Unable to find a stream URL')
        if video_url.endswith('/Unavailable.flv'):
            raise ExtractorError(
                u'Access to this video is blocked from outside of Canada',
                expected=True)

        duration_str = self._html_search_meta(
            'video:duration', webpage, u'duration')
        try:
            duration = int(duration_str) if duration_str else None
        except ValueError:
            # Duration is optional metadata; a malformed value must not
            # abort the whole extraction.
            duration = None
        upload_date_str = self._html_search_meta(
            'video:release_date', webpage, u'upload date')
        upload_date = unified_strdate(upload_date_str) if upload_date_str else None

        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'url': video_url,
            'description': self._og_search_description(webpage),
            'uploader': self._dc_search_uploader(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'age_limit': self._media_rating_search(webpage),
            'duration': duration,
            'upload_date': upload_date,
            'ext': 'mp4',
        }