diff options
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/instagram.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/radiofrance.py | 34 | ||||
-rw-r--r-- | youtube_dl/extractor/rts.py | 61 | ||||
-rw-r--r-- | youtube_dl/utils.py | 36 |
6 files changed, 114 insertions, 22 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3e728e876..245860140 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -195,6 +195,7 @@ from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE +from .rts import RTSIE from .rutube import ( RutubeIE, RutubeChannelIE, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 647720c8a..78f238f84 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -74,7 +74,7 @@ class InfoExtractor(object): "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted - by this field. + by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. * quality Order number of the video quality of this diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 994f0e4ae..b5372bf7a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -89,7 +89,7 @@ class InstagramUserIE(InfoExtractor): 'uploader': user.get('full_name'), 'uploader_id': user.get('username'), 'like_count': like_count, - 'upload_timestamp': int_or_none(it.get('created_time')), + 'timestamp': int_or_none(it.get('created_time')), }) if not page['items']: diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py index 34652f6c1..09352ed82 100644 --- a/youtube_dl/extractor/radiofrance.py +++ b/youtube_dl/extractor/radiofrance.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,16 +8,17 @@ from .common import InfoExtractor class RadioFranceIE(InfoExtractor): _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)' - IE_NAME = u'radiofrance' + IE_NAME = 'radiofrance' _TEST = { - u'url': u'http://maison.radiofrance.fr/radiovisions/one-one', - u'file': u'one-one.ogg', - u'md5': u'bdbb28ace95ed0e04faab32ba3160daf', - u'info_dict': { - u"title": u"One to one", - u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", - u"uploader": u"Thomas Hercouët", + 'url': 'http://maison.radiofrance.fr/radiovisions/one-one', + 'md5': 'bdbb28ace95ed0e04faab32ba3160daf', + 'info_dict': { + 'id': 'one-one', + 'ext': 'ogg', + "title": "One to one", + "description": "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.", + "uploader": "Thomas Hercouët", }, } @@ -24,27 +27,28 @@ class RadioFranceIE(InfoExtractor): video_id = m.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title') + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') description = self._html_search_regex( r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>', - webpage, u'description', fatal=False) + webpage, 'description', fatal=False) uploader = self._html_search_regex( r'<div class="credit"> © (.*?)</div>', - webpage, u'uploader', fatal=False) + webpage, 'uploader', fatal=False) formats_str = self._html_search_regex( r'class="jp-jplayer[^"]*" data-source="([^"]+)">', - webpage, u'audio URLs') + webpage, 'audio URLs') formats = [ { 'format_id': fm[0], 'url': fm[1], 'vcodec': 'none', + 'preference': i, } - for fm in - re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str) + for i, fm in + enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) ] - # No sorting, we don't know any more about these formats + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py new file mode 100644 index 000000000..f211637a7 --- /dev/null +++ b/youtube_dl/extractor/rts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + unescapeHTML, +) + + +class RTSIE(InfoExtractor): + IE_DESC = 'RTS.ch' + _VALID_URL = r'^https?://(?:www\.)?rts\.ch/archives/tv/[^/]+/(?P<id>[0-9]+)-.*?\.html' + + _TEST = { + 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', + 'md5': '753b877968ad8afaeddccc374d4256a5', + 'info_dict': { + 'id': '3449373', + 'ext': 'mp4', + 'duration': 1488, + 'title': 'Les Enfants Terribles', + 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', + 'uploader': 'Divers', + 'upload_date': '19680921', + 'timestamp': -40280400, + }, + } + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + all_info = self._download_json( + 'http://www.rts.ch/a/%s.html?f=json/article' % video_id, video_id) + info = all_info['video']['JSONinfo'] + + upload_timestamp = parse_iso8601(info.get('broadcast_date')) + duration = parse_duration(info.get('duration')) + thumbnail = unescapeHTML(info.get('preview_image_url')) + formats = [{ + 'format_id': fid, + 'url': furl, + 'tbr': int_or_none(self._search_regex( + r'-([0-9]+)k\.', furl, 'bitrate', default=None)), + } for fid, furl in info['streams'].items()] + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': info['title'], + 'description': info.get('intro'), + 'duration': duration, + 'uploader': info.get('programName'), + 'timestamp': upload_timestamp, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8b359cb77..68d590ba2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import calendar import contextlib import ctypes import datetime @@ -501,13 +502,13 @@ def orderedSet(iterable): res.append(el) return res + def unescapeHTML(s): - """ - @param s a string - """ - assert type(s) == type(u'') + if s is None: + return None + assert type(s) == compat_str - result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) + result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s) return result @@ -761,6 +762,31 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): https_response = http_response +def parse_iso8601(date_str): + """ Return a UNIX timestamp from the given date """ + + if date_str is None: + return None + + m = re.search( + r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group(0))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + + dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone + return calendar.timegm(dt.timetuple()) + + def unified_strdate(date_str): """Return a string with the date in the format YYYYMMDD""" |