diff options
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/elpais.py | 58 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 6 | 
3 files changed, 63 insertions, 2 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 30b993b45..4e0501ec3 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -54,6 +54,7 @@ from .ebaumsworld import EbaumsWorldIE
 from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .eitb import EitbIE
+from .elpais import ElPaisIE
 from .escapist import EscapistIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py
new file mode 100644
index 000000000..291400152
--- /dev/null
+++ b/youtube_dl/extractor/elpais.py
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class ElPaisIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
+    IE_DESCR = 'El País'
+
+    _TEST = {
+        'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
+        'md5': '98406f301f19562170ec071b83433d55',
+        'info_dict': {
+            'id': 'tiempo-nuevo-recetas-viejas',
+            'ext': 'mp4',
+            'title': 'Tiempo nuevo, recetas viejas',
+            'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
+            'upload_date': '20140206',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        prefix = self._html_search_regex(
+            r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
+        video_suffix = self._search_regex(
+            r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
+        video_url = prefix + video_suffix
+        thumbnail_suffix = self._search_regex(
+            r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
+            fatal=False)
+        thumbnail = (
+            None if thumbnail_suffix is None
+            else prefix + thumbnail_suffix)
+        title = self._html_search_regex(
+            '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
+            webpage, 'title')
+        date_str = self._search_regex(
+            r'<p class="date-header date-int updated"\s+title="([^"]+)">',
+            webpage, 'upload date', fatal=False)
+        upload_date = (None if date_str is None else unified_strdate(date_str))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 8fa4cb67f..01c8c017d 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -751,13 +751,14 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
     https_request = http_request
     https_response = http_response
 
+
 def unified_strdate(date_str):
     """Return a string with the date in the format YYYYMMDD"""
     upload_date = None
     #Replace commas
     date_str = date_str.replace(',',' ')
     # %z (UTC offset) is only supported in python>=3.2
-    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
+    date_str = re.sub(r' ?(\+|-)[0-9:]*$', '', date_str)
     format_expressions = [
         '%d %B %Y',
         '%B %d %Y',
@@ -771,11 +772,12 @@ def unified_strdate(date_str):
         '%Y-%m-%dT%H:%M:%S.%fZ',
         '%Y-%m-%dT%H:%M:%S.%f0Z',
         '%Y-%m-%dT%H:%M:%S',
+        '%Y-%m-%dT%H:%M',
     ]
     for expression in format_expressions:
         try:
             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
-        except:
+        except ValueError:
             pass
     if upload_date is None:
         timetuple = email.utils.parsedate_tz(date_str)
