diff options
-rw-r--r-- youtube_dl/extractor/extractors.py | 5
-rw-r--r-- youtube_dl/extractor/r7.py | 95
2 files changed, 64 insertions, 36 deletions
Summary of this patch (youtube-dl, r7 extractor):
  * extractors.py: import R7ArticleIE alongside R7IE.
  * r7.py, R7IE: stop parsing the `var item = {...}` object out of the
    player page; download JSON from player-api.r7.com instead. Formats are
    built from `media_url_hls` (HLS) and `media_url` (plain HTTP), and a
    `description` field is now returned.
  * r7.py, R7ArticleIE (new): downloads an article page, extracts the
    24-hex-digit video id from the player <div>, and hands off to R7IE.
    Its suitable() returns False for any URL that R7IE already accepts.

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 2ff867651..b1b04f2fc 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -631,7 +631,10 @@ from .qqmusic import (
     QQMusicToplistIE,
     QQMusicPlaylistIE,
 )
-from .r7 import R7IE
+from .r7 import (
+    R7IE,
+    R7ArticleIE,
+)
 from .radiocanada import (
     RadioCanadaIE,
     RadioCanadaAudioVideoIE,
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
index 976c8feec..069dbfaed 100644
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@@ -2,22 +2,19 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import (
-    js_to_json,
-    unescapeHTML,
-    int_or_none,
-)
+from ..utils import int_or_none
 
 
 class R7IE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
+    _VALID_URL = r'''(?x)
+                        https?://
                         (?:
                             (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
                             noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
                             player\.r7\.com/video/i/
                         )
                         (?P<id>[\da-f]{24})
-                        '''
+                    '''
     _TESTS = [{
         'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
         'md5': '403c4e393617e8e8ddc748978ee8efde',
@@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
             'id': '54e7050b0cf2ff57e0279389',
             'ext': 'mp4',
             'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'description': 'md5:01812008664be76a6479aa58ec865b72',
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 98,
             'like_count': int,
@@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://player.r7.com/video/i/%s' % video_id, video_id)
+        video = self._download_json(
+            'http://player-api.r7.com/video/i/%s' % video_id, video_id)
 
-        item = self._parse_json(js_to_json(self._search_regex(
-            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
-
-        title = unescapeHTML(item['title'])
-        thumbnail = item.get('init', {}).get('thumbUri')
-        duration = None
-
-        statistics = item.get('statistics', {})
-        like_count = int_or_none(statistics.get('likes'))
-        view_count = int_or_none(statistics.get('views'))
+        title = video['title']
 
         formats = []
-        for format_key, format_dict in item['playlist'][0].items():
-            src = format_dict.get('src')
-            if not src:
-                continue
-            format_id = format_dict.get('format') or format_key
-            if duration is None:
-                duration = format_dict.get('duration')
-            if '.f4m' in src:
-                formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
-            elif src.endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
-            else:
-                formats.append({
-                    'url': src,
-                    'format_id': format_id,
-                })
+        media_url_hls = video.get('media_url_hls')
+        if media_url_hls:
+            formats.extend(self._extract_m3u8_formats(
+                media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        media_url = video.get('media_url')
+        if media_url:
+            f = {
+                'url': media_url,
+                'format_id': 'http',
+            }
+            # m3u8 format always matches the http format, let's copy metadata from
+            # one to another
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                formats))
+            if len(m3u8_formats) == 1:
+                f_copy = m3u8_formats[0].copy()
+                f_copy.update(f)
+                f_copy['protocol'] = 'http'
+                f = f_copy
+            formats.append(f)
         self._sort_formats(formats)
 
+        description = video.get('description')
+        thumbnail = video.get('thumb')
+        duration = int_or_none(video.get('media_duration'))
+        like_count = int_or_none(video.get('likes'))
+        view_count = int_or_none(video.get('views'))
+
         return {
             'id': video_id,
             'title': title,
+            'description': description,
             'thumbnail': thumbnail,
             'duration': duration,
             'like_count': like_count,
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class R7ArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
+            webpage, 'video id')
+
+        return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key())
