diff options
| author | Sergey M․ <dstftw@gmail.com> | 2016-12-04 23:20:14 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-12-04 23:35:21 +0700 | 
| commit | 9b5288c92ae43436a5d48775bbe1ee537588625f (patch) | |
| tree | b7cc0ba1731bd77746c37f28b5f1866c64c9f904 | |
| parent | 83442966194640d9bc00e7f3086aa5e8b25c4ae3 (diff) | |
[1tv] Improve extraction and add support for playlists (closes #11335)
| -rw-r--r-- | youtube_dl/extractor/firsttv.py | 105 | 
1 files changed, 70 insertions, 35 deletions
| diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 6b662cc3c..4463d3d20 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,7 +2,10 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( +    compat_str, +    compat_urlparse, +)  from ..utils import (      int_or_none,      qualities, @@ -22,8 +25,7 @@ class FirstTVIE(InfoExtractor):          'info_dict': {              'id': '40049',              'ext': 'mp4', -            'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', -            'description': 'md5:36a39c1d19618fec57d12efe212a8370', +            'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015',              'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',              'upload_date': '20150212',              'duration': 2694, @@ -34,8 +36,7 @@ class FirstTVIE(InfoExtractor):          'info_dict': {              'id': '364746',              'ext': 'mp4', -            'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', -            'description': 'md5:a242eea0031fd180a4497d52640a9572', +            'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016',              'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',              'upload_date': '20160407',              'duration': 179, @@ -44,6 +45,17 @@ class FirstTVIE(InfoExtractor):          'params': {              'skip_download': True,          }, +    }, { +        'url': 'http://www.1tv.ru/news/issue/2016-12-01/14:00', +        'info_dict': { +            'id': '14:00', +            'title': 'Выпуск новостей в 14:00   1 декабря 2016 года. Новости. Первый канал', +            'description': 'md5:2e921b948f8c1ff93901da78ebdb1dfd', +        }, +        'playlist_count': 13, +    }, { +        'url': 'http://www.1tv.ru/shows/tochvtoch-supersezon/vystupleniya/evgeniy-dyatlov-vladimir-vysockiy-koni-priveredlivye-toch-v-toch-supersezon-fragment-vypuska-ot-06-11-2016', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -51,43 +63,66 @@ class FirstTVIE(InfoExtractor):          webpage = self._download_webpage(url, display_id)          playlist_url = compat_urlparse.urljoin(url, self._search_regex( -            r'data-playlist-url="([^"]+)', webpage, 'playlist url')) +            r'data-playlist-url=(["\'])(?P<url>(?:(?!\1).)+)\1', +            webpage, 'playlist url', group='url')) + +        parsed_url = compat_urlparse.urlparse(playlist_url) +        qs = compat_urlparse.parse_qs(parsed_url.query) +        item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]') + +        items = self._download_json(playlist_url, display_id) + +        if item_ids: +            items = [ +                item for item in items +                if item.get('uid') and compat_str(item['uid']) in item_ids] +        else: +            items = [items[0]] + +        entries = [] +        QUALITIES = ('ld', 'sd', 'hd', ) + +        for item in items: +            title = item['title'] +            quality = qualities(QUALITIES) +            formats = [] +            for f in item.get('mbr', []): +                src = f.get('src') +                if not src or not isinstance(src, compat_str): +                    continue +                tbr = int_or_none(self._search_regex( +                    r'_(\d{3,})\.mp4', src, 'tbr', default=None)) +                formats.append({ +                    'url': src, +                    'format_id': f.get('name'), +                    'tbr': tbr, +                    'quality': quality(f.get('name')), +                }) +            self._sort_formats(formats) + +            thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) +            duration = int_or_none(item.get('duration') or self._html_search_meta( +                'video:duration', webpage, 'video duration', fatal=False)) +            upload_date = unified_strdate(self._html_search_meta( +                'ya:ovs:upload_date', webpage, 'upload date', default=None)) -        item = self._download_json(playlist_url, display_id)[0] -        video_id = item['id'] -        quality = qualities(('ld', 'sd', 'hd', )) -        formats = [] -        for f in item.get('mbr', []): -            src = f.get('src') -            if not src: -                continue -            fname = f.get('name') -            formats.append({ -                'url': src, -                'format_id': fname, -                'quality': quality(fname), +            entries.append({ +                'id': item.get('id') or uid, +                'thumbnail': thumbnail, +                'title': title, +                'upload_date': upload_date, +                'duration': int_or_none(duration), +                'formats': formats              }) -        self._sort_formats(formats)          title = self._html_search_regex(              (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',               r"'title'\s*:\s*'([^']+)'"), -            webpage, 'title', default=None) or item['title'] +            webpage, 'title', default=None) or self._og_search_title( +            webpage, default=None)          description = self._html_search_regex(              r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>',              webpage, 'description', default=None) or self._html_search_meta( -            'description', webpage, 'description') -        duration = int_or_none(self._html_search_meta( -            'video:duration', webpage, 'video duration', fatal=False)) -        upload_date = unified_strdate(self._html_search_meta( -            'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) +            'description', webpage, 'description', default=None) -        return { -            'id': video_id, -            'thumbnail': item.get('poster') or self._og_search_thumbnail(webpage), -            'title': title, -            'description': description, -            'upload_date': upload_date, -            'duration': int_or_none(duration), -            'formats': formats -        } +        return self.playlist_result(entries, display_id, title, description) | 
