diff options
| field | value | date |
|---|---|---|
| author | Sergey M․ <dstftw@gmail.com> | 2018-06-12 01:35:23 +0700 |
| committer | Sergey M․ <dstftw@gmail.com> | 2018-06-12 01:37:34 +0700 |
| commit | a572ae6114deca4e8f2f0365ca7091749f01deaf (patch) | |
| tree | bb60bcc47a73234e0dbbde77ec867ef7b1784d4e | |
| parent | b2df66aecab9faea206cb72e715dfa1394a6d182 (diff) | |
[tvnet] Improve and fix issues (closes #15462)
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 2 |
| -rw-r--r-- | youtube_dl/extractor/tvnet.py | 133 |
| -rw-r--r-- | youtube_dl/extractor/vtv.py | 91 |
3 files changed, 134 insertions, 92 deletions
```diff
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index e6d1fe70e..d4583b8e4 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1139,6 +1139,7 @@ from .tvc import (
 from .tvigle import TvigleIE
 from .tvland import TVLandIE
 from .tvn24 import TVN24IE
+from .tvnet import TVNetIE
 from .tvnoe import TVNoeIE
 from .tvnow import (
     TVNowIE,
@@ -1306,7 +1307,6 @@ from .vrv import (
     VRVSeriesIE,
 )
 from .vshare import VShareIE
-from .vtv import VTVIE
 from .medialaan import MedialaanIE
 from .vube import VubeIE
 from .vuclip import VuClipIE
diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py
new file mode 100644
index 000000000..0ec2da4da
--- /dev/null
+++ b/youtube_dl/extractor/tvnet.py
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class TVNetIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # video
+        'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
+        'md5': 'b4d7abe0252c9b47774760b7519c7558',
+        'info_dict': {
+            'id': '109788',
+            'ext': 'mp4',
+            'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': False,
+            'view_count': int,
+        },
+    }, {
+        # audio
+        'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi',
+        'md5': 'b5875ce9b0a2eecde029216d0e6db2ae',
+        'info_dict': {
+            'id': '27017',
+            'ext': 'm4a',
+            'title': 'VOV1 - Bản tin chiều (10/06/2018)',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': False,
+        },
+    }, {
+        # live stream
+        'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
+        'info_dict': {
+            'id': '1011',
+            'ext': 'mp4',
+            'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # radio live stream
+        'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
+        'info_dict': {
+            'id': '1014',
+            'ext': 'm4a',
+            'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, default=None) or self._search_regex(
+            r'<title>([^<]+)<', webpage, 'title')
+        title = re.sub(r'\s*-\s*TV Net\s*$', '', title)
+
+        if '/video/' in url or '/radio/' in url:
+            is_live = False
+        elif '/kenh-truyen-hinh/' in url:
+            is_live = True
+        else:
+            is_live = None
+
+        data_file = unescapeHTML(self._search_regex(
+            r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
+            'data file', group='url'))
+
+        stream_urls = set()
+        formats = []
+        for stream in self._download_json(data_file, video_id):
+            if not isinstance(stream, dict):
+                continue
+            stream_url = stream.get('url')
+            if (stream_url in stream_urls or not stream_url or
+                    not isinstance(stream_url, compat_str)):
+                continue
+            stream_urls.add(stream_url)
+            formats.extend(self._extract_m3u8_formats(
+                stream_url, video_id, 'mp4',
+                entry_protocol='m3u8' if is_live else 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        # better support for radio streams
+        if title.startswith('VOV'):
+            for f in formats:
+                f.update({
+                    'ext': 'm4a',
+                    'vcodec': 'none',
+                })
+
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or unescapeHTML(
+            self._search_regex(
+                r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
+                'thumbnail', default=None, group='url'))
+
+        if is_live:
+            title = self._live_title(title)
+
+        view_count = int_or_none(self._search_regex(
+            r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>',
+            webpage, 'view count', default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'is_live': is_live,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/vtv.py b/youtube_dl/extractor/vtv.py
deleted file mode 100644
index a9683dd85..000000000
--- a/youtube_dl/extractor/vtv.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-import re
-
-from ..utils import extract_attributes
-
-class VTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(au|ca|cz|de|jp|kr|tw|us|vn)\.tvnet\.gov\.vn/[^/]*/(?P<id>[0-9]+)/?'
-    _TESTS = [{
-        # Livestream. Channel: VTV 1
-        'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
-        'info_dict': {
-            'id': '1011',
-            'ext': 'mp4',
-            'title': r're:^VTV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'thumbnail': r're:https?://.*\.png$',
-        }
-    }, {
-        # Downloading a video.
-        'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
-        'md5': '5263c63d738569ed507980f1e49ebc03',
-        'info_dict': {
-            'id': '109788',
-            'ext': 'mp4',
-            'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang - TV Net',
-            'thumbnail': r're:https?://.*\.JPG$',
-        }
-    }, {
-        # Radio live stream. Channel: VOV 1
-        'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
-        'info_dict': {
-            'id': '1014',
-            'ext': 'm4a',
-            'vcodec': 'none',
-            'title': r're:VOV1 | LiveTV - TV Net [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'thumbnail': r're:https?://.*\.png$',
-        }
-
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title', default=None, fatal=False)
-        if title is None:
-            title = self._og_search_title(webpage)
-        title.strip()
-
-        mediaplayer_div = self._search_regex(r'(<div[^>]*id="mediaplayer"[^>]*>)', webpage, 'mediaplayer element')
-        mediaplayer_div_attributes = extract_attributes(mediaplayer_div)
-
-        thumbnail = mediaplayer_div_attributes.get("data-image")
-
-        json_url = mediaplayer_div_attributes["data-file"]
-        video_streams = self._download_json(json_url, video_id)
-
-
-        # get any working playlist from streams. Currently there's 2 and the first always works,
-        # but you never know in the future
-        for stream in video_streams:
-            formats = self._extract_m3u8_formats(stream.get("url"), video_id, ext="mp4", fatal=False)
-            if formats:
-                break
-
-        # better support radio streams
-        if title.startswith("VOV"):
-            for f in formats:
-                f["ext"] = "m4a"
-                f["vcodec"] = "none"
-
-        if "/video/" in url or "/radio/" in url:
-            is_live = False
-        elif "/kenh-truyen-hinh/" in url:
-            is_live = True
-        else:
-            is_live = None
-
-        if is_live:
-            title = self._live_title(title)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'formats': formats,
-            'is_live': is_live,
-        }
```
