diff options
| author | Remita Amine <remitamine@gmail.com> | 2018-09-01 08:16:28 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2018-09-01 08:16:41 +0100 | 
| commit | 54a5be4dba3560ff51f98865b5598d361a878e82 (patch) | |
| tree | 1df2ca40a364f93dcac60b542f0e656431b743c8 | |
| parent | ed6919e7371b3da66c10e4c5768816e81f4c5db3 (diff) | |
[crunchyroll] parse vilos media data(closes #17343)
| -rw-r--r-- | youtube_dl/extractor/crunchyroll.py | 205 | ||||
| -rw-r--r-- | youtube_dl/extractor/vrv.py | 48 | 
2 files changed, 141 insertions, 112 deletions
| diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 463f995c7..4ed458372 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -7,7 +7,7 @@ import zlib  from hashlib import sha1  from math import pow, sqrt, floor -from .common import InfoExtractor +from .vrv import VRVIE  from ..compat import (      compat_b64decode,      compat_etree_fromstring, @@ -18,6 +18,8 @@ from ..compat import (  from ..utils import (      ExtractorError,      bytes_to_intlist, +    extract_attributes, +    float_or_none,      intlist_to_bytes,      int_or_none,      lowercase_escape, @@ -26,14 +28,13 @@ from ..utils import (      unified_strdate,      urlencode_postdata,      xpath_text, -    extract_attributes,  )  from ..aes import (      aes_cbc_decrypt,  ) -class CrunchyrollBaseIE(InfoExtractor): +class CrunchyrollBaseIE(VRVIE):      _LOGIN_URL = 'https://www.crunchyroll.com/login'      _LOGIN_FORM = 'login_form'      _NETRC_MACHINE = 'crunchyroll' @@ -148,7 +149,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):              'ext': 'mp4',              'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',              'description': 'md5:2d17137920c64f2f49981a7797d275ef', -            'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', +            'thumbnail': r're:^https?://.*\.jpg$',              'uploader': 'Yomiuri Telecasting Corporation (YTV)',              'upload_date': '20131013',              'url': 're:(?!.*&)', @@ -221,7 +222,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):          'info_dict': {              'id': '535080',              'ext': 'mp4', -            'title': '11eyes Episode 1 – Piros éjszaka - Red Night', +            'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka',              'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',              'uploader': 'Marvelous AQL Inc.',              'upload_date': '20091021', @@ -437,13 +438,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          if 'To view this, please log in to verify you are 18 or older.' in webpage:              self.raise_login_required() +        media = self._parse_json(self._search_regex( +            r'vilos\.config\.media\s*=\s*({.+?});', +            webpage, 'vilos media', default='{}'), video_id) +        media_metadata = media.get('metadata') or {} +          video_title = self._html_search_regex(              r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',              webpage, 'video_title')          video_title = re.sub(r' {2,}', ' ', video_title) -        video_description = self._parse_json(self._html_search_regex( +        video_description = (self._parse_json(self._html_search_regex(              r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, -            webpage, 'description', default='{}'), video_id).get('description') +            webpage, 'description', default='{}'), video_id) or media_metadata).get('description')          if video_description:              video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))          video_upload_date = self._html_search_regex( @@ -456,91 +462,99 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],              webpage, 'video_uploader', fatal=False) -        available_fmts = [] -        for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): -            attrs = extract_attributes(a) -            href = attrs.get('href') -            if href and '/freetrial' in href: -                continue -            available_fmts.append(fmt) -        if not available_fmts: -            for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): -                available_fmts = re.findall(p, webpage) -                if available_fmts: -                    break -        video_encode_ids = []          formats = [] -        for fmt in available_fmts: -            stream_quality, stream_format = self._FORMAT_IDS[fmt] -            video_format = fmt + 'p' -            stream_infos = [] -            streamdata = self._call_rpc_api( -                'VideoPlayer_GetStandardConfig', video_id, -                'Downloading media info for %s' % video_format, data={ -                    'media_id': video_id, -                    'video_format': stream_format, -                    'video_quality': stream_quality, -                    'current_page': url, -                }) -            if streamdata is not None: -                stream_info = streamdata.find('./{default}preload/stream_info') +        for stream in media.get('streams', []): +            formats.extend(self._extract_vrv_formats( +                stream.get('url'), video_id, stream.get('format'), +                stream.get('audio_lang'), stream.get('hardsub_lang'))) +        if not formats: +            available_fmts = [] +            for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): +                attrs = extract_attributes(a) +                href = attrs.get('href') +                if href and '/freetrial' in href: +                    continue +                available_fmts.append(fmt) +            if not available_fmts: +                for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): +                    available_fmts = re.findall(p, webpage) +                    if available_fmts: +                        break +            if not available_fmts: +                available_fmts = self._FORMAT_IDS.keys() +            video_encode_ids = [] + +            for fmt in available_fmts: +                stream_quality, stream_format = self._FORMAT_IDS[fmt] +                video_format = fmt + 'p' +                stream_infos = [] +                streamdata = self._call_rpc_api( +                    'VideoPlayer_GetStandardConfig', video_id, +                    'Downloading media info for %s' % video_format, data={ +                        'media_id': video_id, +                        'video_format': stream_format, +                        'video_quality': stream_quality, +                        'current_page': url, +                    }) +                if streamdata is not None: +                    stream_info = streamdata.find('./{default}preload/stream_info') +                    if stream_info is not None: +                        stream_infos.append(stream_info) +                stream_info = self._call_rpc_api( +                    'VideoEncode_GetStreamInfo', video_id, +                    'Downloading stream info for %s' % video_format, data={ +                        'media_id': video_id, +                        'video_format': stream_format, +                        'video_encode_quality': stream_quality, +                    })                  if stream_info is not None:                      stream_infos.append(stream_info) -            stream_info = self._call_rpc_api( -                'VideoEncode_GetStreamInfo', video_id, -                'Downloading stream info for %s' % video_format, data={ -                    'media_id': video_id, -                    'video_format': stream_format, -                    'video_encode_quality': stream_quality, -                }) -            if stream_info is not None: -                stream_infos.append(stream_info) -            for stream_info in stream_infos: -                video_encode_id = xpath_text(stream_info, './video_encode_id') -                if video_encode_id in video_encode_ids: -                    continue -                video_encode_ids.append(video_encode_id) - -                video_file = xpath_text(stream_info, './file') -                if not video_file: -                    continue -                if video_file.startswith('http'): -                    formats.extend(self._extract_m3u8_formats( -                        video_file, video_id, 'mp4', entry_protocol='m3u8_native', -                        m3u8_id='hls', fatal=False)) -                    continue +                for stream_info in stream_infos: +                    video_encode_id = xpath_text(stream_info, './video_encode_id') +                    if video_encode_id in video_encode_ids: +                        continue +                    video_encode_ids.append(video_encode_id) -                video_url = xpath_text(stream_info, './host') -                if not video_url: -                    continue -                metadata = stream_info.find('./metadata') -                format_info = { -                    'format': video_format, -                    'height': int_or_none(xpath_text(metadata, './height')), -                    'width': int_or_none(xpath_text(metadata, './width')), -                } - -                if '.fplive.net/' in video_url: -                    video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) -                    parsed_video_url = compat_urlparse.urlparse(video_url) -                    direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( -                        netloc='v.lvlt.crcdn.net', -                        path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) -                    if self._is_valid_url(direct_video_url, video_id, video_format): -                        format_info.update({ -                            'format_id': 'http-' + video_format, -                            'url': direct_video_url, -                        }) -                        formats.append(format_info) +                    video_file = xpath_text(stream_info, './file') +                    if not video_file: +                        continue +                    if video_file.startswith('http'): +                        formats.extend(self._extract_m3u8_formats( +                            video_file, video_id, 'mp4', entry_protocol='m3u8_native', +                            m3u8_id='hls', fatal=False))                          continue -                format_info.update({ -                    'format_id': 'rtmp-' + video_format, -                    'url': video_url, -                    'play_path': video_file, -                    'ext': 'flv', -                }) -                formats.append(format_info) +                    video_url = xpath_text(stream_info, './host') +                    if not video_url: +                        continue +                    metadata = stream_info.find('./metadata') +                    format_info = { +                        'format': video_format, +                        'height': int_or_none(xpath_text(metadata, './height')), +                        'width': int_or_none(xpath_text(metadata, './width')), +                    } + +                    if '.fplive.net/' in video_url: +                        video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) +                        parsed_video_url = compat_urlparse.urlparse(video_url) +                        direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( +                            netloc='v.lvlt.crcdn.net', +                            path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) +                        if self._is_valid_url(direct_video_url, video_id, video_format): +                            format_info.update({ +                                'format_id': 'http-' + video_format, +                                'url': direct_video_url, +                            }) +                            formats.append(format_info) +                            continue + +                    format_info.update({ +                        'format_id': 'rtmp-' + video_format, +                        'url': video_url, +                        'play_path': video_file, +                        'ext': 'flv', +                    }) +                    formats.append(format_info)          self._sort_formats(formats, ('height', 'width', 'tbr', 'fps'))          metadata = self._call_rpc_api( @@ -549,7 +563,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text                  'media_id': video_id,              }) -        subtitles = self.extract_subtitles(video_id, webpage) +        subtitles = {} +        for subtitle in media.get('subtitles', []): +            subtitle_url = subtitle.get('url') +            if not subtitle_url: +                continue +            subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ +                'url': subtitle_url, +                'ext': subtitle.get('format', 'ass'), +            }) +        if not subtitles: +            subtitles = self.extract_subtitles(video_id, webpage)          # webpage provide more accurate data than series_title from XML          series = self._html_search_regex( @@ -557,8 +581,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              webpage, 'series', fatal=False)          season = xpath_text(metadata, 'series_title') -        episode = xpath_text(metadata, 'episode_title') -        episode_number = int_or_none(xpath_text(metadata, 'episode_number')) +        episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title') +        episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number'))          season_number = int_or_none(self._search_regex(              r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', @@ -568,7 +592,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              'id': video_id,              'title': video_title,              'description': video_description, -            'thumbnail': xpath_text(metadata, 'episode_image_url'), +            'duration': float_or_none(media_metadata.get('duration'), 1000), +            'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'),              'uploader': video_uploader,              'upload_date': video_upload_date,              'series': series, diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 64b13f0ed..921e9e172 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -72,7 +72,7 @@ class VRVBaseIE(InfoExtractor):  class VRVIE(VRVBaseIE):      IE_NAME = 'vrv'      _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' -    _TEST = { +    _TESTS = [{          'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',          'info_dict': {              'id': 'GR9PNZ396', @@ -85,7 +85,28 @@ class VRVIE(VRVBaseIE):              # m3u8 download              'skip_download': True,          }, -    } +    }] + +    def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): +        if not url or stream_format not in ('hls', 'dash'): +            return [] +        stream_id = hardsub_lang or audio_lang +        format_id = '%s-%s' % (stream_format, stream_id) +        if stream_format == 'hls': +            adaptive_formats = self._extract_m3u8_formats( +                url, video_id, 'mp4', m3u8_id=format_id, +                note='Downloading %s m3u8 information' % stream_id, +                fatal=False) +        elif stream_format == 'dash': +            adaptive_formats = self._extract_mpd_formats( +                url, video_id, mpd_id=format_id, +                note='Downloading %s MPD information' % stream_id, +                fatal=False) +        if audio_lang: +            for f in adaptive_formats: +                if f.get('acodec') != 'none': +                    f['language'] = audio_lang +        return adaptive_formats      def _real_extract(self, url):          video_id = self._match_id(url) @@ -115,26 +136,9 @@ class VRVIE(VRVBaseIE):          for stream_type, streams in streams_json.get('streams', {}).items():              if stream_type in ('adaptive_hls', 'adaptive_dash'):                  for stream in streams.values(): -                    stream_url = stream.get('url') -                    if not stream_url: -                        continue -                    stream_id = stream.get('hardsub_locale') or audio_locale -                    format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) -                    if stream_type == 'adaptive_hls': -                        adaptive_formats = self._extract_m3u8_formats( -                            stream_url, video_id, 'mp4', m3u8_id=format_id, -                            note='Downloading %s m3u8 information' % stream_id, -                            fatal=False) -                    else: -                        adaptive_formats = self._extract_mpd_formats( -                            stream_url, video_id, mpd_id=format_id, -                            note='Downloading %s MPD information' % stream_id, -                            fatal=False) -                    if audio_locale: -                        for f in adaptive_formats: -                            if f.get('acodec') != 'none': -                                f['language'] = audio_locale -                    formats.extend(adaptive_formats) +                    formats.extend(self._extract_vrv_formats( +                        stream.get('url'), video_id, stream_type.split('_')[1], +                        audio_locale, stream.get('hardsub_locale')))          self._sort_formats(formats)          subtitles = {} | 
