about | summary | refs | log | tree | commit | diff
path: root/youtube_dl
diff options
context:
space:
mode:
author: Remita Amine <remitamine@gmail.com> 2018-09-01 08:16:28 +0100
committer: Remita Amine <remitamine@gmail.com> 2018-09-01 08:16:41 +0100
commit: 54a5be4dba3560ff51f98865b5598d361a878e82 (patch)
tree: 1df2ca40a364f93dcac60b542f0e656431b743c8 /youtube_dl
parent: ed6919e7371b3da66c10e4c5768816e81f4c5db3 (diff)
[crunchyroll] parse vilos media data(closes #17343)
Diffstat (limited to 'youtube_dl')
-rw-r--r-- youtube_dl/extractor/crunchyroll.py | 205
-rw-r--r-- youtube_dl/extractor/vrv.py | 48
2 files changed, 141 insertions, 112 deletions
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 463f995c7..4ed458372 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -7,7 +7,7 @@ import zlib
from hashlib import sha1
from math import pow, sqrt, floor
-from .common import InfoExtractor
+from .vrv import VRVIE
from ..compat import (
compat_b64decode,
compat_etree_fromstring,
@@ -18,6 +18,8 @@ from ..compat import (
from ..utils import (
ExtractorError,
bytes_to_intlist,
+ extract_attributes,
+ float_or_none,
intlist_to_bytes,
int_or_none,
lowercase_escape,
@@ -26,14 +28,13 @@ from ..utils import (
unified_strdate,
urlencode_postdata,
xpath_text,
- extract_attributes,
)
from ..aes import (
aes_cbc_decrypt,
)
-class CrunchyrollBaseIE(InfoExtractor):
+class CrunchyrollBaseIE(VRVIE):
_LOGIN_URL = 'https://www.crunchyroll.com/login'
_LOGIN_FORM = 'login_form'
_NETRC_MACHINE = 'crunchyroll'
@@ -148,7 +149,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'ext': 'mp4',
'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
'description': 'md5:2d17137920c64f2f49981a7797d275ef',
- 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Yomiuri Telecasting Corporation (YTV)',
'upload_date': '20131013',
'url': 're:(?!.*&amp)',
@@ -221,7 +222,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'info_dict': {
'id': '535080',
'ext': 'mp4',
- 'title': '11eyes Episode 1 – Piros éjszaka - Red Night',
+ 'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka',
'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
'uploader': 'Marvelous AQL Inc.',
'upload_date': '20091021',
@@ -437,13 +438,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
if 'To view this, please log in to verify you are 18 or older.' in webpage:
self.raise_login_required()
+ media = self._parse_json(self._search_regex(
+ r'vilos\.config\.media\s*=\s*({.+?});',
+ webpage, 'vilos media', default='{}'), video_id)
+ media_metadata = media.get('metadata') or {}
+
video_title = self._html_search_regex(
r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
webpage, 'video_title')
video_title = re.sub(r' {2,}', ' ', video_title)
- video_description = self._parse_json(self._html_search_regex(
+ video_description = (self._parse_json(self._html_search_regex(
r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
- webpage, 'description', default='{}'), video_id).get('description')
+ webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
if video_description:
video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
video_upload_date = self._html_search_regex(
@@ -456,91 +462,99 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
[r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
webpage, 'video_uploader', fatal=False)
- available_fmts = []
- for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
- attrs = extract_attributes(a)
- href = attrs.get('href')
- if href and '/freetrial' in href:
- continue
- available_fmts.append(fmt)
- if not available_fmts:
- for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
- available_fmts = re.findall(p, webpage)
- if available_fmts:
- break
- video_encode_ids = []
formats = []
- for fmt in available_fmts:
- stream_quality, stream_format = self._FORMAT_IDS[fmt]
- video_format = fmt + 'p'
- stream_infos = []
- streamdata = self._call_rpc_api(
- 'VideoPlayer_GetStandardConfig', video_id,
- 'Downloading media info for %s' % video_format, data={
- 'media_id': video_id,
- 'video_format': stream_format,
- 'video_quality': stream_quality,
- 'current_page': url,
- })
- if streamdata is not None:
- stream_info = streamdata.find('./{default}preload/stream_info')
+ for stream in media.get('streams', []):
+ formats.extend(self._extract_vrv_formats(
+ stream.get('url'), video_id, stream.get('format'),
+ stream.get('audio_lang'), stream.get('hardsub_lang')))
+ if not formats:
+ available_fmts = []
+ for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
+ attrs = extract_attributes(a)
+ href = attrs.get('href')
+ if href and '/freetrial' in href:
+ continue
+ available_fmts.append(fmt)
+ if not available_fmts:
+ for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+ available_fmts = re.findall(p, webpage)
+ if available_fmts:
+ break
+ if not available_fmts:
+ available_fmts = self._FORMAT_IDS.keys()
+ video_encode_ids = []
+
+ for fmt in available_fmts:
+ stream_quality, stream_format = self._FORMAT_IDS[fmt]
+ video_format = fmt + 'p'
+ stream_infos = []
+ streamdata = self._call_rpc_api(
+ 'VideoPlayer_GetStandardConfig', video_id,
+ 'Downloading media info for %s' % video_format, data={
+ 'media_id': video_id,
+ 'video_format': stream_format,
+ 'video_quality': stream_quality,
+ 'current_page': url,
+ })
+ if streamdata is not None:
+ stream_info = streamdata.find('./{default}preload/stream_info')
+ if stream_info is not None:
+ stream_infos.append(stream_info)
+ stream_info = self._call_rpc_api(
+ 'VideoEncode_GetStreamInfo', video_id,
+ 'Downloading stream info for %s' % video_format, data={
+ 'media_id': video_id,
+ 'video_format': stream_format,
+ 'video_encode_quality': stream_quality,
+ })
if stream_info is not None:
stream_infos.append(stream_info)
- stream_info = self._call_rpc_api(
- 'VideoEncode_GetStreamInfo', video_id,
- 'Downloading stream info for %s' % video_format, data={
- 'media_id': video_id,
- 'video_format': stream_format,
- 'video_encode_quality': stream_quality,
- })
- if stream_info is not None:
- stream_infos.append(stream_info)
- for stream_info in stream_infos:
- video_encode_id = xpath_text(stream_info, './video_encode_id')
- if video_encode_id in video_encode_ids:
- continue
- video_encode_ids.append(video_encode_id)
-
- video_file = xpath_text(stream_info, './file')
- if not video_file:
- continue
- if video_file.startswith('http'):
- formats.extend(self._extract_m3u8_formats(
- video_file, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- continue
+ for stream_info in stream_infos:
+ video_encode_id = xpath_text(stream_info, './video_encode_id')
+ if video_encode_id in video_encode_ids:
+ continue
+ video_encode_ids.append(video_encode_id)
- video_url = xpath_text(stream_info, './host')
- if not video_url:
- continue
- metadata = stream_info.find('./metadata')
- format_info = {
- 'format': video_format,
- 'height': int_or_none(xpath_text(metadata, './height')),
- 'width': int_or_none(xpath_text(metadata, './width')),
- }
-
- if '.fplive.net/' in video_url:
- video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
- parsed_video_url = compat_urlparse.urlparse(video_url)
- direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
- netloc='v.lvlt.crcdn.net',
- path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
- if self._is_valid_url(direct_video_url, video_id, video_format):
- format_info.update({
- 'format_id': 'http-' + video_format,
- 'url': direct_video_url,
- })
- formats.append(format_info)
+ video_file = xpath_text(stream_info, './file')
+ if not video_file:
+ continue
+ if video_file.startswith('http'):
+ formats.extend(self._extract_m3u8_formats(
+ video_file, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
continue
- format_info.update({
- 'format_id': 'rtmp-' + video_format,
- 'url': video_url,
- 'play_path': video_file,
- 'ext': 'flv',
- })
- formats.append(format_info)
+ video_url = xpath_text(stream_info, './host')
+ if not video_url:
+ continue
+ metadata = stream_info.find('./metadata')
+ format_info = {
+ 'format': video_format,
+ 'height': int_or_none(xpath_text(metadata, './height')),
+ 'width': int_or_none(xpath_text(metadata, './width')),
+ }
+
+ if '.fplive.net/' in video_url:
+ video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+ parsed_video_url = compat_urlparse.urlparse(video_url)
+ direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+ netloc='v.lvlt.crcdn.net',
+ path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
+ if self._is_valid_url(direct_video_url, video_id, video_format):
+ format_info.update({
+ 'format_id': 'http-' + video_format,
+ 'url': direct_video_url,
+ })
+ formats.append(format_info)
+ continue
+
+ format_info.update({
+ 'format_id': 'rtmp-' + video_format,
+ 'url': video_url,
+ 'play_path': video_file,
+ 'ext': 'flv',
+ })
+ formats.append(format_info)
self._sort_formats(formats, ('height', 'width', 'tbr', 'fps'))
metadata = self._call_rpc_api(
@@ -549,7 +563,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'media_id': video_id,
})
- subtitles = self.extract_subtitles(video_id, webpage)
+ subtitles = {}
+ for subtitle in media.get('subtitles', []):
+ subtitle_url = subtitle.get('url')
+ if not subtitle_url:
+ continue
+ subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
+ 'url': subtitle_url,
+ 'ext': subtitle.get('format', 'ass'),
+ })
+ if not subtitles:
+ subtitles = self.extract_subtitles(video_id, webpage)
# webpage provide more accurate data than series_title from XML
series = self._html_search_regex(
@@ -557,8 +581,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
webpage, 'series', fatal=False)
season = xpath_text(metadata, 'series_title')
- episode = xpath_text(metadata, 'episode_title')
- episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
+ episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title')
+ episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number'))
season_number = int_or_none(self._search_regex(
r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
@@ -568,7 +592,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'id': video_id,
'title': video_title,
'description': video_description,
- 'thumbnail': xpath_text(metadata, 'episode_image_url'),
+ 'duration': float_or_none(media_metadata.get('duration'), 1000),
+ 'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'),
'uploader': video_uploader,
'upload_date': video_upload_date,
'series': series,
diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py
index 64b13f0ed..921e9e172 100644
--- a/youtube_dl/extractor/vrv.py
+++ b/youtube_dl/extractor/vrv.py
@@ -72,7 +72,7 @@ class VRVBaseIE(InfoExtractor):
class VRVIE(VRVBaseIE):
IE_NAME = 'vrv'
_VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
'info_dict': {
'id': 'GR9PNZ396',
@@ -85,7 +85,28 @@ class VRVIE(VRVBaseIE):
# m3u8 download
'skip_download': True,
},
- }
+ }]
+
+ def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
+ if not url or stream_format not in ('hls', 'dash'):
+ return []
+ stream_id = hardsub_lang or audio_lang
+ format_id = '%s-%s' % (stream_format, stream_id)
+ if stream_format == 'hls':
+ adaptive_formats = self._extract_m3u8_formats(
+ url, video_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s m3u8 information' % stream_id,
+ fatal=False)
+ elif stream_format == 'dash':
+ adaptive_formats = self._extract_mpd_formats(
+ url, video_id, mpd_id=format_id,
+ note='Downloading %s MPD information' % stream_id,
+ fatal=False)
+ if audio_lang:
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_lang
+ return adaptive_formats
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -115,26 +136,9 @@ class VRVIE(VRVBaseIE):
for stream_type, streams in streams_json.get('streams', {}).items():
if stream_type in ('adaptive_hls', 'adaptive_dash'):
for stream in streams.values():
- stream_url = stream.get('url')
- if not stream_url:
- continue
- stream_id = stream.get('hardsub_locale') or audio_locale
- format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
- if stream_type == 'adaptive_hls':
- adaptive_formats = self._extract_m3u8_formats(
- stream_url, video_id, 'mp4', m3u8_id=format_id,
- note='Downloading %s m3u8 information' % stream_id,
- fatal=False)
- else:
- adaptive_formats = self._extract_mpd_formats(
- stream_url, video_id, mpd_id=format_id,
- note='Downloading %s MPD information' % stream_id,
- fatal=False)
- if audio_locale:
- for f in adaptive_formats:
- if f.get('acodec') != 'none':
- f['language'] = audio_locale
- formats.extend(adaptive_formats)
+ formats.extend(self._extract_vrv_formats(
+ stream.get('url'), video_id, stream_type.split('_')[1],
+ audio_locale, stream.get('hardsub_locale')))
self._sort_formats(formats)
subtitles = {}