diff options
Diffstat (limited to 'youtube_dl/extractor/crunchyroll.py')
-rw-r--r-- | youtube_dl/extractor/crunchyroll.py | 28 |
1 files changed, 17 insertions, 11 deletions
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f8ce10111..00d943f77 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -5,12 +5,12 @@ import re import json import base64 import zlib -import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, @@ -21,7 +21,9 @@ from ..utils import ( bytes_to_intlist, intlist_to_bytes, int_or_none, + lowercase_escape, remove_end, + sanitized_Request, unified_strdate, urlencode_postdata, xpath_text, @@ -45,7 +47,7 @@ class CrunchyrollBaseIE(InfoExtractor): 'name': username, 'password': password, }) - login_request = compat_urllib_request.Request(login_url, data) + login_request = sanitized_Request(login_url, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') self._download_webpage(login_request, None, False, 'Wrong login info') @@ -54,7 +56,7 @@ class CrunchyrollBaseIE(InfoExtractor): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) - else compat_urllib_request.Request(url_or_request)) + else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues # similar to https://github.com/rg3/youtube-dl/issues/6797. # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction @@ -104,7 +106,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'id': '589804', 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e', + 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', @@ -234,7 +236,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output def _extract_subtitles(self, subtitle): - sub_root = xml.etree.ElementTree.fromstring(subtitle) + sub_root = compat_etree_fromstring(subtitle) return [{ 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root), @@ -287,11 +289,15 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if 'To view this, please log in to verify you are 18 or older.' in webpage: self.raise_login_required() - video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', + webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') - if not video_description: - video_description = None + video_description = self._html_search_regex( + r'<script[^>]*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id, + webpage, 'description', default=None) + if video_description: + video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) @@ -302,7 +308,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_uploader', fatal=False) playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) - playerdata_req = compat_urllib_request.Request(playerdata_url) + playerdata_req = sanitized_Request(playerdata_url) playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') @@ -314,7 +320,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' - streamdata_req = compat_urllib_request.Request( + streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' % (stream_id, stream_format, stream_quality), compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8')) |