aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>2014-10-29 21:19:20 +0100
committerJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>2014-10-29 21:19:20 +0100
commitd65d6286139fb6e4836113924648e35892f48de3 (patch)
treeb2aaa4139b19426c05d4e69f76eb81fb787b6147
parentac645ac7d0ce240e17f5e26f525e22e2aa71407a (diff)
downloadyoutube-dl-d65d6286139fb6e4836113924648e35892f48de3.tar.xz
[crunchyroll] Fix building of ass subtitles (reported in #4019)
Parse the XML document instead of using regexes; otherwise, Unicode characters are left unescaped.
-rw-r--r--youtube_dl/extractor/crunchyroll.py27
1 files changed, 12 insertions, 15 deletions
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 05b21e872..cc612d08e 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -109,19 +109,17 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
return zlib.decompress(decrypted_data)
- def _convert_subtitles_to_srt(self, subtitles):
+ def _convert_subtitles_to_srt(self, sub_root):
output = ''
- for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
- start = start.replace('.', ',')
- end = end.replace('.', ',')
- text = clean_html(text)
- text = text.replace('\\N', '\n')
- if not text:
- continue
+
+ for i, event in enumerate(sub_root.findall('./events/event'), 1):
+ start = event.attrib['start'].replace('.', ',')
+ end = event.attrib['end'].replace('.', ',')
+ text = event.attrib['text'].replace('\\N', '\n')
output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
return output
- def _convert_subtitles_to_ass(self, subtitles):
+ def _convert_subtitles_to_ass(self, sub_root):
output = ''
def ass_bool(strvalue):
@@ -130,10 +128,6 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
assvalue = '-1'
return assvalue
- sub_root = xml.etree.ElementTree.fromstring(subtitles)
- if not sub_root:
- return output
-
output = '[Script Info]\n'
output += 'Title: %s\n' % sub_root.attrib["title"]
output += 'ScriptType: v4.00+\n'
@@ -270,10 +264,13 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
+ sub_root = xml.etree.ElementTree.fromstring(subtitle)
+ if not sub_root:
+ subtitles[lang_code] = ''
if sub_format == 'ass':
- subtitles[lang_code] = self._convert_subtitles_to_ass(subtitle)
+ subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root)
else:
- subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
+ subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)