[utils] Use bytes-like objects in dfxp2srt

This fixes handling of non-UTF8 TTML subtitles Closes #14191
author: Yen Chi Hsuan <yan12125@gmail.com> 2017-09-16 12:18:38 +0800
committer: Yen Chi Hsuan <yan12125@gmail.com> 2017-09-16 12:18:38 +0800
commit: 3869028ffb6be6ab719e5cf1004276dfdfd1216d (patch)
tree: 86301ec756a57fea9248042be358a42b6af6d7c2 /youtube_dl
parent: 68d43a61b552007a718894967b869c0f1d8ff00f (diff)
2 files changed, 12 insertions, 8 deletions
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 51256a3fb..f71d413b5 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                 dfxp_file = old_file
                 srt_file = subtitles_filename(filename, lang, 'srt')
 
-                with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+                with open(dfxp_file, 'rb') as f:
                     srt_data = dfxp2srt(f.read())
 
                 with io.open(srt_file, 'wt', encoding='utf-8') as f:
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 9e4492d40..b724e0b70 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds):
 
 
 def dfxp2srt(dfxp_data):
+    '''
+    @param dfxp_data A bytes-like object containing DFXP data
+    @returns A unicode object containing converted SRT data
+    '''
     LEGACY_NAMESPACES = (
-        ('http://www.w3.org/ns/ttml', [
-            'http://www.w3.org/2004/11/ttaf1',
-            'http://www.w3.org/2006/04/ttaf1',
-            'http://www.w3.org/2006/10/ttaf1',
+        (b'http://www.w3.org/ns/ttml', [
+            b'http://www.w3.org/2004/11/ttaf1',
+            b'http://www.w3.org/2006/04/ttaf1',
+            b'http://www.w3.org/2006/10/ttaf1',
         ]),
-        ('http://www.w3.org/ns/ttml#styling', [
-            'http://www.w3.org/ns/ttml#style',
+        (b'http://www.w3.org/ns/ttml#styling', [
+            b'http://www.w3.org/ns/ttml#style',
         ]),
     )
 
@@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data):
         for ns in v:
             dfxp_data = dfxp_data.replace(ns, k)
 
-    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
+    dfxp = compat_etree_fromstring(dfxp_data)
     out = []
     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
author	Yen Chi Hsuan <yan12125@gmail.com>	2017-09-16 12:18:38 +0800
committer	Yen Chi Hsuan <yan12125@gmail.com>	2017-09-16 12:18:38 +0800
commit	3869028ffb6be6ab719e5cf1004276dfdfd1216d (patch)
tree	86301ec756a57fea9248042be358a42b6af6d7c2 /youtube_dl
parent	68d43a61b552007a718894967b869c0f1d8ff00f (diff)