aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--youtube_dl/extractor/generic.py4
-rw-r--r--youtube_dl/utils.py11
2 files changed, 13 insertions, 2 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a2e5dee0..7666cf207 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
import os
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from .youtube import YoutubeIE
@@ -17,6 +16,7 @@ from ..utils import (
ExtractorError,
HEADRequest,
+ parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
@@ -274,7 +274,7 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed?
try:
- doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
+ doc = parse_xml(webpage)
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
except compat_xml_parse_error:
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index d4abd4031..3943cc9c5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -22,6 +22,7 @@ import struct
import subprocess
import sys
import traceback
+import xml.etree.ElementTree
import zlib
try:
@@ -1267,3 +1268,13 @@ def read_batch_urls(batch_fd):
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and encode the result with ASCII.

    Every positional and keyword argument is forwarded unchanged to
    compat_urllib_parse.urlencode; the resulting query string is then
    encoded using the 'ascii' codec and returned.
    """
    query = compat_urllib_parse.urlencode(*args, **kargs)
    return query.encode('ascii')
+
+
def parse_xml(s):
    """Parse the text string *s* as XML and return the root element.

    *s* must be a text (unicode) string; it is encoded to UTF-8 before
    being handed to the parser.  The tree builder's doctype() hook is a
    no-op, so DOCTYPE declarations are ignored.

    Raises whatever parse error xml.etree.ElementTree.XML raises for
    malformed input.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    data = s.encode('utf-8')

    if sys.version_info < (2, 7):
        # XML() takes no parser argument before Python 2.7, so fall back
        # to the default parser there.
        return xml.etree.ElementTree.XML(data)

    # The bytes passed to the parser are always UTF-8 because we encode
    # them ourselves, whatever encoding the document's own
    # <?xml ... encoding="..."?> declaration claims.  Passing
    # encoding='utf-8' overrides that declaration; otherwise non-ASCII
    # characters in e.g. an ISO-8859-1 document would be mis-decoded.
    parser = xml.etree.ElementTree.XMLParser(
        target=TreeBuilder(), encoding='utf-8')
    return xml.etree.ElementTree.XML(data, parser=parser)