diff options
| -rw-r--r-- | youtube_dl/extractor/generic.py | 4 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 11 | 
2 files changed, 13 insertions, 2 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a2e5dee0..7666cf207 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 
 import os
 import re
-import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from .youtube import YoutubeIE
@@ -17,6 +16,7 @@ from ..utils import (
 
     ExtractorError,
     HEADRequest,
+    parse_xml,
     smuggle_url,
     unescapeHTML,
     unified_strdate,
@@ -274,7 +274,7 @@ class GenericIE(InfoExtractor):
 
         # Is it an RSS feed?
         try:
-            doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
+            doc = parse_xml(webpage)
             if doc.tag == 'rss':
                 return self._extract_rss(url, video_id, doc)
         except compat_xml_parse_error:
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index d4abd4031..3943cc9c5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -22,6 +22,7 @@ import struct
 import subprocess
 import sys
 import traceback
+import xml.etree.ElementTree
 import zlib
 
 try:
@@ -1267,3 +1268,13 @@ def read_batch_urls(batch_fd):
 
 def urlencode_postdata(*args, **kargs):
     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+
+
+def parse_xml(s):
+    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
+        def doctype(self, name, pubid, system):
+            pass  # Ignore doctypes
+
+    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
+    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
+    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
