diff options
| -rw-r--r-- | test/test_compat.py | 11 | ||||
| -rw-r--r-- | youtube_dl/compat.py | 18 | ||||
| -rw-r--r-- | youtube_dl/extractor/ard.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 4 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 23 | 
5 files changed, 30 insertions, 30 deletions
| diff --git a/test/test_compat.py b/test/test_compat.py index 834f4bc55..b6bfad05e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -74,10 +74,19 @@ class TestCompat(unittest.TestCase):          self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])      def test_compat_etree_fromstring(self): -        xml = '<el foo="bar" spam="中文"></el>' +        xml = ''' +            <root foo="bar" spam="中文"> +                <normal>foo</normal> +                <chinese>中文</chinese> +                <foo><bar>spam</bar></foo> +            </root> +        '''          doc = compat_etree_fromstring(xml.encode('utf-8'))          self.assertTrue(isinstance(doc.attrib['foo'], compat_str))          self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) +        self.assertTrue(isinstance(doc.find('normal').text, compat_str)) +        self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) +        self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f39d4e9a9..2d43ec852 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,9 +216,19 @@ except ImportError:  # Python 2.6  if sys.version_info[0] >= 3:      compat_etree_fromstring = xml.etree.ElementTree.fromstring  else: -    # on python 2.x the the attributes of a node aren't always unicode objects +    # on python 2.x the attributes and text of a node aren't always unicode +    # objects      etree = xml.etree.ElementTree +    try: +        _etree_iter = etree.Element.iter +    except AttributeError:  # Python <=2.6 +        def _etree_iter(root): +            for el in root.findall('*'): +                yield el +                for sub in _etree_iter(el): +                    yield sub +      # on 2.6 XML doesn't have a parser argument, function copied from CPython      # 2.7 source      def _XML(text, parser=None): @@ -235,7 +245,11 @@ else:          return el      def compat_etree_fromstring(text): -        return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) +        doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) +        for el in _etree_iter(doc): +            if el.text is not None and isinstance(el.text, bytes): +                el.text = el.text.decode('utf-8') +        return doc  try:      from urllib.parse import parse_qs as compat_parse_qs diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6f465789b..73be6d204 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -14,8 +14,8 @@ from ..utils import (      parse_duration,      unified_strdate,      xpath_text, -    parse_xml,  ) +from ..compat import compat_etree_fromstring  class ARDMediathekIE(InfoExtractor): @@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor):              raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)          if re.search(r'[\?&]rss($|[=&])', url): -            doc = parse_xml(webpage) +            doc = compat_etree_fromstring(webpage.encode('utf-8'))              if doc.tag == 'rss':                  return GenericIE()._extract_rss(url, video_id, doc) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..1de96b268 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,7 @@ import sys  from .common import InfoExtractor  from .youtube import YoutubeIE  from ..compat import ( +    compat_etree_fromstring,      compat_urllib_parse_unquote,      compat_urllib_request,      compat_urlparse, @@ -21,7 +22,6 @@ from ..utils import (      HEADRequest,      is_html,      orderedSet, -    parse_xml,      smuggle_url,      unescapeHTML,      unified_strdate, @@ -1237,7 +1237,7 @@ class GenericIE(InfoExtractor):          # Is it an RSS feed, a SMIL file or a XSPF playlist?          try: -            doc = parse_xml(webpage) +            doc = compat_etree_fromstring(webpage.encode('utf-8'))              if doc.tag == 'rss':                  return self._extract_rss(url, video_id, doc)              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7d846d680..c761ea22a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'):      return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) -try: -    etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError:  # Python <=2.6 -    etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): -    class TreeBuilder(xml.etree.ElementTree.TreeBuilder): -        def doctype(self, name, pubid, system): -            pass  # Ignore doctypes - -    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) -    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} -    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) -    # Fix up XML parser in Python 2.x -    if sys.version_info < (3, 0): -        for n in etree_iter(tree): -            if n.text is not None: -                if not isinstance(n.text, compat_str): -                    n.text = n.text.decode('utf-8') -    return tree - -  US_RATINGS = {      'G': 0,      'PG': 10, | 
