From f78546272cf7c4b10c8003870728ab69bec982fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 26 Oct 2015 16:41:24 +0100 Subject: [compat] compat_etree_fromstring: also decode the text attribute Deletes parse_xml from utils, because compat_etree_fromstring now does the same thing. --- youtube_dl/compat.py | 18 ++++++++++++++++-- youtube_dl/extractor/ard.py | 4 ++-- youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/utils.py | 23 ----------------------- 4 files changed, 20 insertions(+), 29 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f39d4e9a9..2d43ec852 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,9 +216,19 @@ except ImportError: # Python 2.6 if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the the attributes of a node aren't always unicode objects + # on python 2.x the attributes and text of a node aren't always unicode + # objects etree = xml.etree.ElementTree + try: + _etree_iter = etree.Element.iter + except AttributeError: # Python <=2.6 + def _etree_iter(root): + for el in root.findall('*'): + yield el + for sub in _etree_iter(el): + yield sub + # on 2.6 XML doesn't have a parser argument, function copied from CPython # 2.7 source def _XML(text, parser=None): @@ -235,7 +245,11 @@ else: return el def compat_etree_fromstring(text): - return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + for el in _etree_iter(doc): + if el.text is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc try: from urllib.parse import parse_qs as compat_parse_qs diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6f465789b..73be6d204 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ 
-14,8 +14,8 @@ from ..utils import ( parse_duration, unified_strdate, xpath_text, - parse_xml, ) +from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): @@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor): raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True) if re.search(r'[\?&]rss($|[=&])', url): - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..1de96b268 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,7 @@ import sys from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( + compat_etree_fromstring, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, @@ -21,7 +22,6 @@ from ..utils import ( HEADRequest, is_html, orderedSet, - parse_xml, smuggle_url, unescapeHTML, unified_strdate, @@ -1237,7 +1237,7 @@ class GenericIE(InfoExtractor): # Is it an RSS feed, a SMIL file or a XSPF playlist? 
try: - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7d846d680..c761ea22a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'): return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes - - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree - - US_RATINGS = { 'G': 0, 'PG': 10, -- cgit v1.2.3