aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2013-12-10 12:45:22 +0100
committer Philipp Hagemeister <phihag@phihag.de> 2013-12-10 12:45:22 +0100
commit e2b38da93112c97d46d612bf89c329b22ac2d00d (patch)
tree 1bf6d351696e3cff54a3b247f0c842b04dcb1cbc
parent a30a60d8eb027a55ec14c912bad4359b3128997e (diff)
download youtube-dl-e2b38da93112c97d46d612bf89c329b22ac2d00d.tar.xz
[mtv] Fixup incorrectly encoded XML documents
-rw-r--r-- youtube_dl/extractor/common.py 5
-rw-r--r-- youtube_dl/extractor/mtv.py 9
2 files changed, 11 insertions, 3 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 534908a2b..69a083b68 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -230,9 +230,12 @@ class InfoExtractor(object):
return content
def _download_xml(self, url_or_request, video_id,
- note=u'Downloading XML', errnote=u'Unable to download XML'):
+ note=u'Downloading XML', errnote=u'Unable to download XML',
+ transform_source=None):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+ if transform_source:
+ xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def to_screen(self, msg):
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 6b3feb560..5b2bd9633 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
- idoc = self._download_xml(self._FEED_URL +'?' + data, video_id,
- u'Downloading info')
+
+ def fix_ampersand(s):
+ """ Fix unencoded ampersand in XML """
+ return s.replace(u'& ', '&amp; ')
+ idoc = self._download_xml(
+ self._FEED_URL + '?' + data, video_id,
+ u'Downloading info', transform_source=fix_ampersand)
return [self._get_video_info(item) for item in idoc.findall('.//item')]