about summary refs log tree commit diff
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2014-01-20 22:11:34 +0100
committer Philipp Hagemeister <phihag@phihag.de> 2014-01-20 22:11:34 +0100
commit 5aafe895fce2a7be9595cb2e56b7bd73a748e6b6 (patch)
tree 092e8e5663aceb9adb7475b5b2d9c9b7ee7a84e6
parent b853d2e1555dbb4a09fe3d7857c6d2bc044646f4 (diff)
download youtube-dl-5aafe895fce2a7be9595cb2e56b7bd73a748e6b6.tar.xz
Correct XML ampersand fixup
-rw-r--r-- test/test_utils.py 14
-rw-r--r-- youtube_dl/extractor/clipsyndicate.py 4
-rw-r--r-- youtube_dl/extractor/metacritic.py 4
-rw-r--r-- youtube_dl/extractor/mtv.py 6
-rw-r--r-- youtube_dl/utils.py 7
5 files changed, 25 insertions, 10 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index bee355ee0..a17483ada 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -16,6 +16,7 @@ from youtube_dl.utils import (
DateRange,
encodeFilename,
find_xpath_attr,
+ fix_xml_ampersands,
get_meta_content,
orderedSet,
parse_duration,
@@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('9:12:43'), 33163)
self.assertEqual(parse_duration('x:y'), None)
+ def test_fix_xml_ampersands(self):
+ self.assertEqual(
+ fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
+ self.assertEqual(
+ fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
+ '"&amp;x=y&amp;wrong;&amp;z=a')
+ self.assertEqual(
+ fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
+ '&amp;&apos;&gt;&lt;&quot;')
+ self.assertEqual(
+ fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
+ self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
+
if __name__ == '__main__':
unittest.main()
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index c60089ad3..9ab6a4ab6 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -3,7 +3,7 @@ import re
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
- fix_xml_all_ampersand,
+ fix_xml_ampersands
)
@@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor):
pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
video_id, u'Downloading video info',
- transform_source=fix_xml_all_ampersand)
+ transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track')
def find_param(name):
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index f3ff0e8bb..465ac4916 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -4,7 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
- fix_xml_all_ampersand,
+ fix_xml_ampersands,
)
@@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&'
info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
- video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand)
+ video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = []
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index f1cf41e2d..c4fa16fb6 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
+ fix_xml_ampersands,
)
def _media_xml_tag(tag):
@@ -83,12 +84,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
- def fix_ampersand(s):
- """ Fix unencoded ampersand in XML """
- return s.replace(u'& ', '&amp; ')
idoc = self._download_xml(
self._FEED_URL + '?' + data, video_id,
- u'Downloading info', transform_source=fix_ampersand)
+ u'Downloading info', transform_source=fix_xml_ampersands)
return [self._get_video_info(item) for item in idoc.findall('.//item')]
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 73fe1ad0a..70f284149 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1092,9 +1092,12 @@ def month_by_name(name):
return None
-def fix_xml_all_ampersand(xml_str):
+def fix_xml_ampersands(xml_str):
"""Replace all the '&' by '&amp;' in XML"""
- return xml_str.replace(u'&', u'&amp;')
+ return re.sub(
+ r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+ u'&amp;',
+ xml_str)
def setproctitle(title):