diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-01-20 22:11:34 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-01-20 22:11:34 +0100 | 
| commit | 5aafe895fce2a7be9595cb2e56b7bd73a748e6b6 (patch) | |
| tree | 092e8e5663aceb9adb7475b5b2d9c9b7ee7a84e6 | |
| parent | b853d2e1555dbb4a09fe3d7857c6d2bc044646f4 (diff) | |
Correct XML ampersand fixup
| -rw-r--r-- | test/test_utils.py | 14 | ||||
| -rw-r--r-- | youtube_dl/extractor/clipsyndicate.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/metacritic.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/mtv.py | 6 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 7 | 
5 files changed, 25 insertions, 10 deletions
| diff --git a/test/test_utils.py b/test/test_utils.py index bee355ee0..a17483ada 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -16,6 +16,7 @@ from youtube_dl.utils import (      DateRange,      encodeFilename,      find_xpath_attr, +    fix_xml_ampersands,      get_meta_content,      orderedSet,      parse_duration, @@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase):          self.assertEqual(parse_duration('9:12:43'), 33163)          self.assertEqual(parse_duration('x:y'), None) +    def test_fix_xml_ampersands(self): +        self.assertEqual( +            fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a') +        self.assertEqual( +            fix_xml_ampersands('"&amp;x=y&wrong;&z=a'), +            '"&amp;x=y&amp;wrong;&amp;z=a') +        self.assertEqual( +            fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'), +            '&amp;&apos;&gt;&lt;&quot;') +        self.assertEqual( +            fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;') +        self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#') +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index c60089ad3..9ab6a4ab6 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -3,7 +3,7 @@ import re  from .common import InfoExtractor  from ..utils import (      find_xpath_attr, -    fix_xml_all_ampersand, +    fix_xml_ampersands  ) @@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor):          pdoc = self._download_xml(              'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,              video_id, u'Downloading video info', -            transform_source=fix_xml_all_ampersand)  +            transform_source=fix_xml_ampersands)          track_doc = pdoc.find('trackList/track')          def find_param(name): diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index f3ff0e8bb..465ac4916 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ 
-4,7 +4,7 @@ import re  from .common import InfoExtractor  from ..utils import ( -    fix_xml_all_ampersand, +    fix_xml_ampersands,  ) @@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          # The xml is not well formatted, there are raw '&'          info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, -            video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand) +            video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)          clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)          formats = [] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f1cf41e2d..c4fa16fb6 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -5,6 +5,7 @@ from .common import InfoExtractor  from ..utils import (      compat_urllib_parse,      ExtractorError, +    fix_xml_ampersands,  )  def _media_xml_tag(tag): @@ -83,12 +84,9 @@ class MTVServicesInfoExtractor(InfoExtractor):          video_id = self._id_from_uri(uri)          data = compat_urllib_parse.urlencode({'uri': uri}) -        def fix_ampersand(s): -            """ Fix unencoded ampersand in XML """ -            return s.replace(u'& ', '&amp; ')          idoc = self._download_xml(              self._FEED_URL + '?' 
+ data, video_id, -            u'Downloading info', transform_source=fix_ampersand) +            u'Downloading info', transform_source=fix_xml_ampersands)          return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 73fe1ad0a..70f284149 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1092,9 +1092,12 @@ def month_by_name(name):          return None -def fix_xml_all_ampersand(xml_str): +def fix_xml_ampersands(xml_str):      """Replace all the '&' by '&amp;' in XML""" -    return xml_str.replace(u'&', u'&amp;') +    return re.sub( +        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', +        u'&amp;', +        xml_str)  def setproctitle(title): | 
