diff options
| -rw-r--r-- | youtube_dl/extractor/appletrailers.py | 23 | ||||
| -rw-r--r-- | youtube_dl/extractor/clipsyndicate.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/metacritic.py | 9 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 5 | 
4 files changed, 25 insertions, 22 deletions
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a527f10de..ef5644aa5 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,5 +1,4 @@  import re -import xml.etree.ElementTree  import json  from .common import InfoExtractor @@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):          uploader_id = mobj.group('company')          playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') -        playlist_snippet = self._download_webpage(playlist_url, movie) -        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) -        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) -        # The ' in the onClick attributes are not escaped, it couldn't be parsed -        # with xml.etree.ElementTree.fromstring -        # like: http://trailers.apple.com/trailers/wb/gravity/ -        def _clean_json(m): -            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') -        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) -        playlist_html = u'<html>' + playlist_cleaned + u'</html>' +        def fix_html(s): +            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s) +            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) +            # The ' in the onClick attributes are not escaped, it couldn't be parsed +            # like: http://trailers.apple.com/trailers/wb/gravity/ +            def _clean_json(m): +                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') +            s = re.sub(self._JSON_RE, _clean_json, s) +            s = u'<html>' + s + u'</html>' +            return s +        doc = self._download_xml(playlist_url, movie, transform_source=fix_html) -        doc = xml.etree.ElementTree.fromstring(playlist_html)          playlist = []          for li in doc.findall('./div/ul/li'):              on_click = 
li.find('.//a').attrib['onClick'] diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d4fc86973..c60089ad3 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,9 +1,9 @@  import re -import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import (      find_xpath_attr, +    fix_xml_all_ampersand,  ) @@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):          # it includes a required token          flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') -        playlist_page = self._download_webpage( +        pdoc = self._download_xml(              'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, -            video_id, u'Downloading video info')  -        # Fix broken xml -        playlist_page = re.sub('&', '&amp;', playlist_page) -        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) +            video_id, u'Downloading video info', +            transform_source=fix_xml_all_ampersand)           track_doc = pdoc.find('trackList/track')          def find_param(name): diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 6b95b4998..e560c1d35 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -1,8 +1,10 @@  import re -import xml.etree.ElementTree  import operator  from .common import InfoExtractor +from ..utils import ( +    fix_xml_all_ampersand, +)  class MetacriticIE(InfoExtractor): @@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id)          # The xml is not well formatted, there are raw '&' -        info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, -            video_id, u'Downloading info xml').replace('&', '&amp;') -        info = 
xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) +        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, +            video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)          clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)          formats = [] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0dab9fcc5..4593488ce 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1057,3 +1057,8 @@ def month_by_name(name):          return ENGLISH_NAMES.index(name) + 1      except ValueError:          return None + + +def fix_xml_all_ampersand(xml_str): +    """Replace all the '&' by '&amp;' in XML""" +    return xml_str.replace(u'&', u'&amp;')  | 
