diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/appletrailers.py | 23 | ||||
| -rw-r--r-- | youtube_dl/extractor/clipsyndicate.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/daum.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/metacritic.py | 9 | ||||
| -rw-r--r-- | youtube_dl/extractor/mixcloud.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/mtv.py | 9 | ||||
| -rw-r--r-- | youtube_dl/extractor/naver.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/pornhub.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/soundcloud.py | 17 | ||||
| -rw-r--r-- | youtube_dl/extractor/vimeo.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/zdf.py | 4 | 
13 files changed, 75 insertions, 44 deletions
| diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a527f10de..ef5644aa5 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,5 +1,4 @@  import re -import xml.etree.ElementTree  import json  from .common import InfoExtractor @@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):          uploader_id = mobj.group('company')          playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') -        playlist_snippet = self._download_webpage(playlist_url, movie) -        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) -        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) -        # The ' in the onClick attributes are not escaped, it couldn't be parsed -        # with xml.etree.ElementTree.fromstring -        # like: http://trailers.apple.com/trailers/wb/gravity/ -        def _clean_json(m): -            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') -        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) -        playlist_html = u'<html>' + playlist_cleaned + u'</html>' +        def fix_html(s): +            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s) +            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) +            # The ' in the onClick attributes are not escaped, it couldn't be parsed +            # like: http://trailers.apple.com/trailers/wb/gravity/ +            def _clean_json(m): +                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') +            s = re.sub(self._JSON_RE, _clean_json, s) +            s = u'<html>' + s + u'</html>' +            return s +        doc = self._download_xml(playlist_url, movie, transform_source=fix_html) -        doc = xml.etree.ElementTree.fromstring(playlist_html)          playlist = []          for li in doc.findall('./div/ul/li'):              on_click = 
li.find('.//a').attrib['onClick'] diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d4fc86973..c60089ad3 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,9 +1,9 @@  import re -import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import (      find_xpath_attr, +    fix_xml_all_ampersand,  ) @@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):          # it includes a required token          flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') -        playlist_page = self._download_webpage( +        pdoc = self._download_xml(              'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, -            video_id, u'Downloading video info')  -        # Fix broken xml -        playlist_page = re.sub('&', '&amp;', playlist_page) -        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) +            video_id, u'Downloading video info', +            transform_source=fix_xml_all_ampersand)           track_doc = pdoc.find('trackList/track')          def find_param(name): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 534908a2b..69a083b68 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -230,9 +230,12 @@ class InfoExtractor(object):              return content      def _download_xml(self, url_or_request, video_id, -                      note=u'Downloading XML', errnote=u'Unable to download XML'): +                      note=u'Downloading XML', errnote=u'Unable to download XML', +                      transform_source=None):          """Return the xml as an xml.etree.ElementTree.Element"""          xml_string = self._download_webpage(url_or_request, video_id, note, errnote) +        if transform_source: +            xml_string = transform_source(xml_string)          return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))    
  def to_screen(self, msg): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3bd0b862c..aea7e557e 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -101,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              self.to_screen(u'Vevo video detected: %s' % vevo_id)              return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo') -        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', -                                             # Looking for official user -                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], -                                            webpage, 'video uploader', fatal=False)          age_limit = self._rta_search(webpage)          video_upload_date = None @@ -147,13 +143,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              self._list_available_subtitles(video_id, webpage)              return -        view_count = str_to_int(self._search_regex( -            r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) +        view_count = self._search_regex( +            r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False) +        if view_count is not None: +            view_count = str_to_int(view_count)          return {              'id':       video_id,              'formats': formats, -            'uploader': video_uploader, +            'uploader': info['owner_screenname'],              'upload_date':  video_upload_date,              'title':    self._og_search_title(webpage),              'subtitles':    video_subtitles, diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index d418ce4a8..4876ecb48 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -9,7 +9,7 @@ from ..utils import (  class 
DaumIE(InfoExtractor): -    _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' +    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'      IE_NAME = u'daum.net'      _TEST = { diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 6b95b4998..e560c1d35 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -1,8 +1,10 @@  import re -import xml.etree.ElementTree  import operator  from .common import InfoExtractor +from ..utils import ( +    fix_xml_all_ampersand, +)  class MetacriticIE(InfoExtractor): @@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id)          # The xml is not well formatted, there are raw '&' -        info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, -            video_id, u'Downloading info xml').replace('&', '&amp;') -        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) +        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, +            video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)          clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)          formats = [] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 04fa3ac7a..125d81551 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -37,6 +37,9 @@ class MixcloudIE(InfoExtractor):          return None +    def _get_url(self, template_url): +        return self.check_urls(template_url % i for i in range(30)) +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -52,13 +55,18 @@ class MixcloudIE(InfoExtractor):          preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')          song_url = 
preview_url.replace('/previews/', '/cloudcasts/originals/')          template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) -        final_song_url = self.check_urls(template_url % i for i in range(30)) +        final_song_url = self._get_url(template_url) +        if final_song_url is None: +            self.to_screen('Trying with m4a extension') +            template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') +            final_song_url = self._get_url(template_url) +        if final_song_url is None: +            raise ExtractorError(u'Unable to extract track url')          return {              'id': track_id,              'title': info['name'],              'url': final_song_url, -            'ext': 'mp3',              'description': info.get('description'),              'thumbnail': info['pictures'].get('extra_large'),              'uploader': info['user']['name'], diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 6b3feb560..5b2bd9633 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor):      def _get_videos_info(self, uri):          video_id = self._id_from_uri(uri)          data = compat_urllib_parse.urlencode({'uri': uri}) -        idoc = self._download_xml(self._FEED_URL +'?' + data, video_id, -                                         u'Downloading info') + +        def fix_ampersand(s): +            """ Fix unencoded ampersand in XML """ +            return s.replace(u'& ', '&amp; ') +        idoc = self._download_xml( +            self._FEED_URL + '?' 
+ data, video_id, +            u'Downloading info', transform_source=fix_ampersand)          return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c012ec0cf..4cab30631 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -9,7 +9,7 @@ from ..utils import (  class NaverIE(InfoExtractor): -    _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'      _TEST = {          u'url': u'http://tvcast.naver.com/v/81652', diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8b3471919..d9135c6b9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -12,7 +12,7 @@ from ..aes import (  )  class PornHubIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))' +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))'      _TEST = {          u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015',          u'file': u'648719015.mp4', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5c026c0b8..cbba4094b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -73,6 +73,19 @@ class SoundcloudIE(InfoExtractor):                  u'upload_date': u'20131209',              },          }, +        # downloadable song +        { +            u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1', +            u'md5': u'56a8b69568acaa967b4c49f9d1d52d19', +            u'info_dict': { +                u'id': u'105614606', +                u'ext': u'wav', +                u'title': u'Just Your Problem Baby (Acapella)', +                u'description': u'Vocals', +                u'uploader': u'Sim 
Gretina', +                u'upload_date': u'20130815', +            }, +        },      ]      _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -99,7 +112,7 @@ class SoundcloudIE(InfoExtractor):          thumbnail = info['artwork_url']          if thumbnail is not None:              thumbnail = thumbnail.replace('-large', '-t500x500') -        ext = info.get('original_format', u'mp3') +        ext = u'mp3'          result = {              'id': track_id,              'uploader': info['user']['username'], @@ -115,7 +128,7 @@ class SoundcloudIE(InfoExtractor):                      track_id, self._CLIENT_ID))              result['formats'] = [{                  'format_id': 'download', -                'ext': ext, +                'ext': info.get('original_format', u'mp3'),                  'url': format_url,                  'vcodec': 'none',              }] diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fb2bd225a..ea4409528 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor):      def _real_initialize(self):          self._login() -    def _real_extract(self, url, new_video=True): +    def _real_extract(self, url):          url, data = unsmuggle_url(url)          headers = std_headers          if data is not None: @@ -151,8 +151,14 @@ class VimeoIE(InfoExtractor):                  config = json.loads(config_json)              except RegexNotFoundError:                  # For pro videos or player.vimeo.com urls -                config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'], -                    webpage, u'info section', flags=re.DOTALL) +                # We try to find out to which variable is assigned the config dic +                m_variable_name = re.search('(\w)\.video\.id', webpage) +                if m_variable_name is not None: +                    config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) 
+                else: +                    config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] +                config = self._search_regex(config_re, webpage, u'info section', +                    flags=re.DOTALL)                  config = json.loads(config)          except Exception as e:              if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 689f19735..35ece354a 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,14 +73,14 @@ class ZDFIE(InfoExtractor):              try:                  proto_pref = -PROTO_ORDER.index(format_m.group('proto'))              except ValueError: -                proto_pref = 999 +                proto_pref = -999              quality = fnode.find('./quality').text              QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']              try:                  quality_pref = -QUALITY_ORDER.index(quality)              except ValueError: -                quality_pref = 999 +                quality_pref = -999              abr = int(fnode.find('./audioBitrate').text) // 1000              vbr = int(fnode.find('./videoBitrate').text) // 1000 | 
