diff options
Diffstat (limited to 'youtube_dl/extractor')
| -rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/arte.py | 33 | ||||
| -rw-r--r-- | youtube_dl/extractor/brightcove.py | 57 | ||||
| -rw-r--r-- | youtube_dl/extractor/cinemassacre.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/cnn.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/dailymotion.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/eitb.py | 37 | ||||
| -rw-r--r-- | youtube_dl/extractor/gamekings.py | 40 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 24 | ||||
| -rw-r--r-- | youtube_dl/extractor/kankan.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/mtv.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/slashdot.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/soundcloud.py | 130 | ||||
| -rw-r--r-- | youtube_dl/extractor/space.py | 35 | ||||
| -rw-r--r-- | youtube_dl/extractor/subtitles.py | 12 | ||||
| -rw-r--r-- | youtube_dl/extractor/ted.py | 36 | ||||
| -rw-r--r-- | youtube_dl/extractor/vine.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/weibo.py | 1 | ||||
| -rw-r--r-- | youtube_dl/extractor/xnxx.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 33 | 
21 files changed, 324 insertions, 142 deletions
| diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 78f84cea3..0594a3666 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -38,6 +38,7 @@ from .defense import DefenseGouvFrIE  from .ebaumsworld import EbaumsWorldIE  from .ehow import EHowIE  from .eighttracks import EightTracksIE +from .eitb import EitbIE  from .escapist import EscapistIE  from .exfm import ExfmIE  from .extremetube import ExtremeTubeIE @@ -56,6 +57,7 @@ from .francetv import (  )  from .freesound import FreesoundIE  from .funnyordie import FunnyOrDieIE +from .gamekings import GamekingsIE  from .gamespot import GameSpotIE  from .gametrailers import GametrailersIE  from .generic import GenericIE @@ -115,6 +117,7 @@ from .slideshare import SlideshareIE  from .sohu import SohuIE  from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE  from .southparkstudios import SouthParkStudiosIE +from .space import SpaceIE  from .spankwire import SpankwireIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index e10c74c11..b35a679e3 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,6 +10,7 @@ from ..utils import (      unified_strdate,      determine_ext,      get_element_by_id, +    compat_str,  )  # There are different sources of video in arte.tv, the extraction process  @@ -181,20 +182,30 @@ class ArteTVPlus7IE(InfoExtractor):                  formats = all_formats              else:                  raise ExtractorError(u'The formats list is empty') -        # We order the formats by quality +          if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: -            sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) +            def sort_key(f): +                return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])          else: -            sort_key = lambda f: int(f.get('height',-1)) +            def sort_key(f): +                return ( +                    # Sort first by quality +                    int(f.get('height',-1)), +                    int(f.get('bitrate',-1)), +                    # The original version with subtitles has lower relevance +                    re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, +                    # The version with sourds/mal subtitles has also lower relevance +                    re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, +                )          formats = sorted(formats, key=sort_key) -        # Prefer videos without subtitles in the same language -        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) -        # Pick the best quality          def _format(format_info): -            quality = format_info['quality'] -            m_quality = re.match(r'\w*? - (\d*)p', quality) -            if m_quality is not None: -                quality = m_quality.group(1) +            quality = '' +            height = format_info.get('height') +            if height is not None: +                quality = compat_str(height) +            bitrate = format_info.get('bitrate') +            if bitrate is not None: +                quality += '-%d' % bitrate              if format_info.get('versionCode') is not None:                  format_id = u'%s-%s' % (quality, format_info['versionCode'])              else: @@ -203,7 +214,7 @@ class ArteTVPlus7IE(InfoExtractor):                  'format_id': format_id,                  'format_note': format_info.get('versionLibelle'),                  'width': format_info.get('width'), -                'height': format_info.get('height'), +                'height': height,              }              if format_info['mediaType'] == u'rtmp':                  info['url'] = format_info['streamer'] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0d9b87a34..d8c35465a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,10 +9,13 @@ from ..utils import (      compat_urllib_parse,      find_xpath_attr,      compat_urlparse, +    compat_str, +    compat_urllib_request,      ExtractorError,  ) +  class BrightcoveIE(InfoExtractor):      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' @@ -41,6 +44,17 @@ class BrightcoveIE(InfoExtractor):                  u'uploader': u'Oracle',              },          }, +        { +            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ +            u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', +            u'info_dict': { +                u'id': u'2750934548001', +                u'ext': u'mp4', +                u'title': u'This Bracelet Acts as a Personal Thermostat', +                u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0', +                u'uploader': u'Mashable', +            }, +        },      ]      @classmethod @@ -68,24 +82,48 @@ class BrightcoveIE(InfoExtractor):          videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')          if videoPlayer is not None:              params['@videoPlayer'] = videoPlayer.attrib['value'] +        linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL') +        if linkBase is not None: +            params['linkBaseURL'] = linkBase.attrib['value']          data = compat_urllib_parse.urlencode(params)          return cls._FEDERATED_URL_TEMPLATE % data +    @classmethod +    def _extract_brightcove_url(cls, webpage): +        """Try to extract the brightcove url from the wepbage, returns None +        if it can't be found +        """ +        m_brightcove = re.search( +            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', +            webpage, re.DOTALL) +        if m_brightcove is not None: +            return cls._build_brighcove_url(m_brightcove.group()) +        else: +            return None +      def _real_extract(self, url): +        # Change the 'videoId' and others field to '@videoPlayer' +        url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) +        # Change bckey (used by bcove.me urls) to playerKey +        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)          mobj = re.match(self._VALID_URL, url)          query_str = mobj.group('query')          query = compat_urlparse.parse_qs(query_str)          videoPlayer = query.get('@videoPlayer')          if videoPlayer: -            return self._get_video_info(videoPlayer[0], query_str) +            return self._get_video_info(videoPlayer[0], query_str, query)          else:              player_key = query['playerKey']              return self._get_playlist_info(player_key[0]) -    def _get_video_info(self, video_id, query): -        request_url = self._FEDERATED_URL_TEMPLATE % query -        webpage = self._download_webpage(request_url, video_id) +    def _get_video_info(self, video_id, query_str, query): +        request_url = self._FEDERATED_URL_TEMPLATE % query_str +        req = compat_urllib_request.Request(request_url) +        linkBase = query.get('linkBaseURL') +        if linkBase is not None: +            req.add_header('Referer', linkBase[0]) +        webpage = self._download_webpage(req, video_id)          self.report_extraction(video_id)          info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') @@ -109,7 +147,7 @@ class BrightcoveIE(InfoExtractor):      def _extract_video_info(self, video_info):          info = { -            'id': video_info['id'], +            'id': compat_str(video_info['id']),              'title': video_info['displayName'],              'description': video_info.get('shortDescription'),              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -119,10 +157,11 @@ class BrightcoveIE(InfoExtractor):          renditions = video_info.get('renditions')          if renditions:              renditions = sorted(renditions, key=lambda r: r['size']) -            best_format = renditions[-1] -            info.update({ -                'url': best_format['defaultURL'], -            }) +            info['formats'] = [{ +                'url': rend['defaultURL'], +                'height': rend.get('frameHeight'), +                'width': rend.get('frameWidth'), +            } for rend in renditions]          elif video_info.get('FLVFullLengthURL') is not None:              info.update({                  'url': video_info['FLVFullLengthURL'], diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 8f9396d6b..f0d08cebf 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -65,6 +65,7 @@ class CinemassacreIE(InfoExtractor):              {                  'url': url,                  'play_path': 'mp4:' + sd_file, +                'rtmp_live': True, # workaround                  'ext': 'flv',                  'format': 'sd',                  'format_id': 'sd', @@ -72,6 +73,7 @@ class CinemassacreIE(InfoExtractor):              {                  'url': url,                  'play_path': 'mp4:' + hd_file, +                'rtmp_live': True, # workaround                  'ext': 'flv',                  'format': 'hd',                  'format_id': 'hd', diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a79f881cd..34adf6dda 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -6,7 +6,7 @@ from ..utils import determine_ext  class CNNIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ +    _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/          (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''      _TESTS = [{ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e0ccba533..9c20d30b4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -322,6 +322,8 @@ class InfoExtractor(object):          if name is None:              name = 'OpenGraph %s' % prop          escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) +        if escaped is None: +            return None          return unescapeHTML(escaped)      def _og_search_thumbnail(self, html, **kargs): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 355b4ed0a..e87690f9d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -141,9 +141,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              raise ExtractorError(u'Unable to extract video URL')          # subtitles -        video_subtitles = self.extract_subtitles(video_id) +        video_subtitles = self.extract_subtitles(video_id, webpage)          if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id) +            self._list_available_subtitles(video_id, webpage)              return          return { @@ -157,7 +157,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              'age_limit': age_limit,          } -    def _get_available_subtitles(self, video_id): +    def _get_available_subtitles(self, video_id, webpage):          try:              sub_list = self._download_webpage(                  'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py new file mode 100644 index 000000000..4ba323148 --- /dev/null +++ b/youtube_dl/extractor/eitb.py @@ -0,0 +1,37 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import ExtractorError + + +class EitbIE(InfoExtractor): +    IE_NAME = u'eitb.tv' +    _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' + +    _TEST = { +        u'add_ie': ['Brightcove'], +        u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', +        u'md5': u'edf4436247185adee3ea18ce64c47998', +        u'info_dict': { +            u'id': u'2743577154001', +            u'ext': u'mp4', +            u'title': u'60 minutos (Lasa y Zabala, 30 años)', +            # All videos from eitb has this description in the brightcove info +            u'description': u'.', +            u'uploader': u'Euskal Telebista', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        chapter_id = mobj.group('chapter_id') +        webpage = self._download_webpage(url, chapter_id) +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if bc_url is None: +            raise ExtractorError(u'Could not extract the Brightcove url') +        # The BrightcoveExperience object doesn't contain the video id, we set +        # it manually +        bc_url += '&%40videoPlayer={0}'.format(chapter_id) +        return self.url_result(bc_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py new file mode 100644 index 000000000..4b4259447 --- /dev/null +++ b/youtube_dl/extractor/gamekings.py @@ -0,0 +1,40 @@ +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +) + + +class GamekingsIE(InfoExtractor): +    _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' +    _TEST = { +        u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", +        u'file': u'20130811.mp4', +        u'md5': u'17f6088f7d0149ff2b46f2714bdb1954', +        u'info_dict': { +            u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review", +            u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.", +        } +    } + +    def _real_extract(self, url): + +        mobj = re.match(self._VALID_URL, url) +        name = mobj.group('name') +        webpage = self._download_webpage(url, name) +        video_url = self._og_search_video_url(webpage) + +        video = re.search(r'[0-9]+', video_url) +        video_id = video.group(0) + +        # Todo: add medium format +        video_url = video_url.replace(video_id, 'large/' + video_id) + +        return { +            'id': video_id, +            'ext': 'mp4', +            'url': video_url, +            'title': self._og_search_title(webpage), +            'description': self._og_search_description(webpage), +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b3fec8e86..c7552fddb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -33,6 +33,7 @@ class GenericIE(InfoExtractor):          },          # embedded vimeo video          { +            u'add_ie': ['Vimeo'],              u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',              u'file': u'22444065.mp4',              u'md5': u'2903896e23df39722c33f015af0666e2', @@ -44,6 +45,7 @@ class GenericIE(InfoExtractor):          },          # bandcamp page with custom domain          { +            u'add_ie': ['Bandcamp'],              u'url': u'http://bronyrock.com/track/the-pony-mash',              u'file': u'3235767654.mp3',              u'info_dict': { @@ -52,6 +54,23 @@ class GenericIE(InfoExtractor):              },              u'skip': u'There is a limit of 200 free downloads / month for the test song',          }, +        # embedded brightcove video +        # it also tests brightcove videos that need to set the 'Referer' in the +        # http requests +        { +            u'add_ie': ['Brightcove'], +            u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', +            u'info_dict': { +                u'id': u'2765128793001', +                u'ext': u'mp4', +                u'title': u'Le cours de bourse : l’analyse technique', +                u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9', +                u'uploader': u'BFM BUSINESS', +            }, +            u'params': { +                u'skip_download': True, +            }, +        },      ]      def report_download_webpage(self, video_id): @@ -144,10 +163,9 @@ class GenericIE(InfoExtractor):          self.report_extraction(video_id)          # Look for BrightCove: -        m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) -        if m_brightcove is not None: +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if bc_url is not None:              self.to_screen(u'Brightcove video detected.') -            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())              return self.url_result(bc_url, 'Brightcove')          # Look for embedded Vimeo player diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 445d46501..50916f4a6 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -1,8 +1,10 @@  import re +import hashlib  from .common import InfoExtractor  from ..utils import determine_ext +_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()  class KankanIE(InfoExtractor):      _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml' @@ -30,7 +32,10 @@ class KankanIE(InfoExtractor):                                                   video_id, u'Downloading video url info')          ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip')          path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') -        video_url = 'http://%s%s' % (ip, path) +        param1 = self._search_regex(r'param1:(\d+)', video_info_page, u'param1') +        param2 = self._search_regex(r'param2:(\d+)', video_info_page, u'param2') +        key = _md5('xl_mp43651' + param1 + param2) +        video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2)          return {'id': video_id,                  'title': title, diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e96d3952c..24a79ae13 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -26,6 +26,7 @@ class MTVIE(InfoExtractor):              },          },          { +            u'add_ie': ['Vevo'],              u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',              u'file': u'USCJY1331283.mp4',              u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py index 2cba53076..f5003c7f9 100644 --- a/youtube_dl/extractor/slashdot.py +++ b/youtube_dl/extractor/slashdot.py @@ -7,6 +7,7 @@ class SlashdotIE(InfoExtractor):      _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)'      _TEST = { +        u'add_ie': ['Ooyala'],          u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',          u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',          u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 29cd5617c..4717fbb77 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,17 +29,34 @@ class SoundcloudIE(InfoExtractor):                      )                      '''      IE_NAME = u'soundcloud' -    _TEST = { -        u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', -        u'file': u'62986583.mp3', -        u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', -        u'info_dict': { -            u"upload_date": u"20121011",  -            u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  -            u"uploader": u"E.T. ExTerrestrial Music",  -            u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" -        } -    } +    _TESTS = [ +        { +            u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', +            u'file': u'62986583.mp3', +            u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', +            u'info_dict': { +                u"upload_date": u"20121011",  +                u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  +                u"uploader": u"E.T. ExTerrestrial Music",  +                u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" +            } +        }, +        # not streamable song +        { +            u'url': u'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', +            u'info_dict': { +                u'id': u'47127627', +                u'ext': u'mp3', +                u'title': u'Goldrushed', +                u'uploader': u'The Royal Concept', +                u'upload_date': u'20120521', +            }, +            u'params': { +                # rtmp +                u'skip_download': True, +            }, +        }, +    ]      _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -56,16 +73,16 @@ class SoundcloudIE(InfoExtractor):          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID      def _extract_info_dict(self, info, full_title=None, quiet=False): -        video_id = info['id'] -        name = full_title or video_id +        track_id = compat_str(info['id']) +        name = full_title or track_id          if quiet == False:              self.report_extraction(name)          thumbnail = info['artwork_url']          if thumbnail is not None:              thumbnail = thumbnail.replace('-large', '-t500x500') -        return { -            'id':       info['id'], +        result = { +            'id':       track_id,              'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID,              'uploader': info['user']['username'],              'upload_date': unified_strdate(info['created_at']), @@ -74,6 +91,21 @@ class SoundcloudIE(InfoExtractor):              'description': info['description'],              'thumbnail': thumbnail,          } +        if info.get('downloadable', False): +            result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID) +        if not info.get('streamable', False): +            # We have to get the rtmp url +            stream_json = self._download_webpage( +                'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID), +                track_id, u'Downloading track url') +            rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url'] +            # The url doesn't have an rtmp app, we have to extract the playpath +            url, path = rtmp_url.split('mp3:', 1) +            result.update({ +                'url': url, +                'play_path': 'mp3:' + path, +            }) +        return result      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) @@ -106,70 +138,8 @@ class SoundcloudIE(InfoExtractor):  class SoundcloudSetIE(SoundcloudIE):      _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'      IE_NAME = u'soundcloud:set' -    _TEST = { -        u"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep", -        u"playlist": [ -            { -                u"file":"30510138.mp3", -                u"md5":"f9136bf103901728f29e419d2c70f55d", -                u"info_dict": { -                    u"upload_date": u"20111213", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"D-D-Dance" -                } -            }, -            { -                u"file":"47127625.mp3", -                u"md5":"09b6758a018470570f8fd423c9453dd8", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"The Royal Concept - Gimme Twice" -                } -            }, -            { -                u"file":"47127627.mp3", -                u"md5":"154abd4e418cea19c3b901f1e1306d9c", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Goldrushed" -                } -            }, -            { -                u"file":"47127629.mp3", -                u"md5":"2f5471edc79ad3f33a683153e96a79c1", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"In the End" -                } -            }, -            { -                u"file":"47127631.mp3", -                u"md5":"f9ba87aa940af7213f98949254f1c6e2", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Knocked Up" -                } -            }, -            { -                u"file":"75206121.mp3", -                u"md5":"f9d1fe9406717e302980c30de4af9353", -                u"info_dict": { -                    u"upload_date": u"20130116", -                    u"description": u"The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central).  \r\nAs a gift to our fans we would like to offer you a free download of the track!  ", -                    u"uploader": u"The Royal Concept", -                    u"title": u"World On Fire" -                } -            } -        ] -    } +    # it's in tests/test_playlists.py +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -208,7 +178,7 @@ class SoundcloudUserIE(SoundcloudIE):      IE_NAME = u'soundcloud:user'      # it's in tests/test_playlists.py -    _TEST = None +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py new file mode 100644 index 000000000..0d32a0688 --- /dev/null +++ b/youtube_dl/extractor/space.py @@ -0,0 +1,35 @@ +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import RegexNotFoundError, ExtractorError + + +class SpaceIE(InfoExtractor): +    _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html' +    _TEST = { +        u'add_ie': ['Brightcove'], +        u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', +        u'info_dict': { +            u'id': u'2780937028001', +            u'ext': u'mp4', +            u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video', +            u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61', +            u'uploader': u'TechMedia Networks', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        title = mobj.group('title') +        webpage = self._download_webpage(url, title) +        try: +            # Some videos require the playerKey field, which isn't define in +            # the BrightcoveExperience object +            brightcove_url = self._og_search_video_url(webpage) +        except RegexNotFoundError: +            # Other videos works fine with the info from the object +            brightcove_url = BrightcoveIE._extract_brightcove_url(webpage) +        if brightcove_url is None: +            raise ExtractorError(u'The webpage does not contain a video', expected=True) +        return self.url_result(brightcove_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 90de7de3a..4b4c5235d 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor):          return any([self._downloader.params.get('writesubtitles', False),                      self._downloader.params.get('writeautomaticsub')]) -    def _list_available_subtitles(self, video_id, webpage=None): +    def _list_available_subtitles(self, video_id, webpage):          """ outputs the available subtitles for the video """ -        sub_lang_list = self._get_available_subtitles(video_id) +        sub_lang_list = self._get_available_subtitles(video_id, webpage)          auto_captions_list = self._get_available_automatic_caption(video_id, webpage)          sub_lang = ",".join(list(sub_lang_list.keys()))          self.to_screen(u'%s: Available subtitles for video: %s' % @@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor):          self.to_screen(u'%s: Available automatic captions for video: %s' %                         (video_id, auto_lang)) -    def extract_subtitles(self, video_id, video_webpage=None): +    def extract_subtitles(self, video_id, webpage):          """          returns {sub_lang: sub} ,{} if subtitles not found or None if the          subtitles aren't requested. @@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor):              return None          available_subs_list = {}          if self._downloader.params.get('writeautomaticsub', False): -            available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) +            available_subs_list.update(self._get_available_automatic_caption(video_id, webpage))          if self._downloader.params.get('writesubtitles', False): -            available_subs_list.update(self._get_available_subtitles(video_id)) +            available_subs_list.update(self._get_available_subtitles(video_id, webpage))          if not available_subs_list:  # error, it didn't get the available subtitles              return {} @@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor):              return          return sub -    def _get_available_subtitles(self, video_id): +    def _get_available_subtitles(self, video_id, webpage):          """          returns {sub_lang: url} or {} if not available          Must be redefined by the subclasses diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index dfa1176a3..76cfdfb90 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -1,10 +1,14 @@  import json  import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor +from ..utils import ( +    compat_str, +    RegexNotFoundError, +) -class TEDIE(InfoExtractor): +class TEDIE(SubtitlesInfoExtractor):      _VALID_URL=r'''http://www\.ted\.com/                     (                          ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist @@ -32,7 +36,7 @@ class TEDIE(InfoExtractor):      def _real_extract(self, url):          m=re.match(self._VALID_URL, url, re.VERBOSE)          if m.group('type_talk'): -            return [self._talk_info(url)] +            return self._talk_info(url)          else :              playlist_id=m.group('playlist_id')              name=m.group('name') @@ -82,11 +86,21 @@ class TEDIE(InfoExtractor):              'url': stream['file'],              'format': stream['id']              } for stream in info['htmlStreams']] + +        video_id = info['id'] + +        # subtitles +        video_subtitles = self.extract_subtitles(video_id, webpage) +        if self._downloader.params.get('listsubtitles', False): +            self._list_available_subtitles(video_id, webpage) +            return +          info = { -            'id': info['id'], +            'id': video_id,              'title': title,              'thumbnail': thumbnail,              'description': desc, +            'subtitles': video_subtitles,              'formats': formats,          } @@ -94,3 +108,17 @@ class TEDIE(InfoExtractor):          info.update(info['formats'][-1])          return info + +    def _get_available_subtitles(self, video_id, webpage): +        try: +            options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL) +            languages = re.findall(r'(?:<option value=")(\S+)"', options) +            if languages: +                sub_lang_list = {} +                for l in languages: +                    url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) +                    sub_lang_list[l] = url +                return sub_lang_list +        except RegexNotFoundError as err: +            self._downloader.report_warning(u'video doesn\'t have subtitles') +        return {} diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c4ec1f06f..651ba317d 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,7 +27,7 @@ class VineIE(InfoExtractor):          video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',              webpage, u'video URL') -        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>', +        uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',              webpage, u'uploader', fatal=False, flags=re.DOTALL)          return [{ diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 0757495bd..fa784ab99 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -13,6 +13,7 @@ class WeiboIE(InfoExtractor):      _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'      _TEST = { +        u'add_ie': ['Sina'],          u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',          u'file': u'98322879.flv',          u'info_dict': { diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 8a0eb1afd..1177a4b14 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -9,7 +9,7 @@ from ..utils import (  class XNXXIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' +    _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'      VIDEO_URL_RE = r'flv_url=(.*?)&'      VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'      VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 74a381fe2..c992cba97 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1082,7 +1082,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          else:              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) -    def _get_available_subtitles(self, video_id): +    def _get_available_subtitles(self, video_id, webpage):          try:              sub_list = self._download_webpage(                  'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -1572,7 +1572,6 @@ class YoutubePlaylistIE(InfoExtractor):  class YoutubeChannelIE(InfoExtractor):      IE_DESC = u'YouTube.com channels'      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" -    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'      _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'      IE_NAME = u'youtube:channel' @@ -1593,30 +1592,20 @@ class YoutubeChannelIE(InfoExtractor):          # Download channel page          channel_id = mobj.group(1)          video_ids = [] -        pagenum = 1 -        url = self._TEMPLATE_URL % (channel_id, pagenum) -        page = self._download_webpage(url, channel_id, -                                      u'Downloading page #%s' % pagenum) +        # Download all channel pages using the json-based channel_ajax query +        for pagenum in itertools.count(1): +            url = self._MORE_PAGES_URL % (pagenum, channel_id) +            page = self._download_webpage(url, channel_id, +                                          u'Downloading page #%s' % pagenum) -        # Extract video identifiers -        ids_in_page = self.extract_videos_from_page(page) -        video_ids.extend(ids_in_page) +            page = json.loads(page) -        # Download any subsequent channel pages using the json-based channel_ajax query -        if self._MORE_PAGES_INDICATOR in page: -            for pagenum in itertools.count(1): -                url = self._MORE_PAGES_URL % (pagenum, channel_id) -                page = self._download_webpage(url, channel_id, -                                              u'Downloading page #%s' % pagenum) - -                page = json.loads(page) - -                ids_in_page = self.extract_videos_from_page(page['content_html']) -                video_ids.extend(ids_in_page) +            ids_in_page = self.extract_videos_from_page(page['content_html']) +            video_ids.extend(ids_in_page) -                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']: -                    break +            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: +                break          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) | 
