diff options
| -rw-r--r-- | AUTHORS | 1 | ||||
| -rw-r--r-- | docs/supportedsites.md | 3 | ||||
| -rw-r--r-- | youtube_dl/extractor/bbc.py | 85 | ||||
| -rw-r--r-- | youtube_dl/extractor/comcarcoff.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 11 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/spiegel.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 35 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 4 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
10 files changed, 110 insertions, 46 deletions
| @@ -135,3 +135,4 @@ Bernhard Minks  sceext  Zach Bruggeman  Tjark Saul +slangangular diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 73445137f..657935dc6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -51,6 +51,7 @@   - **bambuser:channel**   - **Bandcamp**   - **Bandcamp:album** + - **bbc**: BBC   - **bbc.co.uk**: BBC iPlayer   - **BeatportPro**   - **Beeg** @@ -224,6 +225,7 @@   - **InternetVideoArchive**   - **IPrima**   - **iqiyi**: 爱奇艺 + - **Ir90Tv**   - **ivi**: ivi.ru   - **ivi:compilation**: ivi.ru compilations   - **Izlesene** @@ -252,6 +254,7 @@   - **kuwo:song**: 酷我音乐   - **la7.tv**   - **Laola1Tv** + - **Lecture2Go**   - **Letv**: 乐视网   - **LetvPlaylist**   - **LetvTv** diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2a0901ee4..9a1b6e3dc 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -20,7 +20,9 @@ class BBCCoUkIE(InfoExtractor):      IE_DESC = 'BBC iPlayer'      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})' -    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' +    _MEDIASELECTOR_URLS = [ +        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', +    ]      _TESTS = [          { @@ -162,6 +164,10 @@ class BBCCoUkIE(InfoExtractor):          }      ] +    class MediaSelectionError(Exception): +        def __init__(self, id): +            self.id = id +      def _extract_asx_playlist(self, connection, programme_id):          asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')          return [ref.get('href') for ref in asx.findall('./Entry/ref')] @@ -172,6 +178,7 @@ class BBCCoUkIE(InfoExtractor):          supplier = connection.get('supplier')          if protocol == 'http':              href = connection.get('href') +            transfer_format = connection.get('transferFormat')              # ASX playlist              if supplier == 'asx':                  for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -179,6 +186,9 @@ class BBCCoUkIE(InfoExtractor):                          'url': ref,                          'format_id': 'ref%s_%s' % (i, supplier),                      }) +            # Skip DASH until supported +            elif transfer_format == 'dash': +                pass              # Direct link              else:                  formats.append({ @@ -208,8 +218,7 @@ class BBCCoUkIE(InfoExtractor):      def _extract_medias(self, media_selection):          error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')          if error is not None: -            raise ExtractorError( -                '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True) +            raise BBCCoUkIE.MediaSelectionError(error.get('id'))          return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')      def _extract_connections(self, media): @@ -266,9 +275,23 @@ class BBCCoUkIE(InfoExtractor):              ]          return subtitles +    def _raise_extractor_error(self, media_selection_error): +        raise ExtractorError( +            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), +            expected=True) +      def _download_media_selector(self, programme_id): -        return self._download_media_selector_url( -            self._MEDIASELECTOR_URL % programme_id, programme_id) +        last_exception = None +        for mediaselector_url in self._MEDIASELECTOR_URLS: +            try: +                return self._download_media_selector_url( +                    mediaselector_url % programme_id, programme_id) +            except BBCCoUkIE.MediaSelectionError as e: +                if e.id == 'notukerror': +                    last_exception = e +                    continue +                self._raise_extractor_error(e) +        self._raise_extractor_error(last_exception)      def _download_media_selector_url(self, url, programme_id=None):          try: @@ -293,7 +316,6 @@ class BBCCoUkIE(InfoExtractor):                  formats.extend(self._extract_video(media, programme_id))              elif kind == 'captions':                  subtitles = self.extract_subtitles(media, programme_id) -          return formats, subtitles      def _download_playlist(self, playlist_id): @@ -422,9 +444,14 @@ class BBCIE(BBCCoUkIE):      IE_DESC = 'BBC'      _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' -    # fails with notukerror for some videos -    #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' -    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' +    _MEDIASELECTOR_URLS = [ +        # Provides more formats, namely direct mp4 links, but fails on some videos with +        # notukerror for non UK (?) users (e.g. +        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) +        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', +        # Provides fewer formats, but works everywhere for everybody (hopefully) +        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', +    ]      _TESTS = [{          # article with multiple videos embedded with data-media-meta containing @@ -447,11 +474,19 @@ class BBCIE(BBCCoUkIE):          'playlist_count': 9,          'skip': 'Save time',      }, { +        # article with multiple videos embedded with `new SMP()` +        'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', +        'info_dict': { +            'id': '3662a707-0af9-3149-963f-47bea720b460', +            'title': 'BBC Blogs - Adam Curtis - BUGGER', +        }, +        'playlist_count': 18, +    }, {          # single video embedded with mediaAssetPage.init()          'url': 'http://www.bbc.com/news/world-europe-32041533',          'info_dict': {              'id': 'p02mprgb', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',              'duration': 47,              'timestamp': 1427219242, @@ -511,7 +546,7 @@ class BBCIE(BBCCoUkIE):          'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',          'info_dict': {              'id': 'p018zqqg', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Hyundai Santa Fe Sport: Rock star',              'description': 'md5:b042a26142c4154a6e472933cf20793d',              'timestamp': 1368473503, @@ -526,7 +561,7 @@ class BBCIE(BBCCoUkIE):          'url': 'http://www.bbc.com/sport/0/football/33653409',          'info_dict': {              'id': 'p02xycnp', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',              'description': 'md5:398fca0e2e701c609d726e034fa1fc89',              'duration': 140, @@ -633,12 +668,30 @@ class BBCIE(BBCCoUkIE):          playlist_title = self._html_search_regex(              r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title') -        playlist_description = self._og_search_description(webpage) +        playlist_description = self._og_search_description(webpage, default=None) + +        def extract_all(pattern): +            return list(filter(None, map( +                lambda s: self._parse_json(s, playlist_id, fatal=False), +                re.findall(pattern, webpage)))) + +        # Multiple video article (e.g. +        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) +        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' +        entries = [] +        for match in extract_all(r'new\s+SMP\(({.+?})\)'): +            embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') +            if embed_url and re.match(EMBED_URL, embed_url): +                entries.append(embed_url) +        entries.extend(re.findall( +            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) +        if entries: +            return self.playlist_result( +                [self.url_result(entry, 'BBCCoUk') for entry in entries], +                playlist_id, playlist_title, playlist_description)          # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) -        medias = list(filter(None, map( -            lambda s: self._parse_json(s, playlist_id, fatal=False), -            re.findall(r"data-media-meta='({[^']+})'", webpage)))) +        medias = extract_all(r"data-media-meta='({[^']+})'")          if not medias:              # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 9c25b2223..81f3d7697 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -36,7 +36,7 @@ class ComCarCoffIE(InfoExtractor):              webpage, 'full data json'))          video_id = full_data['activeVideo']['video'] -        video_data = full_data['videos'][video_id] +        video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id]          thumbnails = [{              'url': video_data['images']['thumb'],          }, { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 14b9b4fe2..d54866d1f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -181,6 +181,7 @@ class InfoExtractor(object):                      by YoutubeDL if it's missing)      categories:     A list of categories that the video falls in, for example                      ["Sports", "Berlin"] +    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]      is_live:        True, False, or None (=unknown). Whether this video is a                      live stream that goes on instead of a fixed-length video.      start_time:     Time in seconds where the reproduction should start, as @@ -630,6 +631,12 @@ class InfoExtractor(object):              template % (content_re, property_re),          ] +    @staticmethod +    def _meta_regex(prop): +        return r'''(?isx)<meta +                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) +                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) +      def _og_search_property(self, prop, html, name=None, **kargs):          if name is None:              name = 'OpenGraph %s' % prop @@ -660,9 +667,7 @@ class InfoExtractor(object):          if display_name is None:              display_name = name          return self._html_search_regex( -            r'''(?isx)<meta -                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) -                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name), +            self._meta_regex(name),              html, display_name, fatal=fatal, group='content', **kwargs)      def _dc_search_uploader(self, html): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d2efb22e..8cef61c3c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -276,14 +276,6 @@ class GenericIE(InfoExtractor):                  'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',              },          }, -        # BBC iPlayer embeds -        { -            'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', -            'info_dict': { -                'title': 'BBC - Blogs -  Adam Curtis - BUGGER', -            }, -            'playlist_mincount': 18, -        },          # RUTV embed          {              'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index b868241d5..5bd3c0087 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -9,7 +9,7 @@ from .spiegeltv import SpiegeltvIE  class SpiegelIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$' +    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'      _TESTS = [{          'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',          'md5': '2c2754212136f35fb4b19767d242f66e', @@ -39,6 +39,9 @@ class SpiegelIE(InfoExtractor):              'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',              'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',          } +    }, { +        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8a5ef2e70..67a1df9a0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -283,13 +283,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},          # Dash webm -        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, +        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},          '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -299,11 +299,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, -        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, -        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, -        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, -        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, -        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, +        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, +        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, +        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, +        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'}, +        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},          # Dash webm audio          '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -331,6 +331,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'upload_date': '20121002',                  'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',                  'categories': ['Science & Technology'], +                'tags': ['youtube-dl'],                  'like_count': int,                  'dislike_count': int,                  'start_time': 1, @@ -345,7 +346,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20120506',                  'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', -                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', +                'description': 'md5:782e8651347686cba06e58f71ab51773', +                'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', +                         'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', +                         'iconic ep', 'iconic', 'love', 'it'],                  'uploader': 'Icona Pop',                  'uploader_id': 'IconaPop',              } @@ -1171,6 +1175,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          else:              video_categories = None +        video_tags = [ +            unescapeHTML(m.group('content')) +            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] +          def _extract_count(count_name):              return str_to_int(self._search_regex(                  r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' @@ -1339,6 +1347,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'thumbnail': video_thumbnail,              'description': video_description,              'categories': video_categories, +            'tags': video_tags,              'subtitles': video_subtitles,              'automatic_captions': automatic_captions,              'duration': video_duration, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ae813099d..88f9f9070 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -576,11 +576,9 @@ class ContentTooShortError(Exception):      download is too small for what the server announced first, indicating      the connection was probably interrupted.      """ -    # Both in bytes -    downloaded = None -    expected = None      def __init__(self, downloaded, expected): +        # Both in bytes          self.downloaded = downloaded          self.expected = expected diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 280afdd7f..fa157cadb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.07.21' +__version__ = '2015.07.28' | 
