diff options
| author | Sergey M․ <dstftw@gmail.com> | 2015-10-10 23:40:20 +0600 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2015-10-10 23:40:20 +0600 | 
| commit | 78f9d843186977c614c5a0f6004732f5d410cd0c (patch) | |
| tree | 0d1fa47fe777c80cf17925aa62513767314966de | |
| parent | b1ec70e4a9fc53b0ec583f48a5262c9f864db40b (diff) | |
[bbc] Support playlists of data-playable
| -rw-r--r-- | youtube_dl/extractor/bbc.py | 73 | 
1 file changed, 44 insertions, 29 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 972abd0d4..a15e67114 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -716,6 +716,8 @@ class BBCIE(BBCCoUkIE):               r'itemprop="datePublished"[^>]+datetime="([^"]+)"'],              webpage, 'date', default=None)) +        entries = [] +          # article with multiple videos embedded with playlist.sxml (e.g.          # http://www.bbc.com/sport/0/football/34475836)          playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage) @@ -723,44 +725,57 @@ class BBCIE(BBCCoUkIE):              entries = [                  self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)                  for playlist_url in playlists] -            playlist_title = self._og_search_title(webpage) -            playlist_description = self._og_search_description(webpage, default=None) -            return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) -        # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) -        programme_id = self._search_regex( -            [r'data-video-player-vpid="([\da-z]{8})"', -             r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], -            webpage, 'vpid', default=None) - -        duration = None -        if not programme_id: -            # single video in news article embedded with data-playable (e.g. -            # http://www.bbc.com/news/world-us-canada-34473351) -            data_playable = self._parse_json( -                unescapeHTML(self._search_regex( -                    r'data-playable=(["\'])(?P<json>{.+?})\1', webpage, -                    'data playable', default='{}', group='json')), -                programme_id, fatal=False) -            if data_playable: -                # data-playable has video vpid in settings.playlistObject.items (e.g. 
-                # http://www.bbc.com/news/world-us-canada-34473351) +        # news article with multiple videos embedded with data-playable +        data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage) +        if data_playables: +            for _, data_playable_json in data_playables: +                data_playable = self._parse_json( +                    unescapeHTML(data_playable_json), playlist_id, fatal=False) +                if not data_playable: +                    continue                  settings = data_playable.get('settings', {})                  if settings: +                    # data-playable with video vpid in settings.playlistObject.items (e.g. +                    # http://www.bbc.com/news/world-us-canada-34473351)                      playlist_object = settings.get('playlistObject', {})                      if playlist_object:                          items = playlist_object.get('items')                          if items and isinstance(items, list): +                            title = playlist_object['title'] +                            description = playlist_object.get('summary')                              duration = int_or_none(items[0].get('duration'))                              programme_id = items[0].get('vpid') -                if not programme_id: -                    # data-playable has no vpid but has a playlist.sxml URLs -                    # in otherSettings.playlist (e.g. 
-                    # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) -                    playlist = data_playable.get('otherSettings', {}).get('playlist', {}) -                    if playlist: -                        return self._extract_from_playlist_sxml( -                            playlist.get('progressiveDownloadUrl'), playlist_id, timestamp) +                            formats, subtitles = self._download_media_selector(programme_id) +                            self._sort_formats(formats) +                            entries.append({ +                                'id': programme_id, +                                'title': title, +                                'description': description, +                                'timestamp': timestamp, +                                'duration': duration, +                                'formats': formats, +                                'subtitles': subtitles, +                            }) +                    else: +                        # data-playable without vpid but with a playlist.sxml URLs +                        # in otherSettings.playlist (e.g. +                        # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) +                        playlist = data_playable.get('otherSettings', {}).get('playlist', {}) +                        if playlist: +                            entries.append(self._extract_from_playlist_sxml( +                                playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) + +        if entries: +            playlist_title = self._og_search_title(webpage) +            playlist_description = self._og_search_description(webpage, default=None) +            return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + +        # single video story (e.g. 
http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) +        programme_id = self._search_regex( +            [r'data-video-player-vpid="([\da-z]{8})"', +             r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], +            webpage, 'vpid', default=None)          if programme_id:              formats, subtitles = self._download_media_selector(programme_id)  | 
