diff options
| author | Sergey M․ <dstftw@gmail.com> | 2016-03-13 15:54:56 +0600 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2016-03-13 15:54:56 +0600 | 
| commit | 0e832c2c97c62f67593ad356ea6d507778c56759 (patch) | |
| tree | 1ba6898ba85a33f3e4d1e879b9a422a969ba7d63 | |
| parent | 8e4aa7bf18af4403bf98742270483f3b9cfbdeb6 (diff) | |
[bbc] Improve title and description extraction (Closes #8826, closes #8822)
| -rw-r--r-- | youtube_dl/extractor/bbc.py | 27 | 
1 file changed, 18 insertions, 9 deletions
| diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index f4d8b4a2f..497ebfd72 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -564,6 +564,14 @@ class BBCIE(BBCCoUkIE):          },          'playlist_count': 18,      }, { +        # school report playlist with single video +        'url': 'http://www.bbc.co.uk/schoolreport/35744779', +        'info_dict': { +            'id': '35744779', +            'title': 'School which breaks down barriers in Jerusalem', +        }, +        'playlist_count': 1, +    }, {          # single video embedded with data-playable containing vpid          'url': 'http://www.bbc.com/news/world-europe-32041533',          'info_dict': { @@ -734,8 +742,17 @@ class BBCIE(BBCCoUkIE):          json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)          timestamp = json_ld_info.get('timestamp') +          playlist_title = json_ld_info.get('title') -        playlist_description = json_ld_info.get('description') +        if not playlist_title: +            playlist_title = self._og_search_title( +                webpage, default=None) or self._html_search_regex( +                r'<title>(.+?)</title>', webpage, 'playlist title', default=None) +            if playlist_title: +                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + +        playlist_description = json_ld_info.get( +            'description') or self._og_search_description(webpage, default=None)          if not timestamp:              timestamp = parse_iso8601(self._search_regex( @@ -795,14 +812,6 @@ class BBCIE(BBCCoUkIE):                              entries.append(self._extract_from_playlist_sxml(                                  playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) -        playlist_title = self._og_search_title(webpage, default=None) -        playlist_title = playlist_title or self._html_search_regex( -            r'<title>(.*?)</title>', 
webpage, 'playlist title') - -        playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title) - -        playlist_description = self._og_search_description(webpage, default=None) -          if entries:              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) | 
