[bbc] Improve title and description extraction (Closes #8826, closes #8822)

author: Sergey M․ <dstftw@gmail.com> 2016-03-13 15:54:56 +0600
committer: Sergey M․ <dstftw@gmail.com> 2016-03-13 15:54:56 +0600
commit: 0e832c2c97c62f67593ad356ea6d507778c56759 (patch)
tree: 1ba6898ba85a33f3e4d1e879b9a422a969ba7d63 /youtube_dl/extractor
parent: 8e4aa7bf18af4403bf98742270483f3b9cfbdeb6 (diff)
1 files changed, 18 insertions, 9 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index f4d8b4a2f..497ebfd72 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -564,6 +564,14 @@ class BBCIE(BBCCoUkIE):
         },
         'playlist_count': 18,
     }, {
+        # school report playlist with single video
+        'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+        'info_dict': {
+            'id': '35744779',
+            'title': 'School which breaks down barriers in Jerusalem',
+        },
+        'playlist_count': 1,
+    }, {
         # single video embedded with data-playable containing vpid
         'url': 'http://www.bbc.com/news/world-europe-32041533',
         'info_dict': {
@@ -734,8 +742,17 @@ class BBCIE(BBCCoUkIE):
 
         json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
         timestamp = json_ld_info.get('timestamp')
+
         playlist_title = json_ld_info.get('title')
-        playlist_description = json_ld_info.get('description')
+        if not playlist_title:
+            playlist_title = self._og_search_title(
+                webpage, default=None) or self._html_search_regex(
+                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+            if playlist_title:
+                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+        playlist_description = json_ld_info.get(
+            'description') or self._og_search_description(webpage, default=None)
 
         if not timestamp:
             timestamp = parse_iso8601(self._search_regex(
@@ -795,14 +812,6 @@ class BBCIE(BBCCoUkIE):
                             entries.append(self._extract_from_playlist_sxml(
                                 playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
 
-        playlist_title = self._og_search_title(webpage, default=None)
-        playlist_title = playlist_title or self._html_search_regex(
-            r'<title>(.*?)</title>', webpage, 'playlist title')
-
-        playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title)
-
-        playlist_description = self._og_search_description(webpage, default=None)
-
         if entries:
             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
author	Sergey M․ <dstftw@gmail.com>	2016-03-13 15:54:56 +0600
committer	Sergey M․ <dstftw@gmail.com>	2016-03-13 15:54:56 +0600
commit	0e832c2c97c62f67593ad356ea6d507778c56759 (patch)
tree	1ba6898ba85a33f3e4d1e879b9a422a969ba7d63 /youtube_dl/extractor
parent	8e4aa7bf18af4403bf98742270483f3b9cfbdeb6 (diff)