about | summary | refs | log | tree | commit | diff
path: root/youtube_dl
diff options
context:
space:
mode:
author: Sergey M․ <dstftw@gmail.com> 2016-03-13 15:54:56 +0600
committer: Sergey M․ <dstftw@gmail.com> 2016-03-13 15:54:56 +0600
commit: 0e832c2c97c62f67593ad356ea6d507778c56759 (patch)
tree: 1ba6898ba85a33f3e4d1e879b9a422a969ba7d63 /youtube_dl
parent: 8e4aa7bf18af4403bf98742270483f3b9cfbdeb6 (diff)
[bbc] Improve title and description extraction (Closes #8826, closes #8822)
Diffstat (limited to 'youtube_dl')
-rw-r--r-- youtube_dl/extractor/bbc.py 27
1 file changed, 18 insertions, 9 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index f4d8b4a2f..497ebfd72 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -564,6 +564,14 @@ class BBCIE(BBCCoUkIE):
},
'playlist_count': 18,
}, {
+ # school report playlist with single video
+ 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+ 'info_dict': {
+ 'id': '35744779',
+ 'title': 'School which breaks down barriers in Jerusalem',
+ },
+ 'playlist_count': 1,
+ }, {
# single video embedded with data-playable containing vpid
'url': 'http://www.bbc.com/news/world-europe-32041533',
'info_dict': {
@@ -734,8 +742,17 @@ class BBCIE(BBCCoUkIE):
json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
timestamp = json_ld_info.get('timestamp')
+
playlist_title = json_ld_info.get('title')
- playlist_description = json_ld_info.get('description')
+ if not playlist_title:
+ playlist_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
+ r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ if playlist_title:
+ playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+ playlist_description = json_ld_info.get(
+ 'description') or self._og_search_description(webpage, default=None)
if not timestamp:
timestamp = parse_iso8601(self._search_regex(
@@ -795,14 +812,6 @@ class BBCIE(BBCCoUkIE):
entries.append(self._extract_from_playlist_sxml(
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
- playlist_title = self._og_search_title(webpage, default=None)
- playlist_title = playlist_title or self._html_search_regex(
- r'<title>(.*?)</title>', webpage, 'playlist title')
-
- playlist_title = self._search_regex(r'(.+)\s*-\s*BBC', playlist_title, 'title', default=playlist_title)
-
- playlist_description = self._og_search_description(webpage, default=None)
-
if entries:
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)