aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRemita Amine <remitamine@gmail.com>2020-11-24 22:49:04 +0100
committerRemita Amine <remitamine@gmail.com>2020-11-24 22:49:04 +0100
commit6c35de4c6bc13c307215cd2f8977b7ac09de28fb (patch)
tree767270126d0afffea84f826b827963d87657b34a
parent579d43951d7d60b5027ae224973b5f541b51c74a (diff)
[bbc] fix BBC News videos extraction
-rw-r--r--youtube_dl/extractor/bbc.py35
1 files changed, 34 insertions, 1 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 002c39c39..ca93b7be5 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE):
group_id = self._search_regex(
r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
webpage, 'group id', default=None)
- if playlist_id:
+ if group_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())
@@ -1118,6 +1118,39 @@ class BBCIE(BBCCoUkIE):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
+ initial_data = self._parse_json(self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
+ 'preload state', default='{}'), playlist_id, fatal=False)
+ if initial_data:
+ def parse_media(media):
+ if not media:
+ return
+ for item in (try_get(media, lambda x: x['media']['items'], list) or []):
+ item_id = item.get('id')
+ item_title = item.get('title')
+ if not (item_id and item_title):
+ continue
+ formats, subtitles = self._download_media_selector(item_id)
+ self._sort_formats(formats)
+ entries.append({
+ 'id': item_id,
+ 'title': item_title,
+ 'thumbnail': item.get('holdingImageUrl'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ for resp in (initial_data.get('data') or {}).values():
+ name = resp.get('name')
+ if name == 'media-experience':
+ parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
+ elif name == 'article':
+ for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
+ if block.get('type') != 'media':
+ continue
+ parse_media(block.get('model'))
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),