[bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774)

author: dirkf <fieldhouse@gmx.net> 2021-04-20 20:51:55 +0100
committer: GitHub <noreply@github.com> 2021-04-21 02:51:55 +0700
commit: 41920fc80e4fe4a8996aeb31a04826a5a2534814 (patch)
tree: 35f22561392d8444fae689147a26f920f15e9ef0 /youtube_dl/extractor/bbc.py
parent: 9f6c03a00602eb1119e43a522cf50682f6d6a6dd (diff)
1 file changed, 24 insertions, 1 deletion
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index e8d000bbb..71ea25881 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -11,6 +11,7 @@ from ..compat import (
     compat_etree_Element,
     compat_HTTPError,
     compat_parse_qs,
+    compat_str,
     compat_urllib_parse_urlparse,
     compat_urlparse,
 )
@@ -25,8 +26,10 @@ from ..utils import (
     js_to_json,
     parse_duration,
     parse_iso8601,
+    strip_or_none,
     try_get,
     unescapeHTML,
+    unified_timestamp,
     url_or_none,
     urlencode_postdata,
     urljoin,
@@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE):
         'only_matching': True,
     }, {
         # custom redirection to www.bbc.com
+        # also, video with window.__INITIAL_DATA__
         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
-        'only_matching': True,
+        'info_dict': {
+            'id': 'p02xzws1',
+            'ext': 'mp4',
+            'title': "Pluto may have 'nitrogen glaciers'",
+            'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.",
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1437785037,
+            'upload_date': '20150725',
+        },
     }, {
         # single video article embedded with data-media-vpid
         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
@@ -1164,12 +1176,23 @@ class BBCIE(BBCCoUkIE):
                         continue
                     formats, subtitles = self._download_media_selector(item_id)
                     self._sort_formats(formats)
+                    item_desc = try_get(
+                        media,
+                        lambda x: x['summary']['blocks'][0]['model']['text'],
+                        compat_str)
+                    item_time = None
+                    for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
+                        if try_get(meta, lambda x: x['label']) == 'Published':
+                            item_time = unified_timestamp(meta.get('timestamp'))
+                            break
                     entries.append({
                         'id': item_id,
                         'title': item_title,
                         'thumbnail': item.get('holdingImageUrl'),
                         'formats': formats,
                         'subtitles': subtitles,
+                        'timestamp': item_time,
+                        'description': strip_or_none(item_desc),
                     })
             for resp in (initial_data.get('data') or {}).values():
                 name = resp.get('name')
author	dirkf <fieldhouse@gmx.net>	2021-04-20 20:51:55 +0100
committer	GitHub <noreply@github.com>	2021-04-21 02:51:55 +0700
commit	41920fc80e4fe4a8996aeb31a04826a5a2534814 (patch)
tree	35f22561392d8444fae689147a26f920f15e9ef0 /youtube_dl/extractor/bbc.py
parent	9f6c03a00602eb1119e43a522cf50682f6d6a6dd (diff)