Merge branch 'UP/youtube-dl' into dl/YoutubeSearchURLIE

author: pukkandan <pukkandan.ytdlp@gmail.com> 2022-01-30 01:07:28 +0530
committer: pukkandan <pukkandan.ytdlp@gmail.com> 2022-01-30 01:07:28 +0530
commit: a3373da70c97d356bd4927eff403abd261dd8f9f (patch)
tree: fb77c8f297d7ca1bd6a6f3e37a4c0a4b12399db5 /youtube_dl/extractor/bbc.py
parent: 2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 (diff)
parent: 5014bd67c22b421207b2650d4dc874b95b36dda1 (diff)
1 files changed, 273 insertions, 24 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index b4daee54e..247d982ce 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -1,31 +1,39 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import functools
 import itertools
+import json
 import re
 
 from .common import InfoExtractor
+from ..compat import (
+    compat_etree_Element,
+    compat_HTTPError,
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlparse,
+    compat_urlparse,
+)
 from ..utils import (
+    ExtractorError,
+    OnDemandPagedList,
     clean_html,
     dict_get,
-    ExtractorError,
     float_or_none,
     get_element_by_class,
     int_or_none,
     js_to_json,
     parse_duration,
     parse_iso8601,
+    strip_or_none,
     try_get,
     unescapeHTML,
+    unified_timestamp,
     url_or_none,
     urlencode_postdata,
     urljoin,
 )
-from ..compat import (
-    compat_etree_Element,
-    compat_HTTPError,
-    compat_urlparse,
-)
 
 
 class BBCCoUkIE(InfoExtractor):
@@ -756,8 +764,17 @@ class BBCIE(BBCCoUkIE):
         'only_matching': True,
     }, {
         # custom redirection to www.bbc.com
+        # also, video with window.__INITIAL_DATA__
         'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
-        'only_matching': True,
+        'info_dict': {
+            'id': 'p02xzws1',
+            'ext': 'mp4',
+            'title': "Pluto may have 'nitrogen glaciers'",
+            'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1437785037,
+            'upload_date': '20150725',
+        },
     }, {
         # single video article embedded with data-media-vpid
         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
@@ -793,11 +810,25 @@ class BBCIE(BBCCoUkIE):
             'description': 'Learn English words and phrases from this story',
         },
         'add_ie': [BBCCoUkIE.ie_key()],
+    }, {
+        # BBC Reel
+        'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
+        'info_dict': {
+            'id': 'p07c6sb9',
+            'ext': 'mp4',
+            'title': 'How positive thinking is harming your happiness',
+            'alt_title': 'The downsides of positive thinking',
+            'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+            'duration': 235,
+            'thumbnail': r're:https?://.+/p07c9dsr.jpg',
+            'upload_date': '20190604',
+            'categories': ['Psychology'],
+        },
     }]
 
     @classmethod
     def suitable(cls, url):
-        EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+        EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
         return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
                 else super(BBCIE, cls).suitable(url))
 
@@ -929,7 +960,7 @@ class BBCIE(BBCCoUkIE):
                                     else:
                                         entry['title'] = info['title']
                                         entry['formats'].extend(info['formats'])
-                                except Exception as e:
+                                except ExtractorError as e:
                                     # Some playlist URL may fail with 500, at the same time
                                     # the other one may work fine (e.g.
                                     # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
@@ -980,6 +1011,37 @@ class BBCIE(BBCCoUkIE):
                 'subtitles': subtitles,
             }
 
+        # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
+        initial_data = self._parse_json(self._html_search_regex(
+            r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
+            webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
+        if initial_data:
+            init_data = try_get(
+                initial_data, lambda x: x['initData']['items'][0], dict) or {}
+            smp_data = init_data.get('smpData') or {}
+            clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
+            version_id = clip_data.get('versionID')
+            if version_id:
+                title = smp_data['title']
+                formats, subtitles = self._download_media_selector(version_id)
+                self._sort_formats(formats)
+                image_url = smp_data.get('holdingImageURL')
+                display_date = init_data.get('displayDate')
+                topic_title = init_data.get('topicTitle')
+
+                return {
+                    'id': version_id,
+                    'title': title,
+                    'formats': formats,
+                    'alt_title': init_data.get('shortTitle'),
+                    'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
+                    'description': smp_data.get('summary') or init_data.get('shortSummary'),
+                    'upload_date': display_date.replace('-', '') if display_date else None,
+                    'subtitles': subtitles,
+                    'duration': int_or_none(clip_data.get('duration')),
+                    'categories': [topic_title] if topic_title else None,
+                }
+
         # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
         # There are several setPayload calls may be present but the video
         # seems to be always related to the first one
@@ -1041,7 +1103,7 @@ class BBCIE(BBCCoUkIE):
                 thumbnail = None
                 image_url = current_programme.get('image_url')
                 if image_url:
-                    thumbnail = image_url.replace('{recipe}', '1920x1920')
+                    thumbnail = image_url.replace('{recipe}', 'raw')
                 return {
                     'id': programme_id,
                     'title': title,
@@ -1114,12 +1176,29 @@ class BBCIE(BBCCoUkIE):
                         continue
                     formats, subtitles = self._download_media_selector(item_id)
                     self._sort_formats(formats)
+                    item_desc = None
+                    blocks = try_get(media, lambda x: x['summary']['blocks'], list)
+                    if blocks:
+                        summary = []
+                        for block in blocks:
+                            text = try_get(block, lambda x: x['model']['text'], compat_str)
+                            if text:
+                                summary.append(text)
+                        if summary:
+                            item_desc = '\n\n'.join(summary)
+                    item_time = None
+                    for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
+                        if try_get(meta, lambda x: x['label']) == 'Published':
+                            item_time = unified_timestamp(meta.get('timestamp'))
+                            break
                     entries.append({
                         'id': item_id,
                         'title': item_title,
                         'thumbnail': item.get('holdingImageUrl'),
                         'formats': formats,
                         'subtitles': subtitles,
+                        'timestamp': item_time,
+                        'description': strip_or_none(item_desc),
                     })
             for resp in (initial_data.get('data') or {}).values():
                 name = resp.get('name')
@@ -1293,21 +1372,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor):
             playlist_id, title, description)
 
 
-class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
-    IE_NAME = 'bbc.co.uk:iplayer:playlist'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
-    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
-    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
+class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
+    _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+
+    @staticmethod
+    def _get_default(episode, key, default_key='default'):
+        return try_get(episode, lambda x: x[key][default_key])
+
+    def _get_description(self, data):
+        synopsis = data.get(self._DESCRIPTION_KEY) or {}
+        return dict_get(synopsis, ('large', 'medium', 'small'))
+
+    def _fetch_page(self, programme_id, per_page, series_id, page):
+        elements = self._get_elements(self._call_api(
+            programme_id, per_page, page + 1, series_id))
+        for element in elements:
+            episode = self._get_episode(element)
+            episode_id = episode.get('id')
+            if not episode_id:
+                continue
+            thumbnail = None
+            image = self._get_episode_image(episode)
+            if image:
+                thumbnail = image.replace('{recipe}', 'raw')
+            category = self._get_default(episode, 'labels', 'category')
+            yield {
+                '_type': 'url',
+                'id': episode_id,
+                'title': self._get_episode_field(episode, 'subtitle'),
+                'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
+                'thumbnail': thumbnail,
+                'description': self._get_description(episode),
+                'categories': [category] if category else None,
+                'series': self._get_episode_field(episode, 'title'),
+                'ie_key': BBCCoUkIE.ie_key(),
+            }
+
+    def _real_extract(self, url):
+        pid = self._match_id(url)
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        series_id = qs.get('seriesId', [None])[0]
+        page = qs.get('page', [None])[0]
+        per_page = 36 if page else self._PAGE_SIZE
+        fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
+        entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
+        playlist_data = self._get_playlist_data(self._call_api(pid, 1))
+        return self.playlist_result(
+            entries, pid, self._get_playlist_title(playlist_data),
+            self._get_description(playlist_data))
+
+
+class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
+    IE_NAME = 'bbc.co.uk:iplayer:episodes'
+    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
     _TESTS = [{
         'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
         'info_dict': {
             'id': 'b05rcz9v',
             'title': 'The Disappearance',
-            'description': 'French thriller serial about a missing teenager.',
+            'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
+        },
+        'playlist_mincount': 8,
+    }, {
+        # all seasons
+        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
+        'info_dict': {
+            'id': 'b094m5t9',
+            'title': 'Doctor Foster',
+            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
+        },
+        'playlist_mincount': 10,
+    }, {
+        # explicit season
+        'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
+        'info_dict': {
+            'id': 'b094m5t9',
+            'title': 'Doctor Foster',
+            'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
         },
-        'playlist_mincount': 6,
-        'skip': 'This programme is not currently available on BBC iPlayer',
+        'playlist_mincount': 5,
     }, {
+        # all pages
+        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
+        'info_dict': {
+            'id': 'm0004c4v',
+            'title': 'Beechgrove',
+            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+        },
+        'playlist_mincount': 37,
+    }, {
+        # explicit page
+        'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
+        'info_dict': {
+            'id': 'm0004c4v',
+            'title': 'Beechgrove',
+            'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+        },
+        'playlist_mincount': 1,
+    }]
+    _PAGE_SIZE = 100
+    _DESCRIPTION_KEY = 'synopsis'
+
+    def _get_episode_image(self, episode):
+        return self._get_default(episode, 'image')
+
+    def _get_episode_field(self, episode, field):
+        return self._get_default(episode, field)
+
+    @staticmethod
+    def _get_elements(data):
+        return data['entities']['results']
+
+    @staticmethod
+    def _get_episode(element):
+        return element.get('episode') or {}
+
+    def _call_api(self, pid, per_page, page=1, series_id=None):
+        variables = {
+            'id': pid,
+            'page': page,
+            'perPage': per_page,
+        }
+        if series_id:
+            variables['sliceId'] = series_id
+        return self._download_json(
+            'https://graph.ibl.api.bbc.co.uk/', pid, headers={
+                'Content-Type': 'application/json'
+            }, data=json.dumps({
+                'id': '5692d93d5aac8d796a0305e895e61551',
+                'variables': variables,
+            }).encode('utf-8'))['data']['programme']
+
+    @staticmethod
+    def _get_playlist_data(data):
+        return data
+
+    def _get_playlist_title(self, data):
+        return self._get_default(data, 'title')
+
+
+class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
+    IE_NAME = 'bbc.co.uk:iplayer:group'
+    _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
+    _TESTS = [{
         # Available for over a year unlike 30 days for most other programmes
         'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
         'info_dict': {
@@ -1316,14 +1523,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
             'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
         },
         'playlist_mincount': 10,
+    }, {
+        # all pages
+        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
+        'info_dict': {
+            'id': 'p081d7j7',
+            'title': 'Music in Scotland',
+            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+        },
+        'playlist_mincount': 47,
+    }, {
+        # explicit page
+        'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
+        'info_dict': {
+            'id': 'p081d7j7',
+            'title': 'Music in Scotland',
+            'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+        },
+        'playlist_mincount': 11,
     }]
+    _PAGE_SIZE = 200
+    _DESCRIPTION_KEY = 'synopses'
 
-    def _extract_title_and_description(self, webpage):
-        title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
-        description = self._search_regex(
-            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
-            webpage, 'description', fatal=False, group='value')
-        return title, description
+    def _get_episode_image(self, episode):
+        return self._get_default(episode, 'images', 'standard')
+
+    def _get_episode_field(self, episode, field):
+        return episode.get(field)
+
+    @staticmethod
+    def _get_elements(data):
+        return data['elements']
+
+    @staticmethod
+    def _get_episode(element):
+        return element
+
+    def _call_api(self, pid, per_page, page=1, series_id=None):
+        return self._download_json(
+            'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
+            pid, query={
+                'page': page,
+                'per_page': per_page,
+            })['group_episodes']
+
+    @staticmethod
+    def _get_playlist_data(data):
+        return data['group']
+
+    def _get_playlist_title(self, data):
+        return data.get('title')
 
 
 class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
author	pukkandan <pukkandan.ytdlp@gmail.com>	2022-01-30 01:07:28 +0530
committer	pukkandan <pukkandan.ytdlp@gmail.com>	2022-01-30 01:07:28 +0530
commit	a3373da70c97d356bd4927eff403abd261dd8f9f (patch)
tree	fb77c8f297d7ca1bd6a6f3e37a4c0a4b12399db5 /youtube_dl/extractor/bbc.py
parent	2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 (diff)
parent	5014bd67c22b421207b2650d4dc874b95b36dda1 (diff)