aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/bbc.py
diff options
context:
space:
mode:
authorpukkandan <pukkandan.ytdlp@gmail.com>2022-01-30 01:07:28 +0530
committerpukkandan <pukkandan.ytdlp@gmail.com>2022-01-30 01:07:28 +0530
commita3373da70c97d356bd4927eff403abd261dd8f9f (patch)
treefb77c8f297d7ca1bd6a6f3e37a4c0a4b12399db5 /youtube_dl/extractor/bbc.py
parent2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 (diff)
parent5014bd67c22b421207b2650d4dc874b95b36dda1 (diff)
Merge branch 'UP/youtube-dl' into dl/YoutubeSearchURLIE
Diffstat (limited to 'youtube_dl/extractor/bbc.py')
-rw-r--r--youtube_dl/extractor/bbc.py297
1 files changed, 273 insertions, 24 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index b4daee54e..247d982ce 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -1,31 +1,39 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import itertools
+import json
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_etree_Element,
+ compat_HTTPError,
+ compat_parse_qs,
+ compat_str,
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
+)
from ..utils import (
+ ExtractorError,
+ OnDemandPagedList,
clean_html,
dict_get,
- ExtractorError,
float_or_none,
get_element_by_class,
int_or_none,
js_to_json,
parse_duration,
parse_iso8601,
+ strip_or_none,
try_get,
unescapeHTML,
+ unified_timestamp,
url_or_none,
urlencode_postdata,
urljoin,
)
-from ..compat import (
- compat_etree_Element,
- compat_HTTPError,
- compat_urlparse,
-)
class BBCCoUkIE(InfoExtractor):
@@ -756,8 +764,17 @@ class BBCIE(BBCCoUkIE):
'only_matching': True,
}, {
# custom redirection to www.bbc.com
+ # also, video with window.__INITIAL_DATA__
'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': 'p02xzws1',
+ 'ext': 'mp4',
+ 'title': "Pluto may have 'nitrogen glaciers'",
+ 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ 'timestamp': 1437785037,
+ 'upload_date': '20150725',
+ },
}, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
@@ -793,11 +810,25 @@ class BBCIE(BBCCoUkIE):
'description': 'Learn English words and phrases from this story',
},
'add_ie': [BBCCoUkIE.ie_key()],
+ }, {
+ # BBC Reel
+ 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
+ 'info_dict': {
+ 'id': 'p07c6sb9',
+ 'ext': 'mp4',
+ 'title': 'How positive thinking is harming your happiness',
+ 'alt_title': 'The downsides of positive thinking',
+ 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+ 'duration': 235,
+ 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
+ 'upload_date': '20190604',
+ 'categories': ['Psychology'],
+ },
}]
@classmethod
def suitable(cls, url):
- EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+ EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
else super(BBCIE, cls).suitable(url))
@@ -929,7 +960,7 @@ class BBCIE(BBCCoUkIE):
else:
entry['title'] = info['title']
entry['formats'].extend(info['formats'])
- except Exception as e:
+ except ExtractorError as e:
# Some playlist URL may fail with 500, at the same time
# the other one may work fine (e.g.
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
@@ -980,6 +1011,37 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles,
}
+ # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
+ initial_data = self._parse_json(self._html_search_regex(
+ r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
+ webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
+ if initial_data:
+ init_data = try_get(
+ initial_data, lambda x: x['initData']['items'][0], dict) or {}
+ smp_data = init_data.get('smpData') or {}
+ clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
+ version_id = clip_data.get('versionID')
+ if version_id:
+ title = smp_data['title']
+ formats, subtitles = self._download_media_selector(version_id)
+ self._sort_formats(formats)
+ image_url = smp_data.get('holdingImageURL')
+ display_date = init_data.get('displayDate')
+ topic_title = init_data.get('topicTitle')
+
+ return {
+ 'id': version_id,
+ 'title': title,
+ 'formats': formats,
+ 'alt_title': init_data.get('shortTitle'),
+ 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
+ 'description': smp_data.get('summary') or init_data.get('shortSummary'),
+ 'upload_date': display_date.replace('-', '') if display_date else None,
+ 'subtitles': subtitles,
+ 'duration': int_or_none(clip_data.get('duration')),
+ 'categories': [topic_title] if topic_title else None,
+ }
+
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video
# seems to be always related to the first one
@@ -1041,7 +1103,7 @@ class BBCIE(BBCCoUkIE):
thumbnail = None
image_url = current_programme.get('image_url')
if image_url:
- thumbnail = image_url.replace('{recipe}', '1920x1920')
+ thumbnail = image_url.replace('{recipe}', 'raw')
return {
'id': programme_id,
'title': title,
@@ -1114,12 +1176,29 @@ class BBCIE(BBCCoUkIE):
continue
formats, subtitles = self._download_media_selector(item_id)
self._sort_formats(formats)
+ item_desc = None
+ blocks = try_get(media, lambda x: x['summary']['blocks'], list)
+ if blocks:
+ summary = []
+ for block in blocks:
+ text = try_get(block, lambda x: x['model']['text'], compat_str)
+ if text:
+ summary.append(text)
+ if summary:
+ item_desc = '\n\n'.join(summary)
+ item_time = None
+ for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
+ if try_get(meta, lambda x: x['label']) == 'Published':
+ item_time = unified_timestamp(meta.get('timestamp'))
+ break
entries.append({
'id': item_id,
'title': item_title,
'thumbnail': item.get('holdingImageUrl'),
'formats': formats,
'subtitles': subtitles,
+ 'timestamp': item_time,
+ 'description': strip_or_none(item_desc),
})
for resp in (initial_data.get('data') or {}).values():
name = resp.get('name')
@@ -1293,21 +1372,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor):
playlist_id, title, description)
-class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
- IE_NAME = 'bbc.co.uk:iplayer:playlist'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
- _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
- _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
+class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
+ _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+
+ @staticmethod
+ def _get_default(episode, key, default_key='default'):
+ return try_get(episode, lambda x: x[key][default_key])
+
+ def _get_description(self, data):
+ synopsis = data.get(self._DESCRIPTION_KEY) or {}
+ return dict_get(synopsis, ('large', 'medium', 'small'))
+
+ def _fetch_page(self, programme_id, per_page, series_id, page):
+ elements = self._get_elements(self._call_api(
+ programme_id, per_page, page + 1, series_id))
+ for element in elements:
+ episode = self._get_episode(element)
+ episode_id = episode.get('id')
+ if not episode_id:
+ continue
+ thumbnail = None
+ image = self._get_episode_image(episode)
+ if image:
+ thumbnail = image.replace('{recipe}', 'raw')
+ category = self._get_default(episode, 'labels', 'category')
+ yield {
+ '_type': 'url',
+ 'id': episode_id,
+ 'title': self._get_episode_field(episode, 'subtitle'),
+ 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
+ 'thumbnail': thumbnail,
+ 'description': self._get_description(episode),
+ 'categories': [category] if category else None,
+ 'series': self._get_episode_field(episode, 'title'),
+ 'ie_key': BBCCoUkIE.ie_key(),
+ }
+
+ def _real_extract(self, url):
+ pid = self._match_id(url)
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ series_id = qs.get('seriesId', [None])[0]
+ page = qs.get('page', [None])[0]
+ per_page = 36 if page else self._PAGE_SIZE
+ fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
+ entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
+ playlist_data = self._get_playlist_data(self._call_api(pid, 1))
+ return self.playlist_result(
+ entries, pid, self._get_playlist_title(playlist_data),
+ self._get_description(playlist_data))
+
+
+class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
+ IE_NAME = 'bbc.co.uk:iplayer:episodes'
+ _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
_TESTS = [{
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
'info_dict': {
'id': 'b05rcz9v',
'title': 'The Disappearance',
- 'description': 'French thriller serial about a missing teenager.',
+ 'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
+ },
+ 'playlist_mincount': 8,
+ }, {
+ # all seasons
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
+ 'info_dict': {
+ 'id': 'b094m5t9',
+ 'title': 'Doctor Foster',
+ 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
+ },
+ 'playlist_mincount': 10,
+ }, {
+ # explicit season
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
+ 'info_dict': {
+ 'id': 'b094m5t9',
+ 'title': 'Doctor Foster',
+ 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
},
- 'playlist_mincount': 6,
- 'skip': 'This programme is not currently available on BBC iPlayer',
+ 'playlist_mincount': 5,
}, {
+ # all pages
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
+ 'info_dict': {
+ 'id': 'm0004c4v',
+ 'title': 'Beechgrove',
+ 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+ },
+ 'playlist_mincount': 37,
+ }, {
+ # explicit page
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
+ 'info_dict': {
+ 'id': 'm0004c4v',
+ 'title': 'Beechgrove',
+ 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+ },
+ 'playlist_mincount': 1,
+ }]
+ _PAGE_SIZE = 100
+ _DESCRIPTION_KEY = 'synopsis'
+
+ def _get_episode_image(self, episode):
+ return self._get_default(episode, 'image')
+
+ def _get_episode_field(self, episode, field):
+ return self._get_default(episode, field)
+
+ @staticmethod
+ def _get_elements(data):
+ return data['entities']['results']
+
+ @staticmethod
+ def _get_episode(element):
+ return element.get('episode') or {}
+
+ def _call_api(self, pid, per_page, page=1, series_id=None):
+ variables = {
+ 'id': pid,
+ 'page': page,
+ 'perPage': per_page,
+ }
+ if series_id:
+ variables['sliceId'] = series_id
+ return self._download_json(
+ 'https://graph.ibl.api.bbc.co.uk/', pid, headers={
+ 'Content-Type': 'application/json'
+ }, data=json.dumps({
+ 'id': '5692d93d5aac8d796a0305e895e61551',
+ 'variables': variables,
+ }).encode('utf-8'))['data']['programme']
+
+ @staticmethod
+ def _get_playlist_data(data):
+ return data
+
+ def _get_playlist_title(self, data):
+ return self._get_default(data, 'title')
+
+
+class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
+ IE_NAME = 'bbc.co.uk:iplayer:group'
+ _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
+ _TESTS = [{
# Available for over a year unlike 30 days for most other programmes
'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
'info_dict': {
@@ -1316,14 +1523,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
},
'playlist_mincount': 10,
+ }, {
+ # all pages
+ 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
+ 'info_dict': {
+ 'id': 'p081d7j7',
+ 'title': 'Music in Scotland',
+ 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+ },
+ 'playlist_mincount': 47,
+ }, {
+ # explicit page
+ 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
+ 'info_dict': {
+ 'id': 'p081d7j7',
+ 'title': 'Music in Scotland',
+ 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+ },
+ 'playlist_mincount': 11,
}]
+ _PAGE_SIZE = 200
+ _DESCRIPTION_KEY = 'synopses'
- def _extract_title_and_description(self, webpage):
- title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
- description = self._search_regex(
- r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
- webpage, 'description', fatal=False, group='value')
- return title, description
+ def _get_episode_image(self, episode):
+ return self._get_default(episode, 'images', 'standard')
+
+ def _get_episode_field(self, episode, field):
+ return episode.get(field)
+
+ @staticmethod
+ def _get_elements(data):
+ return data['elements']
+
+ @staticmethod
+ def _get_episode(element):
+ return element
+
+ def _call_api(self, pid, per_page, page=1, series_id=None):
+ return self._download_json(
+ 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
+ pid, query={
+ 'page': page,
+ 'per_page': per_page,
+ })['group_episodes']
+
+ @staticmethod
+ def _get_playlist_data(data):
+ return data['group']
+
+ def _get_playlist_title(self, data):
+ return data.get('title')
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):