aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/bbc.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/bbc.py')
-rw-r--r--youtube_dl/extractor/bbc.py659
1 files changed, 565 insertions, 94 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index dd65b8d86..378b52f4f 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -1,66 +1,81 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
+import functools
import itertools
+import json
+import re
from .common import InfoExtractor
+from ..compat import (
+ compat_etree_Element,
+ compat_HTTPError,
+ compat_parse_qs,
+ compat_str,
+ compat_urllib_error,
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
+)
from ..utils import (
- dict_get,
ExtractorError,
+ OnDemandPagedList,
+ clean_html,
+ dict_get,
float_or_none,
+ get_element_by_class,
int_or_none,
+ js_to_json,
parse_duration,
parse_iso8601,
+ strip_or_none,
try_get,
unescapeHTML,
-)
-from ..compat import (
- compat_etree_fromstring,
- compat_HTTPError,
- compat_urlparse,
+ unified_timestamp,
+ url_or_none,
+ urlencode_postdata,
+ urljoin,
)
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _ID_REGEX = r'[pb][\da-z]{7}'
+ _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
(?:
programmes/(?!articles/)|
iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
- music/clips[/#]|
- radio/player/
+ music/(?:clips|audiovideo/popular)[/#]|
+ radio/player/|
+ sounds/play/|
+ events/[^/]+/play/[^/]+/
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
- _MEDIASELECTOR_URLS = [
+ _LOGIN_URL = 'https://account.bbc.com/signin'
+ _NETRC_MACHINE = 'bbc'
+
+ _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
+ _MEDIA_SETS = [
- # Provides HQ HLS streams with even better quality that pc mediaset but fails
+ # Provides HQ HLS streams with even better quality than pc mediaset but fails
# with geolocation in some cases when it's even not geo restricted at all (e.g.
# http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+ 'iptv-all',
+ 'pc',
]
- _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
_EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
- _NAMESPACES = (
- _MEDIASELECTION_NS,
- _EMP_PLAYLIST_NS,
- )
-
_TESTS = [
{
'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
'info_dict': {
'id': 'b039d07m',
'ext': 'flv',
- 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4',
+ 'title': 'Kaleidoscope, Leonard Cohen',
'description': 'The Canadian poet and songwriter reflects on his musical career.',
},
'params': {
@@ -198,7 +213,7 @@ class BBCCoUkIE(InfoExtractor):
},
'skip': 'Now it\'s really geo-restricted',
}, {
- # compact player (https://github.com/rg3/youtube-dl/issues/8147)
+ # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
'info_dict': {
'id': 'p028bfkj',
@@ -211,6 +226,20 @@ class BBCCoUkIE(InfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
+ 'note': 'Audio',
+ 'info_dict': {
+ 'id': 'm0007jz9',
+ 'ext': 'mp4',
+ 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
+ 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
+ 'duration': 9840,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
'only_matching': True,
}, {
@@ -222,10 +251,52 @@ class BBCCoUkIE(InfoExtractor):
}, {
'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
'only_matching': True,
- }
- ]
+ }, {
+ 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
+ 'only_matching': True,
+ }]
+
+ def _login(self):
+ """Sign in with the configured credentials; do nothing when no username is set.
+
+ Raises ExtractorError when the URL reached after posting the form still
+ contains the sign-in URL.
+ """
+ username, password = self._get_login_info()
+ # No username available: skip the login step entirely
+ if username is None:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading signin page')
+
+ # Start from the hidden inputs of the sign-in page and add the credentials
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'username': username,
+ 'password': password,
+ })
+
+ # Post to the form's action URL (resolved against the sign-in URL),
+ # falling back to the sign-in URL itself when no action is found
+ post_url = urljoin(self._LOGIN_URL, self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+ 'post url', default=self._LOGIN_URL, group='url'))
+
+ response, urlh = self._download_webpage_handle(
+ post_url, None, 'Logging in', data=urlencode_postdata(login_form),
+ headers={'Referer': self._LOGIN_URL})
- _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
+ # Ending up on a URL that contains the sign-in URL is treated as a failed login;
+ # the text of the 'form-message' element, when present, is reported as the reason
+ if self._LOGIN_URL in urlh.geturl():
+ error = clean_html(get_element_by_class('form-message', response))
+ if error:
+ raise ExtractorError(
+ 'Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+ def _real_initialize(self):
+ """Run the login step."""
+ self._login()
class MediaSelectionError(Exception):
def __init__(self, id):
@@ -238,34 +309,32 @@ class BBCCoUkIE(InfoExtractor):
def _extract_items(self, playlist):
+ """Return the direct `item` children of `playlist` in the EMP playlist namespace."""
return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
- def _findall_ns(self, element, xpath):
- elements = []
- for ns in self._NAMESPACES:
- elements.extend(element.findall(xpath % ns))
- return elements
-
def _extract_medias(self, media_selection):
+ """Return the 'media' list of a media selection mapping (empty list when absent).
+
+ Raises BBCCoUkIE.MediaSelectionError, carrying the 'result' value, when the
+ mapping has a truthy 'result' entry.
+ """
- error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
- if error is None:
- media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
- if error is not None:
- raise BBCCoUkIE.MediaSelectionError(error.get('id'))
- return self._findall_ns(media_selection, './{%s}media')
+ error = media_selection.get('result')
+ if error:
+ raise BBCCoUkIE.MediaSelectionError(error)
+ return media_selection.get('media') or []
def _extract_connections(self, media):
+ """Return the 'connection' entry of `media`, or an empty list when it is missing or falsy."""
- return self._findall_ns(media, './{%s}connection')
+ return media.get('connection') or []
def _get_subtitles(self, media, programme_id):
+ """Return a subtitles dict built from the first usable caption connection of `media`.
+
+ A connection is usable when its 'href' passes url_or_none() and the captions
+ download yields an XML element. The result is stored under the 'en' key with
+ the 'ttml' extension; an empty dict is returned when no connection qualifies.
+ """
subtitles = {}
for connection in self._extract_connections(media):
- captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
- lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
- subtitles[lang] = [
+ cc_url = url_or_none(connection.get('href'))
+ if not cc_url:
+ continue
+ # Non-fatal download: a failed request just moves on to the next connection
+ captions = self._download_xml(
+ cc_url, programme_id, 'Downloading captions', fatal=False)
+ if not isinstance(captions, compat_etree_Element):
+ continue
+ subtitles['en'] = [
{
- 'url': connection.get('href'),
+ # Store the URL that was validated and downloaded above, not the raw 'href'
+ 'url': cc_url,
'ext': 'ttml',
},
]
+ # The first working captions URL is enough
+ break
return subtitles
def _raise_extractor_error(self, media_selection_error):
@@ -275,10 +344,10 @@ class BBCCoUkIE(InfoExtractor):
def _download_media_selector(self, programme_id):
last_exception = None
- for mediaselector_url in self._MEDIASELECTOR_URLS:
+ for media_set in self._MEDIA_SETS:
try:
return self._download_media_selector_url(
- mediaselector_url % programme_id, programme_id)
+ self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
except BBCCoUkIE.MediaSelectionError as e:
if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
last_exception = e
@@ -287,14 +356,9 @@ class BBCCoUkIE(InfoExtractor):
self._raise_extractor_error(last_exception)
def _download_media_selector_url(self, url, programme_id=None):
+ """Download the media selection JSON from `url` and pass it to _process_media_selector.
+
+ 403 and 404 are given to the download call as expected_status, replacing the
+ removed code that read the body of such error responses by hand.
+ """
- try:
- media_selection = self._download_xml(
- url, programme_id, 'Downloading media selection XML')
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
- media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
- else:
- raise
+ media_selection = self._download_json(
+ url, programme_id, 'Downloading media selection JSON',
+ expected_status=(403, 404))
return self._process_media_selector(media_selection, programme_id)
def _process_media_selector(self, media_selection, programme_id):
@@ -307,7 +371,6 @@ class BBCCoUkIE(InfoExtractor):
if kind in ('video', 'audio'):
bitrate = int_or_none(media.get('bitrate'))
encoding = media.get('encoding')
- service = media.get('service')
width = int_or_none(media.get('width'))
height = int_or_none(media.get('height'))
file_size = int_or_none(media.get('media_file_size'))
@@ -322,8 +385,6 @@ class BBCCoUkIE(InfoExtractor):
supplier = connection.get('supplier')
transfer_format = connection.get('transferFormat')
format_id = supplier or conn_kind or protocol
- if service:
- format_id = '%s_%s' % (service, format_id)
# ASX playlist
if supplier == 'asx':
for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
@@ -335,23 +396,22 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_mpd_formats(
href, programme_id, mpd_id=format_id, fatal=False))
elif transfer_format == 'hls':
- formats.extend(self._extract_m3u8_formats(
- href, programme_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id=format_id, fatal=False))
- if re.search(self._USP_RE, href):
- usp_formats = self._extract_m3u8_formats(
- re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
- programme_id, ext='mp4', entry_protocol='m3u8_native',
+ # TODO: let expected_status be passed into _extract_xxx_formats() instead
+ try:
+ fmts = self._extract_m3u8_formats(
+ href, programme_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
- for f in usp_formats:
- if f.get('height') and f['height'] > 720:
- continue
- formats.append(f)
+ except ExtractorError as e:
+ if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
+ and e.exc_info[1].code in (403, 404)):
+ raise
+ fmts = []
+ formats.extend(fmts)
elif transfer_format == 'hds':
formats.extend(self._extract_f4m_formats(
href, programme_id, f4m_id=format_id, fatal=False))
else:
- if not service and not supplier and bitrate:
+ if not supplier and bitrate:
format_id += '-%d' % bitrate
fmt = {
'format_id': format_id,
@@ -458,7 +518,7 @@ class BBCCoUkIE(InfoExtractor):
def get_programme_id(item):
def get_from_attributes(item):
- for p in('identifier', 'group'):
+ for p in ('identifier', 'group'):
value = item.get(p)
if value and re.match(r'^[pb][\da-z]{7}$', value):
return value
@@ -483,6 +543,12 @@ class BBCCoUkIE(InfoExtractor):
webpage = self._download_webpage(url, group_id, 'Downloading video page')
+ error = self._search_regex(
+ r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
+ webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
+
programme_id = None
duration = None
@@ -531,16 +597,9 @@ class BBCIE(BBCCoUkIE):
IE_DESC = 'BBC'
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
- _MEDIASELECTOR_URLS = [
- # Provides HQ HLS streams but fails with geolocation in some cases when it's
- # even not geo restricted at all
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
- # Provides more formats, namely direct mp4 links, but fails on some videos with
- # notukerror for non UK (?) users (e.g.
- # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
- 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
- # Provides fewer formats, but works everywhere for everybody (hopefully)
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+ _MEDIA_SETS = [
+ 'mobile-tablet-main',
+ 'pc',
]
_TESTS = [{
@@ -548,7 +607,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/news/world-europe-32668511',
'info_dict': {
'id': 'world-europe-32668511',
- 'title': 'Russia stages massive WW2 parade despite Western boycott',
+ 'title': 'Russia stages massive WW2 parade',
'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
},
'playlist_count': 2,
@@ -714,17 +773,83 @@ class BBCIE(BBCCoUkIE):
'only_matching': True,
}, {
# custom redirection to www.bbc.com
+ # also, video with window.__INITIAL_DATA__
'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': 'p02xzws1',
+ 'ext': 'mp4',
+ 'title': "Pluto may have 'nitrogen glaciers'",
+ 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ 'timestamp': 1437785037,
+ 'upload_date': '20150725',
+ },
+ }, {
+ # video with window.__INITIAL_DATA__ and value as JSON string
+ 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
+ 'info_dict': {
+ 'id': 'p0b71qth',
+ 'ext': 'mp4',
+ 'title': 'Why France is making this woman a national hero',
+ 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ 'timestamp': 1638230731,
+ 'upload_date': '20211130',
+ },
}, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
'only_matching': True,
+ }, {
+ # bbcthreeConfig
+ 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
+ 'info_dict': {
+ 'id': 'p06556y7',
+ 'ext': 'mp4',
+ 'title': 'Things Not To Say to people that live on council estates',
+ 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
+ 'duration': 360,
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ },
+ }, {
+ # window.__PRELOADED_STATE__
+ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
+ 'info_dict': {
+ 'id': 'b0b9z4vz',
+ 'ext': 'mp4',
+ 'title': 'Prom 6: An American in Paris and Turangalila',
+ 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
+ 'uploader': 'Radio 3',
+ 'uploader_id': 'bbc_radio_three',
+ },
+ }, {
+ 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
+ 'info_dict': {
+ 'id': 'p06w9tws',
+ 'ext': 'mp4',
+ 'title': 'md5:2fabf12a726603193a2879a055f72514',
+ 'description': 'Learn English words and phrases from this story',
+ },
+ 'add_ie': [BBCCoUkIE.ie_key()],
+ }, {
+ # BBC Reel
+ 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
+ 'info_dict': {
+ 'id': 'p07c6sb9',
+ 'ext': 'mp4',
+ 'title': 'How positive thinking is harming your happiness',
+ 'alt_title': 'The downsides of positive thinking',
+ 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+ 'duration': 235,
+ 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
+ 'upload_date': '20190604',
+ 'categories': ['Psychology'],
+ },
}]
@classmethod
def suitable(cls, url):
+ """Return False when any extractor in EXCLUDE_IE accepts `url`; otherwise defer to the parent class."""
- EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+ EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
else super(BBCIE, cls).suitable(url))
@@ -856,7 +981,7 @@ class BBCIE(BBCCoUkIE):
else:
entry['title'] = info['title']
entry['formats'].extend(info['formats'])
- except Exception as e:
+ except ExtractorError as e:
# Some playlist URL may fail with 500, at the same time
# the other one may work fine (e.g.
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
@@ -870,6 +995,15 @@ class BBCIE(BBCCoUkIE):
if entries:
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+ # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
+ group_id = self._search_regex(
+ r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
+ webpage, 'group id', default=None)
+ if group_id:
+ return self.url_result(
+ 'https://www.bbc.co.uk/programmes/%s' % group_id,
+ ie=BBCCoUkIE.ie_key())
+
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
[r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
@@ -898,6 +1032,37 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles,
}
+ # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
+ initial_data = self._parse_json(self._html_search_regex(
+ r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
+ webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
+ if initial_data:
+ init_data = try_get(
+ initial_data, lambda x: x['initData']['items'][0], dict) or {}
+ smp_data = init_data.get('smpData') or {}
+ clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
+ version_id = clip_data.get('versionID')
+ if version_id:
+ title = smp_data['title']
+ formats, subtitles = self._download_media_selector(version_id)
+ self._sort_formats(formats)
+ image_url = smp_data.get('holdingImageURL')
+ display_date = init_data.get('displayDate')
+ topic_title = init_data.get('topicTitle')
+
+ return {
+ 'id': version_id,
+ 'title': title,
+ 'formats': formats,
+ 'alt_title': init_data.get('shortTitle'),
+ 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
+ 'description': smp_data.get('summary') or init_data.get('shortSummary'),
+ 'upload_date': display_date.replace('-', '') if display_date else None,
+ 'subtitles': subtitles,
+ 'duration': int_or_none(clip_data.get('duration')),
+ 'categories': [topic_title] if topic_title else None,
+ }
+
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video
# seems to be always related to the first one
@@ -942,6 +1107,142 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles,
}
+ preload_state = self._parse_json(self._search_regex(
+ r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+ 'preload state', default='{}'), playlist_id, fatal=False)
+ if preload_state:
+ current_programme = preload_state.get('programmes', {}).get('current') or {}
+ programme_id = current_programme.get('id')
+ if current_programme and programme_id and current_programme.get('type') == 'playable_item':
+ title = current_programme.get('titles', {}).get('tertiary') or playlist_title
+ formats, subtitles = self._download_media_selector(programme_id)
+ self._sort_formats(formats)
+ synopses = current_programme.get('synopses') or {}
+ network = current_programme.get('network') or {}
+ duration = int_or_none(
+ current_programme.get('duration', {}).get('value'))
+ thumbnail = None
+ image_url = current_programme.get('image_url')
+ if image_url:
+ thumbnail = image_url.replace('{recipe}', 'raw')
+ return {
+ 'id': programme_id,
+ 'title': title,
+ 'description': dict_get(synopses, ('long', 'medium', 'short')),
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'uploader': network.get('short_title'),
+ 'uploader_id': network.get('id'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ bbc3_config = self._parse_json(
+ self._search_regex(
+ r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
+ 'bbcthree config', default='{}'),
+ playlist_id, transform_source=js_to_json, fatal=False) or {}
+ payload = bbc3_config.get('payload') or {}
+ if payload:
+ clip = payload.get('currentClip') or {}
+ clip_vpid = clip.get('vpid')
+ clip_title = clip.get('title')
+ if clip_vpid and clip_title:
+ formats, subtitles = self._download_media_selector(clip_vpid)
+ self._sort_formats(formats)
+ return {
+ 'id': clip_vpid,
+ 'title': clip_title,
+ 'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
+ 'description': clip.get('description'),
+ 'duration': parse_duration(clip.get('duration')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+ bbc3_playlist = try_get(
+ payload, lambda x: x['content']['bbcMedia']['playlist'],
+ dict)
+ if bbc3_playlist:
+ playlist_title = bbc3_playlist.get('title') or playlist_title
+ thumbnail = bbc3_playlist.get('holdingImageURL')
+ entries = []
+ for bbc3_item in bbc3_playlist['items']:
+ programme_id = bbc3_item.get('versionID')
+ if not programme_id:
+ continue
+ formats, subtitles = self._download_media_selector(programme_id)
+ self._sort_formats(formats)
+ entries.append({
+ 'id': programme_id,
+ 'title': playlist_title,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
+ # window.__INITIAL_DATA__ appears either as a quoted JSON string or as a bare object literal
+ initial_data = self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
+ 'quoted preload state', default=None)
+ if initial_data is None:
+ # Bare object form. The default has to be the JSON text '{}' rather than a dict,
+ # because the value is handed to _parse_json() below
+ initial_data = self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
+ 'preload state', default='{}')
+ else:
+ # Quoted form: decode the outer JSON string here, the inner JSON below
+ initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
+ # `or '{}'` keeps a string going into the parser when the non-fatal decode above yielded nothing
+ initial_data = self._parse_json(initial_data or '{}', playlist_id, fatal=False)
+ if initial_data:
+ def parse_media(media):
+ if not media:
+ return
+ for item in (try_get(media, lambda x: x['media']['items'], list) or []):
+ item_id = item.get('id')
+ item_title = item.get('title')
+ if not (item_id and item_title):
+ continue
+ formats, subtitles = self._download_media_selector(item_id)
+ self._sort_formats(formats)
+ item_desc = None
+ blocks = try_get(media, lambda x: x['summary']['blocks'], list)
+ if blocks:
+ summary = []
+ for block in blocks:
+ text = try_get(block, lambda x: x['model']['text'], compat_str)
+ if text:
+ summary.append(text)
+ if summary:
+ item_desc = '\n\n'.join(summary)
+ item_time = None
+ for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
+ if try_get(meta, lambda x: x['label']) == 'Published':
+ item_time = unified_timestamp(meta.get('timestamp'))
+ break
+ entries.append({
+ 'id': item_id,
+ 'title': item_title,
+ 'thumbnail': item.get('holdingImageUrl'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'timestamp': item_time,
+ 'description': strip_or_none(item_desc),
+ })
+ for resp in (initial_data.get('data') or {}).values():
+ name = resp.get('name')
+ if name == 'media-experience':
+ parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
+ elif name == 'article':
+ for block in (try_get(resp,
+ (lambda x: x['data']['blocks'],
+ lambda x: x['data']['content']['model']['blocks'],),
+ list) or []):
+ if block.get('type') != 'media':
+ continue
+ parse_media(block.get('model'))
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
@@ -1102,21 +1403,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor):
playlist_id, title, description)
-class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
- IE_NAME = 'bbc.co.uk:iplayer:playlist'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
- _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
- _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
+class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
+ """Shared playlist logic for iPlayer extractors whose URLs follow _VALID_URL_TMPL.
+
+ The methods below read the following names from self, which this class does
+ not define: _PAGE_SIZE, _DESCRIPTION_KEY, _call_api, _get_elements,
+ _get_episode, _get_episode_image, _get_episode_field, _get_playlist_data
+ and _get_playlist_title.
+ """
+ # '%%s' survives the ID-regex substitution as '%s', to be filled in with the URL path segment
+ _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+
+ @staticmethod
+ def _get_default(episode, key, default_key='default'):
+ """Look up episode[key][default_key] through try_get."""
+ return try_get(episode, lambda x: x[key][default_key])
+
+ def _get_description(self, data):
+ """Pick the 'large', 'medium' or 'small' entry (via dict_get) from data[_DESCRIPTION_KEY]."""
+ synopsis = data.get(self._DESCRIPTION_KEY) or {}
+ return dict_get(synopsis, ('large', 'medium', 'small'))
+
+ def _fetch_page(self, programme_id, per_page, series_id, page):
+ """Yield one 'url'-type entry per element of the given page that has an episode id.
+
+ `page` is zero-based here; the API call receives page + 1.
+ """
+ elements = self._get_elements(self._call_api(
+ programme_id, per_page, page + 1, series_id))
+ for element in elements:
+ episode = self._get_episode(element)
+ episode_id = episode.get('id')
+ if not episode_id:
+ continue
+ thumbnail = None
+ image = self._get_episode_image(episode)
+ if image:
+ # The image URL carries a '{recipe}' placeholder; 'raw' is substituted for it
+ thumbnail = image.replace('{recipe}', 'raw')
+ category = self._get_default(episode, 'labels', 'category')
+ yield {
+ '_type': 'url',
+ 'id': episode_id,
+ 'title': self._get_episode_field(episode, 'subtitle'),
+ 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
+ 'thumbnail': thumbnail,
+ 'description': self._get_description(episode),
+ 'categories': [category] if category else None,
+ 'series': self._get_episode_field(episode, 'title'),
+ 'ie_key': BBCCoUkIE.ie_key(),
+ }
+
+ def _real_extract(self, url):
+ """Build a playlist for the programme/group id in `url`.
+
+ With a `page` query parameter only that page is fetched; otherwise the
+ entries are an OnDemandPagedList over all pages. `seriesId` from the query
+ string, when present, is forwarded to _call_api.
+ """
+ pid = self._match_id(url)
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ series_id = qs.get('seriesId', [None])[0]
+ page = qs.get('page', [None])[0]
+ # NOTE(review): 36 presumably mirrors the page size of the website itself - confirm
+ per_page = 36 if page else self._PAGE_SIZE
+ fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
+ # The `page` query value is one-based, _fetch_page expects a zero-based index
+ entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
+ # A separate one-item request supplies the playlist title and description
+ playlist_data = self._get_playlist_data(self._call_api(pid, 1))
+ return self.playlist_result(
+ entries, pid, self._get_playlist_title(playlist_data),
+ self._get_description(playlist_data))
+
+
+class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
+ """Playlist extractor for bbc.co.uk/iplayer/episodes/<id> URLs."""
+ IE_NAME = 'bbc.co.uk:iplayer:episodes'
+ _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
_TESTS = [{
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
'info_dict': {
'id': 'b05rcz9v',
'title': 'The Disappearance',
- 'description': 'French thriller serial about a missing teenager.',
+ 'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
+ },
+ 'playlist_mincount': 8,
+ }, {
+ # all seasons
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
+ 'info_dict': {
+ 'id': 'b094m5t9',
+ 'title': 'Doctor Foster',
+ 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
+ },
+ 'playlist_mincount': 10,
+ }, {
+ # explicit season
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
+ 'info_dict': {
+ 'id': 'b094m5t9',
+ 'title': 'Doctor Foster',
+ 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
},
- 'playlist_mincount': 6,
- 'skip': 'This programme is not currently available on BBC iPlayer',
+ 'playlist_mincount': 5,
}, {
+ # all pages
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
+ 'info_dict': {
+ 'id': 'm0004c4v',
+ 'title': 'Beechgrove',
+ 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+ },
+ 'playlist_mincount': 37,
+ }, {
+ # explicit page
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
+ 'info_dict': {
+ 'id': 'm0004c4v',
+ 'title': 'Beechgrove',
+ 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+ },
+ 'playlist_mincount': 1,
+ }]
+ # Page size requested when no explicit page is given in the URL
+ _PAGE_SIZE = 100
+ # Key under which descriptions are looked up in the API data
+ _DESCRIPTION_KEY = 'synopsis'
+
+ def _get_episode_image(self, episode):
+ """Return the 'default' entry under the episode's 'image' key (via _get_default)."""
+ return self._get_default(episode, 'image')
+
+ def _get_episode_field(self, episode, field):
+ """Return the 'default' entry under episode[field] (via _get_default)."""
+ return self._get_default(episode, field)
+
+ @staticmethod
+ def _get_elements(data):
+ """Return the result list found at data['entities']['results']."""
+ return data['entities']['results']
+
+ @staticmethod
+ def _get_episode(element):
+ """Return the element's 'episode' mapping, or an empty dict when absent."""
+ return element.get('episode') or {}
+
+ def _call_api(self, pid, per_page, page=1, series_id=None):
+ """POST a JSON request for programme `pid` and return response['data']['programme'].
+
+ `series_id`, when given, is sent as the 'sliceId' variable.
+ """
+ variables = {
+ 'id': pid,
+ 'page': page,
+ 'perPage': per_page,
+ }
+ if series_id:
+ variables['sliceId'] = series_id
+ # NOTE(review): the fixed 'id' below looks like an identifier of a query stored server-side - confirm
+ return self._download_json(
+ 'https://graph.ibl.api.bbc.co.uk/', pid, headers={
+ 'Content-Type': 'application/json'
+ }, data=json.dumps({
+ 'id': '5692d93d5aac8d796a0305e895e61551',
+ 'variables': variables,
+ }).encode('utf-8'))['data']['programme']
+
+ @staticmethod
+ def _get_playlist_data(data):
+ """Return `data` unchanged: the programme object itself holds the playlist metadata."""
+ return data
+
+ def _get_playlist_title(self, data):
+ """Return the 'default' entry under the data's 'title' key (via _get_default)."""
+ return self._get_default(data, 'title')
+
+
+class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
+ IE_NAME = 'bbc.co.uk:iplayer:group'
+ _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
+ _TESTS = [{
# Available for over a year unlike 30 days for most other programmes
'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
'info_dict': {
@@ -1125,14 +1554,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
},
'playlist_mincount': 10,
+ }, {
+ # all pages
+ 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
+ 'info_dict': {
+ 'id': 'p081d7j7',
+ 'title': 'Music in Scotland',
+ 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+ },
+ 'playlist_mincount': 47,
+ }, {
+ # explicit page
+ 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
+ 'info_dict': {
+ 'id': 'p081d7j7',
+ 'title': 'Music in Scotland',
+ 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+ },
+ 'playlist_mincount': 11,
}]
+ _PAGE_SIZE = 200
+ _DESCRIPTION_KEY = 'synopses'
- def _extract_title_and_description(self, webpage):
- title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
- description = self._search_regex(
- r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
- webpage, 'description', fatal=False, group='value')
- return title, description
+ def _get_episode_image(self, episode):
+ """Return the 'standard' entry under the episode's 'images' key (via _get_default)."""
+ return self._get_default(episode, 'images', 'standard')
+
+ def _get_episode_field(self, episode, field):
+ """Return episode[field] directly, or None when the key is missing."""
+ return episode.get(field)
+
+ @staticmethod
+ def _get_elements(data):
+ """Return the list stored under data['elements']."""
+ return data['elements']
+
+ @staticmethod
+ def _get_episode(element):
+ """Return `element` unchanged: here each element is used as the episode itself."""
+ return element
+
+ def _call_api(self, pid, per_page, page=1, series_id=None):
+ """Fetch one page of episodes for group `pid` and return response['group_episodes'].
+
+ `series_id` is accepted but not used in this request.
+ """
+ return self._download_json(
+ 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
+ pid, query={
+ 'page': page,
+ 'per_page': per_page,
+ })['group_episodes']
+
+ @staticmethod
+ def _get_playlist_data(data):
+ """Return the 'group' mapping of the API data, which holds the playlist metadata."""
+ return data['group']
+
+ def _get_playlist_title(self, data):
+ """Return data['title'], or None when the key is missing."""
+ return data.get('title')
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):