about summary refs log tree commit diff
path: root/youtube_dl/extractor/bbc.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/bbc.py')
-rw-r--r-- youtube_dl/extractor/bbc.py 85
1 files changed, 69 insertions, 16 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 2a0901ee4..9a1b6e3dc 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -20,7 +20,9 @@ class BBCCoUkIE(InfoExtractor):
IE_DESC = 'BBC iPlayer'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
- _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
+ _MEDIASELECTOR_URLS = [
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+ ]
_TESTS = [
{
@@ -162,6 +164,10 @@ class BBCCoUkIE(InfoExtractor):
}
]
+ class MediaSelectionError(Exception):
+ def __init__(self, id):
+ self.id = id
+
def _extract_asx_playlist(self, connection, programme_id):
asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
return [ref.get('href') for ref in asx.findall('./Entry/ref')]
@@ -172,6 +178,7 @@ class BBCCoUkIE(InfoExtractor):
supplier = connection.get('supplier')
if protocol == 'http':
href = connection.get('href')
+ transfer_format = connection.get('transferFormat')
# ASX playlist
if supplier == 'asx':
for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
@@ -179,6 +186,9 @@ class BBCCoUkIE(InfoExtractor):
'url': ref,
'format_id': 'ref%s_%s' % (i, supplier),
})
+ # Skip DASH until supported
+ elif transfer_format == 'dash':
+ pass
# Direct link
else:
formats.append({
@@ -208,8 +218,7 @@ class BBCCoUkIE(InfoExtractor):
def _extract_medias(self, media_selection):
error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
if error is not None:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
+ raise BBCCoUkIE.MediaSelectionError(error.get('id'))
return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
def _extract_connections(self, media):
@@ -266,9 +275,23 @@ class BBCCoUkIE(InfoExtractor):
]
return subtitles
+ def _raise_extractor_error(self, media_selection_error):
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+ expected=True)
+
def _download_media_selector(self, programme_id):
- return self._download_media_selector_url(
- self._MEDIASELECTOR_URL % programme_id, programme_id)
+ last_exception = None
+ for mediaselector_url in self._MEDIASELECTOR_URLS:
+ try:
+ return self._download_media_selector_url(
+ mediaselector_url % programme_id, programme_id)
+ except BBCCoUkIE.MediaSelectionError as e:
+ if e.id == 'notukerror':
+ last_exception = e
+ continue
+ self._raise_extractor_error(e)
+ self._raise_extractor_error(last_exception)
def _download_media_selector_url(self, url, programme_id=None):
try:
@@ -293,7 +316,6 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_video(media, programme_id))
elif kind == 'captions':
subtitles = self.extract_subtitles(media, programme_id)
-
return formats, subtitles
def _download_playlist(self, playlist_id):
@@ -422,9 +444,14 @@ class BBCIE(BBCCoUkIE):
IE_DESC = 'BBC'
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
- # fails with notukerror for some videos
- #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
- _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
+ _MEDIASELECTOR_URLS = [
+ # Provides more formats, namely direct mp4 links, but fails on some videos with
+ # notukerror for non-UK (?) users (e.g.
+ # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+ 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+ # Provides fewer formats, but works everywhere for everybody (hopefully)
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+ ]
_TESTS = [{
# article with multiple videos embedded with data-media-meta containing
@@ -447,11 +474,19 @@ class BBCIE(BBCCoUkIE):
'playlist_count': 9,
'skip': 'Save time',
}, {
+ # article with multiple videos embedded with `new SMP()`
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+ 'info_dict': {
+ 'id': '3662a707-0af9-3149-963f-47bea720b460',
+ 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_count': 18,
+ }, {
# single video embedded with mediaAssetPage.init()
'url': 'http://www.bbc.com/news/world-europe-32041533',
'info_dict': {
'id': 'p02mprgb',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'duration': 47,
'timestamp': 1427219242,
@@ -511,7 +546,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
'info_dict': {
'id': 'p018zqqg',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Hyundai Santa Fe Sport: Rock star',
'description': 'md5:b042a26142c4154a6e472933cf20793d',
'timestamp': 1368473503,
@@ -526,7 +561,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
'duration': 140,
@@ -633,12 +668,30 @@ class BBCIE(BBCCoUkIE):
playlist_title = self._html_search_regex(
r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
- playlist_description = self._og_search_description(webpage)
+ playlist_description = self._og_search_description(webpage, default=None)
+
+ def extract_all(pattern):
+ return list(filter(None, map(
+ lambda s: self._parse_json(s, playlist_id, fatal=False),
+ re.findall(pattern, webpage))))
+
+ # Multiple video article (e.g.
+ # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
+ EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?'
+ entries = []
+ for match in extract_all(r'new\s+SMP\(({.+?})\)'):
+ embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
+ if embed_url and re.match(EMBED_URL, embed_url):
+ entries.append(embed_url)
+ entries.extend(re.findall(
+ r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+ if entries:
+ return self.playlist_result(
+ [self.url_result(entry, 'BBCCoUk') for entry in entries],
+ playlist_id, playlist_title, playlist_description)
# Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
- medias = list(filter(None, map(
- lambda s: self._parse_json(s, playlist_id, fatal=False),
- re.findall(r"data-media-meta='({[^']+})'", webpage))))
+ medias = extract_all(r"data-media-meta='({[^']+})'")
if not medias:
# Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)