From 9e96dc8b3561c1e6e62ce6a34efba485e5e49054 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:36:59 -0500 Subject: Support BBC News (bbc.com/news) --- docs/supportedsites.md | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bbcnews.py | 162 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 220e52b98..d4ccbbd3a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,6 +50,7 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer + - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..51d2d20e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE +from .bbcnews import BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py new file mode 100644 index 000000000..b10e30a81 --- /dev/null +++ b/youtube_dl/extractor/bbcnews.py @@ -0,0 +1,162 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) +from ..compat import compat_HTTPError +import re +from .bbccouk import BBCCoUkIE + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _duration_str2int(self, str): + if not str: + return None + ret = re.match(r'^\d+$', str) + if ret: + return int(ret.group(0)) + ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) + if ret: + total=int(ret.group('s')) + if ret.group('m'): + total+=(int(ret.group('m'))*60) + if ret.group('h'): + total+=(int(ret.group('h'))*3600) + return total + return None + + def _download_media_selector(self, programme_id): + # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not + # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ + # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it + + try: + media_selection = self._download_xml( + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, + programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) + else: + raise + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + return formats, subtitles + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = self._duration_str2int(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) -- cgit v1.2.3 From a8b081a0523c412fd4e01d5cddec7ae382c4793e Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:52:25 -0500 Subject: BBCNewsIE: eliminate redundant function. BBCCoUkIE._download_media_selector: use class variable instead of hardcoded string for mediaselector_url template. --- youtube_dl/extractor/bbccouk.py | 4 +++- youtube_dl/extractor/bbcnews.py | 42 ++--------------------------------------- 2 files changed, 5 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 0305f88b5..dcc5fc2fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -15,6 +15,8 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -277,7 +279,7 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): try: media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id, + self.mediaselector_url % programme_id, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index b10e30a81..9bb8d42e6 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -14,6 +14,8 @@ class BBCNewsIE(BBCCoUkIE): IE_DESC = 'BBC news' _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _TESTS = [{ 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { @@ -59,46 +61,6 @@ class BBCNewsIE(BBCCoUkIE): return total return None - def _download_media_selector(self, programme_id): - # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not - # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ - # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it - - try: - media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) - else: - raise - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - return formats, subtitles - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) -- cgit v1.2.3 From d5552a3477a0970f4aaaa746ce07c816267bb9cf Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 06:25:50 -0500 Subject: bbcnews: Switch to parse_duration, revert change to docs/supportedsites.md --- docs/supportedsites.md | 1 - youtube_dl/extractor/bbcnews.py | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d4ccbbd3a..220e52b98 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,7 +50,6 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer - - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index 9bb8d42e6..fd4a5e38f 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError @@ -45,22 +46,6 @@ class BBCNewsIE(BBCCoUkIE): } }] - def _duration_str2int(self, str): - if not str: - return None - ret = re.match(r'^\d+$', str) - if ret: - return int(ret.group(0)) - ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) - if ret: - total=int(ret.group('s')) - if ret.group('m'): - total+=(int(ret.group('m'))*60) - if ret.group('h'): - total+=(int(ret.group('h'))*3600) - return total - return None - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) @@ -88,7 +73,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href', None) title = jent['caption'] - duration = self._duration_str2int(jent.get('duration',None)) + duration = parse_duration(jent.get('duration',None)) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): -- cgit v1.2.3 From 10273d6e0846cd8f3762e3777712d5cd2a0cafcd Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:22:13 -0500 Subject: toss new stuff into old file --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bbccouk.py | 101 ++++++++++++++++++++++++++++++++++++ youtube_dl/extractor/bbcnews.py | 109 --------------------------------------- 3 files changed, 102 insertions(+), 111 deletions(-) delete mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 51d2d20e9..f9f7bdfaf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,8 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE -from .bbcnews import BBCNewsIE +from .bbccouk import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index dcc5fc2fa..ea682fb6f 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -5,9 +5,11 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError +import re class BBCCoUkIE(InfoExtractor): @@ -394,3 +396,102 @@ class BBCCoUkIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = parse_duration(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py deleted file mode 100644 index fd4a5e38f..000000000 --- a/youtube_dl/extractor/bbcnews.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - int_or_none, -) -from ..compat import compat_HTTPError -import re -from .bbccouk import BBCCoUkIE - -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' - - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - - _TESTS = [{ - 'url': 'http://www.bbc.com/news/world-europe-32668511', - 'info_dict': { - 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', - }, - 'playlist_count': 2, - },{ - 'url': 'http://www.bbc.com/news/business-28299555', - 'info_dict': { - 'id': 'business-28299555', - 'title': 'Farnborough Airshow: Video highlights', - }, - 'playlist_count': 9, - },{ - 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', - 'info_dict': { - 'id': 'p02mprgb', - 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'duration': 47, - }, - 'params': { - 'skip_download': True, - } - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') - - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-','') - - ret = [] - # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: - raise ExtractorError('No video found', expected=True) - - for ent in matches: - jent = self._parse_json(ent,list_id) - - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) - - title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) - description = list_title + ' - ' + jent.get('caption','') - thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') - - self._sort_formats(formats) - - ret.append( { - 'id': programme_id, - 'uploader': 'BBC News', - 'upload_date': pubdate, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } ) - - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) -- cgit v1.2.3 From 75ab0ebcf593ec91a46d83e69854ffa313d33309 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:24:02 -0500 Subject: no .get('..',None) --- youtube_dl/extractor/bbccouk.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index ea682fb6f..de4d7f9c0 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -457,15 +457,15 @@ class BBCNewsIE(BBCCoUkIE): for ent in matches: jent = self._parse_json(ent,list_id) - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) + programme_id = jent.get('externalId') + xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) + duration = parse_duration(jent.get('duration') description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) + thumbnail=jent['image'].get('href') if programme_id: formats, subtitles = self._download_media_selector(programme_id) -- cgit v1.2.3 From 77c975f536befbe89bf718e86282958d391d9ffe Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:28:14 -0500 Subject: typofix --- youtube_dl/extractor/bbccouk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index de4d7f9c0..f9404f3fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -461,7 +461,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration') + duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): -- cgit v1.2.3 From de939d89eb83c851c6db66933e5fc0c401a1a679 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:04:46 -0500 Subject: Support BBC news in other languages, non-mediaselector videos --- youtube_dl/extractor/bbccouk.py | 87 ++++++++++++++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f9404f3fa..72e20857b 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -401,7 +401,7 @@ class BBCCoUkIE(InfoExtractor): class BBCNewsIE(BBCCoUkIE): IE_NAME = 'bbc.com' IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' @@ -432,56 +432,115 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } + },{ + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'note': 'Video', + 'info_dict': { + 'id': 'NA', + 'ext': 'mp4', + 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + },{ + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'note': 'Video', + 'info_dict': { + 'id': '39275083', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'duration': 87, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: pubdate = pubdate.replace('-','') ret = [] + jsent = [] + # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) + ) + + if len(jsent) == 0: + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset,list_id) + for key, val in jmasset.get('videos',{}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) + + if len(jsent) == 0: # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: - raise ExtractorError('No video found', expected=True) + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) - for ent in matches: - jent = self._parse_json(ent,list_id) + if len(jsent) == 0: + raise ExtractorError('No video found', expected=True) + for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('href') + xml_url = jent.get('hxref') + + title = jent.get('caption',list_title) - title = jent['caption'] duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') + formats = [] + subtitles = [] + if programme_id: formats, subtitles = self._download_media_selector(programme_id) + elif jent.has_key('sourceFiles'): + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append( { + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + } ) elif xml_url: # Cheap fallback # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') + + if len(formats) == 0: + raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') self._sort_formats(formats) ret.append( { - 'id': programme_id, + 'id': jent.get('programme_id',jent.get('id')), 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, -- cgit v1.2.3 From 7bb23aeca4e9076528e3d31d501a9a288dcd444c Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:08:13 -0500 Subject: rename bbccouk.py -> bbc.py --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/bbc.py | 556 +++++++++++++++++++++++++++++++++++++++ youtube_dl/extractor/bbccouk.py | 556 --------------------------------------- 3 files changed, 557 insertions(+), 557 deletions(-) create mode 100644 youtube_dl/extractor/bbc.py delete mode 100644 youtube_dl/extractor/bbccouk.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f9f7bdfaf..a48346e60 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,7 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE, BBCNewsIE +from .bbc import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py new file mode 100644 index 000000000..72e20857b --- /dev/null +++ b/youtube_dl/extractor/bbc.py @@ -0,0 +1,556 @@ +from __future__ import unicode_literals + +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_duration, + int_or_none, +) +from ..compat import compat_HTTPError +import re + + +class BBCCoUkIE(InfoExtractor): + IE_NAME = 'bbc.co.uk' + IE_DESC = 'BBC iPlayer' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + + _TESTS = [ + { + 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', + 'info_dict': { + 'id': 'b039d07m', + 'ext': 'flv', + 'title': 'Kaleidoscope, Leonard Cohen', + 'description': 'The Canadian poet and songwriter reflects on his musical career.', + 'duration': 1740, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Man in Black: Series 3: The Printed Name', + 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.", + 'duration': 1800, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', + 'info_dict': { + 'id': 'b00yng1d', + 'ext': 'flv', + 'title': 'The Voice UK: Series 3: Blind Auditions 5', + 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", + 'duration': 5100, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, + { + 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', + 'info_dict': { + 'id': 'b03k3pb7', + 'ext': 'flv', + 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", + 'description': '2. Invasion', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', + }, { + 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', + 'info_dict': { + 'id': 'b04v209v', + 'ext': 'flv', + 'title': 'Pete Tong, The Essential New Tune Special', + 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", + 'duration': 10800, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'note': 'Audio', + 'info_dict': { + 'id': 'p02frcch', + 'ext': 'flv', + 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', + 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', + 'duration': 3507, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', + 'note': 'Video', + 'info_dict': { + 'id': 'p025c103', + 'ext': 'flv', + 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', + 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', + 'duration': 226, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', + 'info_dict': { + 'id': 'p02n76xf', + 'ext': 'flv', + 'title': 'Natural World, 2015-2016: 2. Super Powered Owls', + 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', + 'info_dict': { + 'id': 'b05zmgw1', + 'ext': 'flv', + 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', + 'title': 'Royal Academy Summer Exhibition', + 'duration': 3540, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'geolocation', + }, { + 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', + 'only_matching': True, + } + ] + + def _extract_asx_playlist(self, connection, programme_id): + asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') + return [ref.get('href') for ref in asx.findall('./Entry/ref')] + + def _extract_connection(self, connection, programme_id): + formats = [] + protocol = connection.get('protocol') + supplier = connection.get('supplier') + if protocol == 'http': + href = connection.get('href') + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, supplier), + }) + # Direct link + else: + formats.append({ + 'url': href, + 'format_id': supplier, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + formats.append({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + 'format_id': supplier, + }) + return formats + + def _extract_items(self, playlist): + return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') + + def _extract_medias(self, media_selection): + error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') + if error is not None: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True) + return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') + + def _extract_connections(self, media): + return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') + + def _extract_video(self, media, programme_id): + formats = [] + vbr = int(media.get('bitrate')) + vcodec = media.get('encoding') + service = media.get('service') + width = int(media.get('width')) + height = int(media.get('height')) + file_size = int(media.get('media_file_size')) + for connection in self._extract_connections(media): + conn_formats = self._extract_connection(connection, programme_id) + for format in conn_formats: + format.update({ + 'format_id': '%s_%s' % (service, format['format_id']), + 'width': width, + 'height': height, + 'vbr': vbr, + 'vcodec': vcodec, + 'filesize': file_size, + }) + formats.extend(conn_formats) + return formats + + def _extract_audio(self, media, programme_id): + formats = [] + abr = int(media.get('bitrate')) + acodec = media.get('encoding') + service = media.get('service') + for connection in self._extract_connections(media): + conn_formats = self._extract_connection(connection, programme_id) + for format in conn_formats: + format.update({ + 'format_id': '%s_%s' % (service, format['format_id']), + 'abr': abr, + 'acodec': acodec, + }) + formats.extend(conn_formats) + return formats + + def _get_subtitles(self, media, programme_id): + subtitles = {} + for connection in self._extract_connections(media): + captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') + lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') + ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) + srt = '' + + def _extract_text(p): + if p.text is not None: + stripped_text = p.text.strip() + if stripped_text: + return stripped_text + return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) + for pos, p in enumerate(ps): + srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p)) + subtitles[lang] = [ + { + 'url': connection.get('href'), + 'ext': 'ttml', + }, + { + 'data': srt, + 'ext': 'srt', + }, + ] + return subtitles + + def _download_media_selector(self, programme_id): + try: + media_selection = self._download_xml( + self.mediaselector_url % programme_id, + programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + else: + raise + + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + return formats, subtitles + + def _download_playlist(self, playlist_id): + try: + playlist = self._download_json( + 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, + playlist_id, 'Downloading playlist JSON') + + version = playlist.get('defaultAvailableVersion') + if version: + smp_config = version['smpConfig'] + title = smp_config['title'] + description = smp_config['summary'] + for item in smp_config['items']: + kind = item['kind'] + if kind != 'programme' and kind != 'radioProgramme': + continue + programme_id = item.get('vpid') + duration = int(item.get('duration')) + formats, subtitles = self._download_media_selector(programme_id) + return programme_id, title, description, duration, formats, subtitles + except ExtractorError as ee: + if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): + raise + + # fallback to legacy playlist + playlist = self._download_xml( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, + playlist_id, 'Downloading legacy playlist XML') + + no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') + if no_items is not None: + reason = no_items.get('reason') + if reason == 'preAvailability': + msg = 'Episode %s is not yet available' % playlist_id + elif reason == 'postAvailability': + msg = 'Episode %s is no longer available' % playlist_id + elif reason == 'noMedia': + msg = 'Episode %s is not currently available' % playlist_id + else: + msg = 'Episode %s is not available: %s' % (playlist_id, reason) + raise ExtractorError(msg, expected=True) + + for item in self._extract_items(playlist): + kind = item.get('kind') + if kind != 'programme' and kind != 'radioProgramme': + continue + title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text + description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text + programme_id = item.get('identifier') + duration = int(item.get('duration')) + formats, subtitles = self._download_media_selector(programme_id) + + return programme_id, title, description, duration, formats, subtitles + + def _real_extract(self, url): + group_id = self._match_id(url) + + webpage = self._download_webpage(url, group_id, 'Downloading video page') + + programme_id = None + + tviplayer = self._search_regex( + r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', + webpage, 'player', default=None) + + if tviplayer: + player = self._parse_json(tviplayer, group_id).get('player', {}) + duration = int_or_none(player.get('duration')) + programme_id = player.get('vpid') + + if not programme_id: + programme_id = self._search_regex( + r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + title = self._og_search_title(webpage) + description = self._search_regex( + r'

([^<]+)

', + webpage, 'description', fatal=False) + else: + programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) + + self._sort_formats(formats) + + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' + + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + },{ + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'note': 'Video', + 'info_dict': { + 'id': 'NA', + 'ext': 'mp4', + 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + },{ + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'note': 'Video', + 'info_dict': { + 'id': '39275083', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'duration': 87, + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + jsent = [] + + # works with bbc.com/news/something-something-123456 articles + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) + ) + + if len(jsent) == 0: + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset,list_id) + for key, val in jmasset.get('videos',{}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) + + if len(jsent) == 0: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) + + if len(jsent) == 0: + raise ExtractorError('No video found', expected=True) + + for jent in jsent: + programme_id = jent.get('externalId') + xml_url = jent.get('hxref') + + title = jent.get('caption',list_title) + + duration = parse_duration(jent.get('duration')) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href') + + formats = [] + subtitles = [] + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif jent.has_key('sourceFiles'): + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append( { + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + } ) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + + if len(formats) == 0: + raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') + + self._sort_formats(formats) + + ret.append( { + 'id': jent.get('programme_id',jent.get('id')), + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py deleted file mode 100644 index 72e20857b..000000000 --- a/youtube_dl/extractor/bbccouk.py +++ /dev/null @@ -1,556 +0,0 @@ -from __future__ import unicode_literals - -import xml.etree.ElementTree - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - int_or_none, -) -from ..compat import compat_HTTPError -import re - - -class BBCCoUkIE(InfoExtractor): - IE_NAME = 'bbc.co.uk' - IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' - - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' - - _TESTS = [ - { - 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', - 'info_dict': { - 'id': 'b039d07m', - 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', - 'description': 'The Canadian poet and songwriter reflects on his musical career.', - 'duration': 1740, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Man in Black: Series 3: The Printed Name', - 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.", - 'duration': 1800, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Episode is no longer available on BBC iPlayer Radio', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', - 'info_dict': { - 'id': 'b00yng1d', - 'ext': 'flv', - 'title': 'The Voice UK: Series 3: Blind Auditions 5', - 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.", - 'duration': 5100, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, - { - 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', - 'info_dict': { - 'id': 'b03k3pb7', - 'ext': 'flv', - 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction", - 'description': '2. Invasion', - 'duration': 3600, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, { - 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', - 'info_dict': { - 'id': 'b04v209v', - 'ext': 'flv', - 'title': 'Pete Tong, The Essential New Tune Special', - 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", - 'duration': 10800, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', - 'note': 'Audio', - 'info_dict': { - 'id': 'p02frcch', - 'ext': 'flv', - 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', - 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', - 'duration': 3507, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', - 'note': 'Video', - 'info_dict': { - 'id': 'p025c103', - 'ext': 'flv', - 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', - 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', - 'duration': 226, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls', - 'info_dict': { - 'id': 'p02n76xf', - 'ext': 'flv', - 'title': 'Natural World, 2015-2016: 2. Super Powered Owls', - 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d', - 'duration': 3540, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'geolocation', - }, { - 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', - 'info_dict': { - 'id': 'b05zmgw1', - 'ext': 'flv', - 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.', - 'title': 'Royal Academy Summer Exhibition', - 'duration': 3540, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, - 'skip': 'geolocation', - }, { - 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', - 'only_matching': True, - }, { - 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo', - 'only_matching': True, - } - ] - - def _extract_asx_playlist(self, connection, programme_id): - asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') - return [ref.get('href') for ref in asx.findall('./Entry/ref')] - - def _extract_connection(self, connection, programme_id): - formats = [] - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if protocol == 'http': - href = connection.get('href') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': supplier, - }) - return formats - - def _extract_items(self, playlist): - return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item') - - def _extract_medias(self, media_selection): - error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') - if error is not None: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True) - return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') - - def _extract_connections(self, media): - return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection') - - def _extract_video(self, media, programme_id): - formats = [] - vbr = int(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int(media.get('width')) - height = int(media.get('height')) - file_size = int(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, - 'filesize': file_size, - }) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - }) - formats.extend(conn_formats) - return formats - - def _get_subtitles(self, media, programme_id): - subtitles = {} - for connection in self._extract_connections(media): - captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) - srt = '' - - def _extract_text(p): - if p.text is not None: - stripped_text = p.text.strip() - if stripped_text: - return stripped_text - return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) - for pos, p in enumerate(ps): - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p)) - subtitles[lang] = [ - { - 'url': connection.get('href'), - 'ext': 'ttml', - }, - { - 'data': srt, - 'ext': 'srt', - }, - ] - return subtitles - - def _download_media_selector(self, programme_id): - try: - media_selection = self._download_xml( - self.mediaselector_url % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) - else: - raise - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - return formats, subtitles - - def _download_playlist(self, playlist_id): - try: - playlist = self._download_json( - 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, - playlist_id, 'Downloading playlist JSON') - - version = playlist.get('defaultAvailableVersion') - if version: - smp_config = version['smpConfig'] - title = smp_config['title'] - description = smp_config['summary'] - for item in smp_config['items']: - kind = item['kind'] - if kind != 'programme' and kind != 'radioProgramme': - continue - programme_id = item.get('vpid') - duration = int(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles - except ExtractorError as ee: - if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): - raise - - # fallback to legacy playlist - playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, - playlist_id, 'Downloading legacy playlist XML') - - no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') - if no_items is not None: - reason = no_items.get('reason') - if reason == 'preAvailability': - msg = 'Episode %s is not yet available' % playlist_id - elif reason == 'postAvailability': - msg = 'Episode %s is no longer available' % playlist_id - elif reason == 'noMedia': - msg = 'Episode %s is not currently available' % playlist_id - else: - msg = 'Episode %s is not available: %s' % (playlist_id, reason) - raise ExtractorError(msg, expected=True) - - for item in self._extract_items(playlist): - kind = item.get('kind') - if kind != 'programme' and kind != 'radioProgramme': - continue - title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text - programme_id = item.get('identifier') - duration = int(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - - return programme_id, title, description, duration, formats, subtitles - - def _real_extract(self, url): - group_id = self._match_id(url) - - webpage = self._download_webpage(url, group_id, 'Downloading video page') - - programme_id = None - - tviplayer = self._search_regex( - r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', - webpage, 'player', default=None) - - if tviplayer: - player = self._parse_json(tviplayer, group_id).get('player', {}) - duration = int_or_none(player.get('duration')) - programme_id = player.get('vpid') - - if not programme_id: - programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - title = self._og_search_title(webpage) - description = self._search_regex( - r'

([^<]+)

', - webpage, 'description', fatal=False) - else: - programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - - self._sort_formats(formats) - - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } - - -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' - - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - - _TESTS = [{ - 'url': 'http://www.bbc.com/news/world-europe-32668511', - 'info_dict': { - 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', - }, - 'playlist_count': 2, - },{ - 'url': 'http://www.bbc.com/news/business-28299555', - 'info_dict': { - 'id': 'business-28299555', - 'title': 'Farnborough Airshow: Video highlights', - }, - 'playlist_count': 9, - },{ - 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', - 'info_dict': { - 'id': 'p02mprgb', - 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'duration': 47, - }, - 'params': { - 'skip_download': True, - } - },{ - 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', - 'note': 'Video', - 'info_dict': { - 'id': 'NA', - 'ext': 'mp4', - 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', - 'duration': 47, - }, - 'params': { - 'skip_download': True, - } - },{ - 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', - 'note': 'Video', - 'info_dict': { - 'id': '39275083', - 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', - 'duration': 87, - }, - 'params': { - 'skip_download': True, - } - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') - - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-','') - - ret = [] - jsent = [] - - # works with bbc.com/news/something-something-123456 articles - jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) - ) - - if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset,list_id) - for key, val in jmasset.get('videos',{}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) - - if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) - - if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) - - for jent in jsent: - programme_id = jent.get('externalId') - xml_url = jent.get('hxref') - - title = jent.get('caption',list_title) - - duration = parse_duration(jent.get('duration')) - description = list_title + ' - ' + jent.get('caption','') - thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href') - - formats = [] - subtitles = [] - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.has_key('sourceFiles'): - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append( { - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - } ) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - - if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') - - self._sort_formats(formats) - - ret.append( { - 'id': jent.get('programme_id',jent.get('id')), - 'uploader': 'BBC News', - 'upload_date': pubdate, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } ) - - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) -- cgit v1.2.3 From 2a282a3b5f366ba0569bae477d5060329ba254fb Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:11:41 -0500 Subject: Unbreak breakage that was broken to test breakage --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 72e20857b..310db9d1d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -502,7 +502,7 @@ class BBCNewsIE(BBCCoUkIE): for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('hxref') + xml_url = jent.get('href') title = jent.get('caption',list_title) -- cgit v1.2.3 From a9dcf4a860214e37971ab05f27f74bbae65ff8ae Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 23 Jun 2015 01:08:07 -0500 Subject: Prefer externalId over non-mediaserver-specific hashkey for video id. --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 310db9d1d..fed344ea0 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -540,7 +540,7 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) ret.append( { - 'id': jent.get('programme_id',jent.get('id')), + 'id': jent.get('id') if programme_id == None else programme_id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, -- cgit v1.2.3 From da92eeae42f556926cb676b3c14e270603b7e38e Mon Sep 17 00:00:00 2001 From: fnord Date: Thu, 25 Jun 2015 00:31:32 -0500 Subject: Fix tests, description formatting --- youtube_dl/extractor/bbc.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index fed344ea0..bb671d473 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -428,6 +428,8 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'upload_date': '20150324', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -438,8 +440,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': 'NA', 'ext': 'mp4', - 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', + 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', 'duration': 47, + 'upload_date': '20150615', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -450,8 +455,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': '39275083', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', 'duration': 87, + 'upload_date': '20150619', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -507,7 +515,9 @@ class BBCNewsIE(BBCCoUkIE): title = jent.get('caption',list_title) duration = parse_duration(jent.get('duration')) - description = list_title + ' - ' + jent.get('caption','') + description = list_title + if jent.get('caption'): + description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') @@ -539,8 +549,12 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) + id = jent.get('id') if programme_id == None else programme_id + if id == None: + id = 'NA' + ret.append( { - 'id': jent.get('id') if programme_id == None else programme_id, + 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, -- cgit v1.2.3 From 36da48798a28b8261d2f39f73f2522651d58a364 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:27:50 -0500 Subject: handle titles and captions set to '' --- youtube_dl/extractor/bbc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 471d865d2..c910eb55a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -497,11 +497,13 @@ class BBCNewsIE(BBCCoUkIE): programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption',list_title) + title = jent.get('caption','') + if title == '': + title = list_title duration = parse_duration(jent.get('duration')) description = list_title - if jent.get('caption'): + if jent.get('caption', '') != '': description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): -- cgit v1.2.3 From a3bfddfa5ee33cf085b959536f1025c0aa53cc77 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:47:02 -0500 Subject: bbc.py: correct syntax --- youtube_dl/extractor/bbc.py | 106 ++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index c910eb55a..c8f285165 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -397,14 +397,14 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Russia stages massive WW2 parade despite Western boycott', }, 'playlist_count': 2, - },{ + }, { 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', }, 'playlist_count': 9, - },{ + }, { 'url': 'http://www.bbc.com/news/world-europe-32041533', 'note': 'Video', 'info_dict': { @@ -419,7 +419,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'note': 'Video', 'info_dict': { @@ -434,7 +434,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'note': 'Video', 'info_dict': { @@ -459,88 +459,88 @@ class BBCNewsIE(BBCCoUkIE): pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: - pubdate = pubdate.replace('-','') + pubdate = pubdate.replace('-', '') ret = [] jsent = [] # works with bbc.com/news/something-something-123456 articles jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) + lambda m: self._parse_json(m, list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) ) if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset,list_id) - for key, val in jmasset.get('videos',{}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset, list_id) + for key, val in jmasset.get('videos', {}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m, list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) + raise ExtractorError('No video found', expected=True) for jent in jsent: programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption','') + title = jent.get('caption', '') if title == '': - title = list_title + title = list_title duration = parse_duration(jent.get('duration')) description = list_title if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') + description += ' - ' + jent.get('caption') thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href') + if jent.get('image') is not None: + thumbnail = jent['image'].get('href') formats = [] subtitles = [] if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.has_key('sourceFiles'): - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append( { - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - } ) + formats, subtitles = self._download_media_selector(programme_id) + elif jent.get('sourceFiles') is not None: + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append({ + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + }) elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') - + raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + self._sort_formats(formats) - id = jent.get('id') if programme_id == None else programme_id - if id == None: - id = 'NA' + id = jent.get('id') if programme_id is None else programme_id + if id is None: + id = 'NA' - ret.append( { + ret.append({ 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, @@ -550,8 +550,8 @@ class BBCNewsIE(BBCCoUkIE): 'duration': duration, 'formats': formats, 'subtitles': subtitles, - } ) + }) if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) + return self.playlist_result(ret, list_id, list_title) raise ExtractorError('No video found', expected=True) -- cgit v1.2.3