about summary refs log tree commit diff
path: root/youtube_dl/extractor/bbc.py
diff options
context:
space:
mode:
author Sergey M․ <dstftw@gmail.com> 2015-10-11 00:25:09 +0600
committer Sergey M․ <dstftw@gmail.com> 2015-10-11 00:25:09 +0600
commit ae8bdfd1a1548c83ab7df378096da927b5374a29 (patch)
tree 7236ee056e91108b4d0b8986cc4efa10d6a1a706 /youtube_dl/extractor/bbc.py
parent 6a747190605229e9cfba5450cf0ecaf435b7a85e (diff)
download youtube-dl-ae8bdfd1a1548c83ab7df378096da927b5374a29.tar.xz
[bbc] Extract article JSON and actualize tests
Diffstat (limited to 'youtube_dl/extractor/bbc.py')
-rw-r--r-- youtube_dl/extractor/bbc.py 45
1 files changed, 29 insertions, 16 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 4eae4f52e..b98db95b9 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -11,6 +11,7 @@ from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
+ remove_end,
unescapeHTML,
)
from ..compat import compat_HTTPError
@@ -533,7 +534,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/news/world-europe-32041533',
'info_dict': {
'id': 'p02mprgb',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'duration': 47,
'timestamp': 1427219242,
@@ -552,7 +553,6 @@ class BBCIE(BBCCoUkIE):
'id': '150615_telabyad_kentin_cogu',
'ext': 'mp4',
'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
- 'duration': 47,
'timestamp': 1434397334,
'upload_date': '20150615',
},
@@ -566,7 +566,6 @@ class BBCIE(BBCCoUkIE):
'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
'ext': 'mp4',
'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
- 'duration': 87,
'timestamp': 1434713142,
'upload_date': '20150619',
},
@@ -578,7 +577,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
'info_dict': {
'id': 'p02w6qjc',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
'duration': 56,
},
@@ -605,11 +604,11 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
'info_dict': {
'id': 'p018zqqg',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Hyundai Santa Fe Sport: Rock star',
'description': 'md5:b042a26142c4154a6e472933cf20793d',
- 'timestamp': 1368473503,
- 'upload_date': '20130513',
+ 'timestamp': 1415867444,
+ 'upload_date': '20141113',
},
'params': {
# rtmp download
@@ -620,9 +619,8 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
- 'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
'duration': 140,
},
'params': {
@@ -697,11 +695,26 @@ class BBCIE(BBCCoUkIE):
webpage = self._download_webpage(url, playlist_id)
- timestamp = parse_iso8601(self._search_regex(
- [r'"datePublished":\s*"([^"]+)',
- r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
- r'itemprop="datePublished"[^>]+datetime="([^"]+)"'],
- webpage, 'date', default=None))
+ timestamp = None
+ playlist_title = None
+ playlist_description = None
+
+ ld = self._parse_json(
+ self._search_regex(
+ r'(?s)<script type="application/ld\+json">(.+?)</script>',
+ webpage, 'ld json', default='{}'),
+ playlist_id, fatal=False)
+ if ld:
+ timestamp = parse_iso8601(ld.get('datePublished'))
+ playlist_title = ld.get('headline')
+ playlist_description = ld.get('articleBody')
+
+ if not timestamp:
+ timestamp = parse_iso8601(self._search_regex(
+ [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
+ r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
+ r'"datePublished":\s*"([^"]+)',],
+ webpage, 'date', default=None))
entries = []
@@ -754,8 +767,8 @@ class BBCIE(BBCCoUkIE):
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
if entries:
- playlist_title = self._og_search_title(webpage)
- playlist_description = self._og_search_description(webpage, default=None)
+ playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
+ playlist_description = playlist_description or self._og_search_description(webpage, default=None)
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)