aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/bbc.py34
-rw-r--r--youtube_dl/extractor/common.py11
-rw-r--r--youtube_dl/extractor/generic.py8
-rw-r--r--youtube_dl/extractor/youtube.py11
4 files changed, 48 insertions, 16 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 0f0ea7cfd..3d9366644 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -458,6 +458,14 @@ class BBCIE(BBCCoUkIE):
'playlist_count': 9,
'skip': 'Save time',
}, {
+ # article with multiple videos embedded with `new SMP()`
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+ 'info_dict': {
+ 'id': '3662a707-0af9-3149-963f-47bea720b460',
+ 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_count': 18,
+ }, {
# single video embedded with mediaAssetPage.init()
'url': 'http://www.bbc.com/news/world-europe-32041533',
'info_dict': {
@@ -644,12 +652,30 @@ class BBCIE(BBCCoUkIE):
playlist_title = self._html_search_regex(
r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
- playlist_description = self._og_search_description(webpage)
+ playlist_description = self._og_search_description(webpage, default=None)
+
+ def extract_all(pattern):
+ return list(filter(None, map(
+ lambda s: self._parse_json(s, playlist_id, fatal=False),
+ re.findall(pattern, webpage))))
+
+ # Multiple video article (e.g.
+ # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
+ EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?'
+ entries = []
+ for match in extract_all(r'new\s+SMP\(({.+?})\)'):
+ embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
+ if embed_url and re.match(EMBED_URL, embed_url):
+ entries.append(embed_url)
+ entries.extend(re.findall(
+ r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+ if entries:
+ return self.playlist_result(
+ [self.url_result(entry, 'BBCCoUk') for entry in entries],
+ playlist_id, playlist_title, playlist_description)
# Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
- medias = list(filter(None, map(
- lambda s: self._parse_json(s, playlist_id, fatal=False),
- re.findall(r"data-media-meta='({[^']+})'", webpage))))
+ medias = extract_all(r"data-media-meta='({[^']+})'")
if not medias:
# Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 14b9b4fe2..d54866d1f 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -181,6 +181,7 @@ class InfoExtractor(object):
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
+ tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
start_time: Time in seconds where the reproduction should start, as
@@ -630,6 +631,12 @@ class InfoExtractor(object):
template % (content_re, property_re),
]
+ @staticmethod
+ def _meta_regex(prop):
+ return r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
@@ -660,9 +667,7 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ self._meta_regex(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 6d2efb22e..8cef61c3c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -276,14 +276,6 @@ class GenericIE(InfoExtractor):
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
},
},
- # BBC iPlayer embeds
- {
- 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
- 'info_dict': {
- 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
- },
- 'playlist_mincount': 18,
- },
# RUTV embed
{
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0e411bfb6..4c449fd74 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -329,6 +329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20121002',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
'like_count': int,
'dislike_count': int,
'start_time': 1,
@@ -343,7 +344,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
- 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+ 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+ 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+ 'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
}
@@ -1072,6 +1076,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
video_categories = None
+ video_tags = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
+
# description
video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
@@ -1260,6 +1268,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnail': video_thumbnail,
'description': video_description,
'categories': video_categories,
+ 'tags': video_tags,
'subtitles': video_subtitles,
'automatic_captions': automatic_captions,
'duration': video_duration,