diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/bbc.py | 34 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 11 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 11 |
4 files changed, 48 insertions, 16 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 0f0ea7cfd..3d9366644 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -458,6 +458,14 @@ class BBCIE(BBCCoUkIE): 'playlist_count': 9, 'skip': 'Save time', }, { + # article with multiple videos embedded with `new SMP()` + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': { + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BBC Blogs - Adam Curtis - BUGGER', + }, + 'playlist_count': 18, + }, { # single video embedded with mediaAssetPage.init() 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { @@ -644,12 +652,30 @@ class BBCIE(BBCCoUkIE): playlist_title = self._html_search_regex( r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title') - playlist_description = self._og_search_description(webpage) + playlist_description = self._og_search_description(webpage, default=None) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry, 'BBCCoUk') for entry in entries], + playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) - medias = list(filter(None, map( - lambda s: self._parse_json(s, playlist_id, fatal=False), - re.findall(r"data-media-meta='({[^']+})'", webpage)))) + medias = extract_all(r"data-media-meta='({[^']+})'") if not medias: # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 14b9b4fe2..d54866d1f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -181,6 +181,7 @@ class InfoExtractor(object): by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. start_time: Time in seconds where the reproduction should start, as @@ -630,6 +631,12 @@ class InfoExtractor(object): template % (content_re, property_re), ] + @staticmethod + def _meta_regex(prop): + return r'''(?isx)<meta + (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) + def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop @@ -660,9 +667,7 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name), + self._meta_regex(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d2efb22e..8cef61c3c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -276,14 +276,6 @@ class GenericIE(InfoExtractor): 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, }, - # BBC iPlayer embeds - { - 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', - 'info_dict': { - 'title': 'BBC - Blogs - Adam Curtis - BUGGER', - }, - 'playlist_mincount': 18, - }, # RUTV embed { 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0e411bfb6..4c449fd74 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,6 +329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -343,7 +344,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', + 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', + 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', } @@ -1072,6 +1076,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1260,6 +1268,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': video_thumbnail, 'description': video_description, 'categories': video_categories, + 'tags': video_tags, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, 'duration': video_duration, |