aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
authorSergey M․ <dstftw@gmail.com>2015-07-29 03:43:55 +0600
committerSergey M․ <dstftw@gmail.com>2015-07-29 03:43:55 +0600
commit23e7f53bd3c75ead7d2e474adff9df47b5e1c34a (patch)
treed801ec84d217b86fcac647bb878b5dede6d6a531 /youtube_dl/extractor
parent5d8df28d2728299d5550355f929e1282c0301464 (diff)
parent000b6b5ae5cc214906effe4ac5b78b579bc7db70 (diff)
Merge branch 'purdeaandrei-save_tags_simpler_only_saves_tags_to_info_json'
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/common.py11
-rw-r--r--youtube_dl/extractor/youtube.py11
2 files changed, 18 insertions, 4 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 14b9b4fe2..d54866d1f 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -181,6 +181,7 @@ class InfoExtractor(object):
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
+ tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
start_time: Time in seconds where the reproduction should start, as
@@ -630,6 +631,12 @@ class InfoExtractor(object):
template % (content_re, property_re),
]
+ @staticmethod
+ def _meta_regex(prop):
+ return r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
@@ -660,9 +667,7 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ self._meta_regex(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0e411bfb6..4c449fd74 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -329,6 +329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20121002',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
'like_count': int,
'dislike_count': int,
'start_time': 1,
@@ -343,7 +344,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
- 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+ 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+ 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+ 'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
}
@@ -1072,6 +1076,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
video_categories = None
+ video_tags = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
+
# description
video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
@@ -1260,6 +1268,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnail': video_thumbnail,
'description': video_description,
'categories': video_categories,
+ 'tags': video_tags,
'subtitles': video_subtitles,
'automatic_captions': automatic_captions,
'duration': video_duration,