about summary refs log tree commit diff
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2014-05-15 12:41:42 +0200
committer Philipp Hagemeister <phihag@phihag.de> 2014-05-15 12:41:42 +0200
commit ad3bc6acd5d6724875b9fa59f9b5cdb9b904ec91 (patch)
tree ed3d097e27e7946c9a93fb9bd82b1072202e82ed
parent 5afa7f8beefcd9b34035f821ad1cecbcf49a6db8 (diff)
Document and test categories (#2923)
-rw-r--r-- youtube_dl/extractor/common.py 2
-rw-r--r-- youtube_dl/extractor/youtube.py 16
2 files changed, 11 insertions, 7 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 1e366a13c..db472aace 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -113,6 +113,8 @@ class InfoExtractor(object):
webpage_url: The URL of the video webpage; if given to youtube-dl, it
should allow getting the same result again. (It will be set
by YoutubeDL if it's missing)
+ categories: A list of categories that the video falls in, for example
+ ["Sports", "Berlin"]
Unless mentioned otherwise, the fields should be Unicode strings.
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index f0a92d182..3c8f7f7a2 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -242,7 +242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader": u"Philipp Hagemeister",
u"uploader_id": u"phihag",
u"upload_date": u"20121002",
- u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+ u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
+ u"categories": [u'Science & Technology'],
}
},
{
@@ -1136,18 +1137,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# upload date
upload_date = None
- mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
+ mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
if mobj is not None:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
-
- video_categories = []
- # categories
m_cat_container = get_element_by_id("eow-category", video_webpage)
if m_cat_container:
- video_categories = re.findall(r'<a[^<]+>(.*?)</a>',
- m_cat_container, re.DOTALL)
+ category = self._html_search_regex(
+ r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'cateory',
+ default=None)
+ video_categories = None if category is None else [category]
+ else:
+ video_categories = None
# description
video_description = get_element_by_id("eow-description", video_webpage)