about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sergey M․ <dstftw@gmail.com> 2015-04-12 23:19:00 +0600
committer Sergey M․ <dstftw@gmail.com> 2015-04-12 23:19:00 +0600
commit fb69240ca0934299583bf6c7a855d5c602a4a7e0 (patch)
tree 5e75e7da0d43924de02cf27ea29496733605775e
parent 830d53bfae7a665b55656dd50c9f35f0d0b0161d (diff)
[youtube] Extract video titles for channel playlist if possible (Closes #4971)
-rw-r--r-- youtube_dl/extractor/youtube.py 29
1 file changed, 20 insertions, 9 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 2774ec30b..791e1fe62 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1370,10 +1370,18 @@ class YoutubeChannelIE(InfoExtractor):
def extract_videos_from_page(self, page):
ids_in_page = []
- for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(mobj.group(1))
- return ids_in_page
+ titles_in_page = []
+ for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
+ video_id = mobj.group('id')
+ video_title = unescapeHTML(mobj.group('title'))
+ try:
+ idx = ids_in_page.index(video_id)
+ if video_title and not titles_in_page[idx]:
+ titles_in_page[idx] = video_title
+ except ValueError:
+ ids_in_page.append(video_id)
+ titles_in_page.append(video_title)
+ return zip(ids_in_page, titles_in_page)
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -1390,10 +1398,12 @@ class YoutubeChannelIE(InfoExtractor):
if autogenerated:
# The videos are contained in a single page
# the ajax pages can't be used, they are empty
- video_ids = self.extract_videos_from_page(channel_page)
+ videos = self.extract_videos_from_page(channel_page)
entries = [
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
+ self.url_result(
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
+ for video_id, video_title in videos]
return self.playlist_result(entries, channel_id)
def _entries():
@@ -1401,9 +1411,10 @@ class YoutubeChannelIE(InfoExtractor):
for pagenum in itertools.count(1):
ids_in_page = self.extract_videos_from_page(content_html)
- for video_id in ids_in_page:
+ for video_id, video_title in ids_in_page:
yield self.url_result(
- video_id, 'Youtube', video_id=video_id)
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
mobj = re.search(
r'data-uix-load-more-href="/?(?P<more>[^"]+)"',