diff options
author | Sergey M․ <dstftw@gmail.com> | 2015-04-12 23:19:00 +0600 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2015-04-12 23:19:00 +0600 |
commit | fb69240ca0934299583bf6c7a855d5c602a4a7e0 (patch) | |
tree | 5e75e7da0d43924de02cf27ea29496733605775e | |
parent | 830d53bfae7a665b55656dd50c9f35f0d0b0161d (diff) |
[youtube] Extract video titles for channel playlist if possible (Closes #4971)
-rw-r--r-- | youtube_dl/extractor/youtube.py | 29 |
1 files changed, 20 insertions, 9 deletions
```diff
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 2774ec30b..791e1fe62 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1370,10 +1370,18 @@ class YoutubeChannelIE(InfoExtractor):
 
     def extract_videos_from_page(self, page):
         ids_in_page = []
-        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
-            if mobj.group(1) not in ids_in_page:
-                ids_in_page.append(mobj.group(1))
-        return ids_in_page
+        titles_in_page = []
+        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(mobj.group('title'))
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+        return zip(ids_in_page, titles_in_page)
 
     def _real_extract(self, url):
         channel_id = self._match_id(url)
@@ -1390,10 +1398,12 @@ class YoutubeChannelIE(InfoExtractor):
         if autogenerated:
             # The videos are contained in a single page
             # the ajax pages can't be used, they are empty
-            video_ids = self.extract_videos_from_page(channel_page)
+            videos = self.extract_videos_from_page(channel_page)
             entries = [
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in video_ids]
+                self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+                for video_id, video_title in videos]
             return self.playlist_result(entries, channel_id)
 
         def _entries():
@@ -1401,9 +1411,10 @@ class YoutubeChannelIE(InfoExtractor):
             for pagenum in itertools.count(1):
 
                 ids_in_page = self.extract_videos_from_page(content_html)
-                for video_id in ids_in_page:
+                for video_id, video_title in ids_in_page:
                     yield self.url_result(
-                        video_id, 'Youtube', video_id=video_id)
+                        video_id, 'Youtube', video_id=video_id,
+                        video_title=video_title)
 
                 mobj = re.search(
                     r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
```