[youtube] Extract video titles for channel playlist if possible (Closes #4971)

author: Sergey M․ <dstftw@gmail.com> 2015-04-12 23:19:00 +0600
committer: Sergey M․ <dstftw@gmail.com> 2015-04-12 23:19:00 +0600
commit: fb69240ca0934299583bf6c7a855d5c602a4a7e0 (patch)
tree: 5e75e7da0d43924de02cf27ea29496733605775e /youtube_dl/extractor/youtube.py
parent: 830d53bfae7a665b55656dd50c9f35f0d0b0161d (diff)
1 files changed, 20 insertions, 9 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 2774ec30b..791e1fe62 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1370,10 +1370,18 @@ class YoutubeChannelIE(InfoExtractor):
 
     def extract_videos_from_page(self, page):
         ids_in_page = []
-        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
-            if mobj.group(1) not in ids_in_page:
-                ids_in_page.append(mobj.group(1))
-        return ids_in_page
+        titles_in_page = []
+        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(mobj.group('title'))
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+        return zip(ids_in_page, titles_in_page)
 
     def _real_extract(self, url):
         channel_id = self._match_id(url)
@@ -1390,10 +1398,12 @@ class YoutubeChannelIE(InfoExtractor):
         if autogenerated:
             # The videos are contained in a single page
             # the ajax pages can't be used, they are empty
-            video_ids = self.extract_videos_from_page(channel_page)
+            videos = self.extract_videos_from_page(channel_page)
             entries = [
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in video_ids]
+                self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+                for video_id, video_title in videos]
             return self.playlist_result(entries, channel_id)
 
         def _entries():
@@ -1401,9 +1411,10 @@ class YoutubeChannelIE(InfoExtractor):
             for pagenum in itertools.count(1):
 
                 ids_in_page = self.extract_videos_from_page(content_html)
-                for video_id in ids_in_page:
+                for video_id, video_title in ids_in_page:
                     yield self.url_result(
-                        video_id, 'Youtube', video_id=video_id)
+                        video_id, 'Youtube', video_id=video_id,
+                        video_title=video_title)
 
                 mobj = re.search(
                     r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
author	Sergey M․ <dstftw@gmail.com>	2015-04-12 23:19:00 +0600
committer	Sergey M․ <dstftw@gmail.com>	2015-04-12 23:19:00 +0600
commit	fb69240ca0934299583bf6c7a855d5c602a4a7e0 (patch)
tree	5e75e7da0d43924de02cf27ea29496733605775e /youtube_dl/extractor/youtube.py
parent	830d53bfae7a665b55656dd50c9f35f0d0b0161d (diff)