[youtube] Add support for downloading top lists (fixes #1868)

The extractor needs to know the channel and the title of the list, because the list IDs change every time you browse the channels and are tied to the 'VISITOR_INFO1_LIVE' cookie.
author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> 2013-11-30 14:56:51 +0100
committer: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> 2013-11-30 14:56:51 +0100
commit: 0a688bc0b28c970e9af965b3fa0c7927507eeb97 (patch)
tree: c4f6db17e33552a5dd58699ebe0e73bfaa284bb8
parent: b138de72f2f0fc197fe46154bcaeceddb5713e7f (diff)
download: youtube-dl-0a688bc0b28c970e9af965b3fa0c7927507eeb97.tar.xz
3 files changed, 44 insertions, 0 deletions
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
index 95f07d129..33db09f43 100644
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -15,6 +15,7 @@ from youtube_dl.extractor import (
     YoutubeIE,
     YoutubeChannelIE,
     YoutubeShowIE,
+    YoutubeTopListIE,
 )
 
 
@@ -116,5 +117,12 @@ class TestYoutubeLists(unittest.TestCase):
         original_video = entries[0]
         self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
 
+    def test_youtube_toplist(self):
+        # A top list is addressed as "yttoplist:{channel}:{list title}".
+        extractor = YoutubeTopListIE(FakeYDL())
+        playlist = extractor.extract('yttoplist:music:Top Tracks')
+        video_count = len(playlist['entries'])
+        self.assertTrue(video_count >= 9)
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 664639b53..0abf86e44 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -194,6 +194,7 @@ from .youtube import (
     YoutubeWatchLaterIE,
     YoutubeFavouritesIE,
     YoutubeHistoryIE,
+    YoutubeTopListIE,
 )
 from .zdf import ZDFIE
 
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 765b4a9bf..a1a4d896d 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1576,6 +1576,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         if len(playlist_id) == 13:  # 'RD' + 11 characters for the video id
             # Mixes require a custom extraction process
             return self._extract_mix(playlist_id)
+        if playlist_id.startswith('TL'):
+            raise ExtractorError(u'For downloading YouTube.com top lists, use '
+                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
 
         # Extract the video ids from the playlist pages
         ids = []
@@ -1598,6 +1601,38 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         return self.playlist_result(url_results, playlist_id, playlist_title)
 
 
+class YoutubeTopListIE(YoutubePlaylistIE):
+    """Extractor for "yttoplist:{channel}:{list title}" pseudo-URLs."""
+    IE_NAME = u'youtube:toplist'
+    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
+        u' (Example: "yttoplist:music:Top Tracks")')
+    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
+    _MAX_PAGE_ATTEMPTS = 5  # upper bound on downloads of the list page
+
+    def _real_extract(self, url):
+        """Find the list's link on the channel page and return its videos."""
+        channel, title = re.match(self._VALID_URL, url).group('chann', 'title')
+        query = compat_urllib_parse.urlencode({'title': title})
+        playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
+        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
+        link = self._html_search_regex(playlist_re, channel_page, u'list')
+        url = compat_urlparse.urljoin('https://www.youtube.com/', link)
+        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
+        # Sometimes the page lacks the videos; retry, but give up eventually
+        for i in range(self._MAX_PAGE_ATTEMPTS):
+            msg = u'Downloading Youtube top list'
+            if i > 0:
+                msg += u', retry #%d' % i
+            webpage = self._download_webpage(url, title, msg)
+            ids = orderedSet(re.findall(video_re, webpage))
+            if ids:
+                break
+        else:
+            raise ExtractorError(u'No videos found in top list "%s"' % title)
+        url_results = self._ids_to_results(ids)
+        return self.playlist_result(url_results, playlist_title=title)
+
+
 class YoutubeChannelIE(InfoExtractor):
     IE_DESC = u'YouTube.com channels'
     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>	2013-11-30 14:56:51 +0100
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>	2013-11-30 14:56:51 +0100
commit	0a688bc0b28c970e9af965b3fa0c7927507eeb97 (patch)
tree	c4f6db17e33552a5dd58699ebe0e73bfaa284bb8
parent	b138de72f2f0fc197fe46154bcaeceddb5713e7f (diff)
download	youtube-dl-0a688bc0b28c970e9af965b3fa0c7927507eeb97.tar.xz