diff options
| author | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2014-03-05 13:22:10 +0100 | 
|---|---|---|
| committer | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2014-03-05 13:22:10 +0100 | 
| commit | ca1fee34f22e3fac9cc7a55c55c7aa7519f788b3 (patch) | |
| tree | 43149ad6dffd451c3d19a85c8abbaece15f5b9ef | |
| parent | 6dadaa99300ae7a123b2ca5bd306ada32f48a632 (diff) | |
[ted] Fix playlist extraction and add a test
| -rw-r--r-- | test/test_playlists.py | 10 |
| -rw-r--r-- | youtube_dl/extractor/ted.py | 37 |
2 files changed, 28 insertions, 19 deletions
| diff --git a/test/test_playlists.py b/test/test_playlists.py index 07c85b322..4bd815a0e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -36,6 +36,7 @@ from youtube_dl.extractor import (      RutubeChannelIE,      GoogleSearchIE,      GenericIE, +    TEDIE,  ) @@ -259,5 +260,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['title'], 'Zero Punctuation')          self.assertTrue(len(result['entries']) > 10) +    def test_ted_playlist(self): +        dl = FakeYDL() +        ie = TEDIE(dl) +        result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], '10') +        self.assertEqual(result['title'], 'Who are the hackers?') +        self.assertTrue(len(result['entries']) >= 6) +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 10cb1e4be..f3cb85ab0 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -6,6 +6,7 @@ import re  from .subtitles import SubtitlesInfoExtractor  from ..utils import ( +    compat_str,      RegexNotFoundError,  ) @@ -13,7 +14,7 @@ from ..utils import (  class TEDIE(SubtitlesInfoExtractor):      _VALID_URL=r'''(?x)http://www\.ted\.com/                     ( -                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist +                        (?P<type_playlist>playlists(?:/\d+)?) 
# We have a playlist                          |                          ((?P<type_talk>talks)) # We have a simple talk                     ) @@ -37,35 +38,35 @@ class TEDIE(SubtitlesInfoExtractor):          'high': 3,      } +    def _extract_info(self, webpage): +        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>', webpage, 'info json') +        return json.loads(info_json) +      def _real_extract(self, url):          m=re.match(self._VALID_URL, url, re.VERBOSE)          if m.group('type_talk'):              return self._talk_info(url)          else : -            playlist_id=m.group('playlist_id')              name=m.group('name') -            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) -            return [self._playlist_videos_info(url,name,playlist_id)] +            return self._playlist_videos_info(url, name) -    def _playlist_videos_info(self, url, name, playlist_id): +    def _playlist_videos_info(self, url, name):          '''Returns the videos of the playlist''' -        webpage = self._download_webpage( -            url, playlist_id, 'Downloading playlist webpage') -        matches = re.finditer( -            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>', -            webpage) - -        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>', -                                                 webpage, 'playlist title') +        webpage = self._download_webpage(url, name, +            'Downloading playlist webpage') +        info = self._extract_info(webpage) +        playlist_info = info['playlist']          playlist_entries = [ -            self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED') -            for m in matches +            self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key()) +            for talk in info['talks']          ]          return self.playlist_result( -     
       playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title) +            playlist_entries, +            playlist_id=compat_str(playlist_info['id']), +            playlist_title=playlist_info['title'])      def _talk_info(self, url, video_id=0):          """Return the video for the talk in the url""" @@ -74,9 +75,7 @@ class TEDIE(SubtitlesInfoExtractor):          webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)          self.report_extraction(video_name) -        info_json = self._search_regex(r'"talkPage.init",({.+})\)</script>', webpage, 'info json') -        info = json.loads(info_json) -        talk_info = info['talks'][0] +        talk_info = self._extract_info(webpage)['talks'][0]          formats = [{              'ext': 'mp4', | 
