diff options
| author | Sergey M․ <dstftw@gmail.com> | 2019-08-13 05:02:52 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2019-08-13 05:02:52 +0700 | 
| commit | 351f37c022b24144c064fab39bd6d134e166c31c (patch) | |
| tree | a58f4404dedc63a453e5c233da6bbfc92133d740 | |
| parent | 3bce4ff7d96d845ec67ffe8e9e2715474f190d89 (diff) | |
[youtube:playlist] Improve flat extraction (closes #21927)
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 49 | 
1 files changed, 43 insertions, 6 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0a0d2f41a..b63f19bb0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -31,6 +31,7 @@ from ..utils import (      clean_html,      dict_get,      error_to_compat_str, +    extract_attributes,      ExtractorError,      float_or_none,      get_element_by_attribute, @@ -324,17 +325,18 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):          for video_id, video_title in self.extract_videos_from_page(content):              yield self.url_result(video_id, 'Youtube', video_id, video_title) -    def extract_videos_from_page(self, page): -        ids_in_page = [] -        titles_in_page = [] -        for mobj in re.finditer(self._VIDEO_RE, page): +    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): +        for mobj in re.finditer(video_re, page):              # The link with index 0 is not the first video of the playlist (not sure if still actual)              if 'index' in mobj.groupdict() and mobj.group('id') == '0':                  continue              video_id = mobj.group('id') -            video_title = unescapeHTML(mobj.group('title')) +            video_title = unescapeHTML( +                mobj.group('title')) if 'title' in mobj.groupdict() else None              if video_title:                  video_title = video_title.strip() +            if video_title == '► Play all': +                video_title = None              try:                  idx = ids_in_page.index(video_id)                  if video_title and not titles_in_page[idx]: @@ -342,6 +344,12 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):              except ValueError:                  ids_in_page.append(video_id)                  titles_in_page.append(video_title) + +    def extract_videos_from_page(self, page): +        ids_in_page = [] +        titles_in_page = [] +        self.extract_videos_from_page_impl( +            self._VIDEO_RE, page, ids_in_page, titles_in_page)          return zip(ids_in_page, titles_in_page) @@ -2438,7 +2446,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):                          (%(playlist_id)s)                       )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' -    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' +    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' +    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'      IE_NAME = 'youtube:playlist'      _TESTS = [{          'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -2603,6 +2612,34 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):      def _real_initialize(self):          self._login() +    def extract_videos_from_page(self, page): +        ids_in_page = [] +        titles_in_page = [] + +        for item in re.findall( +                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): +            attrs = extract_attributes(item) +            video_id = attrs['data-video-id'] +            video_title = unescapeHTML(attrs.get('data-title')) +            if video_title: +                video_title = video_title.strip() +            ids_in_page.append(video_id) +            titles_in_page.append(video_title) + +        # Fallback with old _VIDEO_RE +        self.extract_videos_from_page_impl( +            self._VIDEO_RE, page, ids_in_page, titles_in_page) + +        # Relaxed fallbacks +        self.extract_videos_from_page_impl( +            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, +            ids_in_page, titles_in_page) +        self.extract_videos_from_page_impl( +            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, +            ids_in_page, titles_in_page) + +        return zip(ids_in_page, titles_in_page) +      def _extract_mix(self, playlist_id):          # The mixes are generated from a single video          # the id of the playlist is just 'RD' + video_id  | 
