diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 221 | 
1 files changed, 172 insertions, 49 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e2a43299f..4aac2cc03 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -20,13 +20,13 @@ from ..compat import (      compat_urllib_parse_unquote,      compat_urllib_parse_unquote_plus,      compat_urllib_parse_urlparse, -    compat_urllib_request,      compat_urlparse,      compat_str,  )  from ..utils import (      clean_html,      encode_dict, +    error_to_compat_str,      ExtractorError,      float_or_none,      get_element_by_attribute, @@ -34,7 +34,9 @@ from ..utils import (      int_or_none,      orderedSet,      parse_duration, +    remove_quotes,      remove_start, +    sanitized_Request,      smuggle_url,      str_to_int,      unescapeHTML, @@ -114,7 +116,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):          login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii') -        req = compat_urllib_request.Request(self._LOGIN_URL, login_data) +        req = sanitized_Request(self._LOGIN_URL, login_data)          login_results = self._download_webpage(              req, None,              note='Logging in', errnote='unable to log in', fatal=False) @@ -147,7 +149,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii') -            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data) +            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)              tfa_results = self._download_webpage(                  tfa_req, None,                  note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) @@ -178,15 +180,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              return -class YoutubePlaylistBaseInfoExtractor(InfoExtractor): -    # Extract the video ids from the playlist pages +class YoutubeEntryListBaseInfoExtractor(InfoExtractor): +    # Extract entries from page with "Load more" button      def _entries(self, page, playlist_id):          more_widget_html = content_html = page          for page_num in itertools.count(1): -            for video_id, video_title in self.extract_videos_from_page(content_html): -                yield self.url_result( -                    video_id, 'Youtube', video_id=video_id, -                    video_title=video_title) +            for entry in self._process_page(content_html): +                yield entry              mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)              if not mobj: @@ -203,6 +203,12 @@ class YoutubePlaylistBaseInfoExtractor(InfoExtractor):                  break              more_widget_html = more['load_more_widget_html'] + +class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): +    def _process_page(self, content): +        for video_id, video_title in self.extract_videos_from_page(content): +            yield self.url_result(video_id, 'Youtube', video_id, video_title) +      def extract_videos_from_page(self, page):          ids_in_page = []          titles_in_page = [] @@ -224,6 +230,19 @@ class YoutubePlaylistBaseInfoExtractor(InfoExtractor):          return zip(ids_in_page, titles_in_page) +class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): +    def _process_page(self, content): +        for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content): +            yield self.url_result( +                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) +        webpage = self._download_webpage(url, playlist_id) +        title = self._og_search_title(webpage, fatal=False) +        return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) + +  class YoutubeIE(YoutubeBaseInfoExtractor):      IE_DESC = 'YouTube.com'      _VALID_URL = r"""(?x)^ @@ -241,7 +260,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                               |(?:                                             # or the v= param in all its forms                                   (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)                                   (?:\?|\#!?)                                  # the params delimiter ? or # or #! -                                 (?:.*?&)??                                   # any other preceding param (like /?s=tuff&v=xxxx) +                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)                                   v=                               )                           )) @@ -329,6 +348,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, +        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},          '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, @@ -377,12 +397,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20120506',                  'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', +                'alt_title': 'I Love It (feat. Charli XCX)',                  'description': 'md5:782e8651347686cba06e58f71ab51773',                  'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',                           'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',                           'iconic ep', 'iconic', 'love', 'it'],                  'uploader': 'Icona Pop',                  'uploader_id': 'IconaPop', +                'creator': 'Icona Pop',              }          },          { @@ -393,9 +415,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20130703',                  'title': 'Justin Timberlake - Tunnel Vision (Explicit)', +                'alt_title': 'Tunnel Vision',                  'description': 'md5:64249768eec3bc4276236606ea996373',                  'uploader': 'justintimberlakeVEVO',                  'uploader_id': 'justintimberlakeVEVO', +                'creator': 'Justin Timberlake',                  'age_limit': 18,              }          }, @@ -409,7 +433,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',                  'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',                  'uploader': 'SET India', -                'uploader_id': 'setindia' +                'uploader_id': 'setindia', +                'age_limit': 18,              }          },          { @@ -473,10 +498,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'id': 'nfWlot6h_JM',                  'ext': 'm4a',                  'title': 'Taylor Swift - Shake It Off', +                'alt_title': 'Shake It Off',                  'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',                  'uploader': 'TaylorSwiftVEVO',                  'uploader_id': 'TaylorSwiftVEVO',                  'upload_date': '20140818', +                'creator': 'Taylor Swift',              },              'params': {                  'youtube_include_dash_manifest': True, @@ -532,9 +559,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20100430',                  'uploader_id': 'deadmau5', +                'creator': 'deadmau5',                  'description': 'md5:12c56784b8032162bb936a5f76d55360',                  'uploader': 'deadmau5',                  'title': 'Deadmau5 - Some Chords (HD)', +                'alt_title': 'Some Chords',              },              'expected_warnings': [                  'DASH manifest missing', @@ -546,7 +575,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'info_dict': {                  'id': 'lqQg6PlCWgI',                  'ext': 'mp4', -                'upload_date': '20120724', +                'upload_date': '20150827',                  'uploader_id': 'olympic',                  'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',                  'uploader': 'Olympics', @@ -674,6 +703,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          {              'url': 'http://vid.plus/FlRa-iH7PGw',              'only_matching': True, +        }, +        { +            # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) +            'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', +            'info_dict': { +                'id': 'lsguqyKfVQg', +                'ext': 'mp4', +                'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', +                'alt_title': 'Dark Walk', +                'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', +                'upload_date': '20151119', +                'uploader_id': 'IronSoulElf', +                'uploader': 'IronSoulElf', +                'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        { +            # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468) +            'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', +            'only_matching': True, +        }, +        { +            # Video with yt:stretch=17:0 +            'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', +            'info_dict': { +                'id': 'Q39EVAstoRM', +                'ext': 'mp4', +                'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', +                'description': 'md5:ee18a25c350637c8faff806845bddee9', +                'upload_date': '20151107', +                'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', +                'uploader': 'CH GAMER DROID', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        { +            'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', +            'only_matching': True,          }      ] @@ -703,7 +775,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      def _extract_signature_function(self, video_id, player_url, example_sig):          id_m = re.match( -            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P<ext>[a-z]+)$', +            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',              player_url)          if not id_m:              raise ExtractorError('Cannot identify player %r' % player_url) @@ -832,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,                  video_id, note=False)          except ExtractorError as err: -            self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) +            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))              return {}          sub_lang_list = {} @@ -858,16 +930,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              return {}          return sub_lang_list +    def _get_ytplayer_config(self, video_id, webpage): +        patterns = ( +            # User data may contain arbitrary character sequences that may affect +            # JSON extraction with regex, e.g. when '};' is contained the second +            # regex won't capture the whole JSON. Yet working around by trying more +            # concrete regex first keeping in mind proper quoted string handling +            # to be implemented in future that will replace this workaround (see +            # https://github.com/rg3/youtube-dl/issues/7468, +            # https://github.com/rg3/youtube-dl/pull/7599) +            r';ytplayer\.config\s*=\s*({.+?});ytplayer', +            r';ytplayer\.config\s*=\s*({.+?});', +        ) +        config = self._search_regex( +            patterns, webpage, 'ytplayer.config', default=None) +        if config: +            return self._parse_json( +                uppercase_escape(config), video_id, fatal=False) +      def _get_automatic_captions(self, video_id, webpage):          """We need the webpage for getting the captions url, pass it as an             argument to speed up the process."""          self.to_screen('%s: Looking for automatic captions' % video_id) -        mobj = re.search(r';ytplayer.config = ({.*?});', webpage) +        player_config = self._get_ytplayer_config(video_id, webpage)          err_msg = 'Couldn\'t find automatic captions for %s' % video_id -        if mobj is None: +        if not player_config:              self._downloader.report_warning(err_msg)              return {} -        player_config = json.loads(mobj.group(1))          try:              args = player_config['args']              caption_url = args['ttsurl'] @@ -1074,10 +1163,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              age_gate = False              video_info = None              # Try looking directly into the video webpage -            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) -            if mobj: -                json_code = uppercase_escape(mobj.group(1)) -                ytplayer_config = json.loads(json_code) +            ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) +            if ytplayer_config:                  args = ytplayer_config['args']                  if args.get('url_encoded_fmt_stream_map'):                      # Convert to the same format returned by compat_parse_qs @@ -1233,6 +1320,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())          upload_date = unified_strdate(upload_date) +        m_music = re.search( +            r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', +            video_webpage) +        if m_music: +            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) +            video_creator = clean_html(m_music.group('creator')) +        else: +            video_alt_title = video_creator = None +          m_cat_container = self._search_regex(              r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',              video_webpage, 'categories', default=None) @@ -1343,7 +1439,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                  player_desc = 'flash player %s' % player_version                              else:                                  player_version = self._search_regex( -                                    r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', +                                    [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],                                      player_url,                                      'html5 player', fatal=False)                                  player_desc = 'html5 player %s' % player_version @@ -1405,6 +1501,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              manifest_url = video_info['hlsvp'][0]              url_map = self._extract_from_m3u8(manifest_url, video_id)              formats = _map_to_format_list(url_map) +            # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming +            for a_format in formats: +                a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'          else:              raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') @@ -1442,10 +1541,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',              video_webpage)          if stretched_m: -            ratio = float(stretched_m.group('w')) / float(stretched_m.group('h')) -            for f in formats: -                if f.get('vcodec') != 'none': -                    f['stretched_ratio'] = ratio +            w = float(stretched_m.group('w')) +            h = float(stretched_m.group('h')) +            # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). +            # We will only process correct ratios. +            if w > 0 and h > 0: +                ratio = w / h +                for f in formats: +                    if f.get('vcodec') != 'none': +                        f['stretched_ratio'] = ratio          self._sort_formats(formats) @@ -1454,7 +1558,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'uploader': video_uploader,              'uploader_id': video_uploader_id,              'upload_date': upload_date, +            'creator': video_creator,              'title': video_title, +            'alt_title': video_alt_title,              'thumbnail': video_thumbnail,              'description': video_description,              'categories': video_categories, @@ -1484,7 +1590,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract                          youtube\.com/                          (?:                             (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) -                           \? (?:.*?&)*? (?:p|a|list)= +                           \? (?:.*?[&;])*? (?:p|a|list)=                          |  p/                          )                          ( @@ -1615,7 +1721,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract                  self.report_warning('Youtube gives an alert message: ' + match)          playlist_title = self._html_search_regex( -            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', +            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',              page, 'title')          return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) @@ -1669,6 +1775,10 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):          },      }] +    @classmethod +    def suitable(cls, url): +        return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) +      def _real_extract(self, url):          channel_id = self._match_id(url) @@ -1742,6 +1852,36 @@ class YoutubeUserIE(YoutubeChannelIE):              return super(YoutubeUserIE, cls).suitable(url) +class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): +    IE_DESC = 'YouTube.com user/channel playlists' +    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' +    IE_NAME = 'youtube:playlists' + +    _TESTS = [{ +        'url': 'http://www.youtube.com/user/ThirstForScience/playlists', +        'playlist_mincount': 4, +        'info_dict': { +            'id': 'ThirstForScience', +            'title': 'Thirst for Science', +        }, +    }, { +        # with "Load more" button +        'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', +        'playlist_mincount': 70, +        'info_dict': { +            'id': 'igorkle1', +            'title': 'Игорь Клейнер', +        }, +    }, { +        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', +        'playlist_mincount': 17, +        'info_dict': { +            'id': 'UCiU1dHvZObB2iP6xkJ__Icw', +            'title': 'Chem Player', +        }, +    }] + +  class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):      IE_DESC = 'YouTube.com searches'      # there doesn't appear to be a real limit, for example if you search for @@ -1837,7 +1977,7 @@ class YoutubeSearchURLIE(InfoExtractor):          } -class YoutubeShowIE(InfoExtractor): +class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):      IE_DESC = 'YouTube.com (multi-season) shows'      _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'      IE_NAME = 'youtube:show' @@ -1851,26 +1991,9 @@ class YoutubeShowIE(InfoExtractor):      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        playlist_id = mobj.group('id') -        webpage = self._download_webpage( -            'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage') -        # There's one playlist for each season of the show -        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) -        self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) -        entries = [ -            self.url_result( -                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist') -            for season in m_seasons -        ] -        title = self._og_search_title(webpage, fatal=False) - -        return { -            '_type': 'playlist', -            'id': playlist_id, -            'title': title, -            'entries': entries, -        } +        playlist_id = self._match_id(url) +        return super(YoutubeShowIE, self)._real_extract( +            'https://www.youtube.com/show/%s/playlists' % playlist_id)  class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): | 
