diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r-- | youtube_dl/extractor/youtube.py | 73 |
1 files changed, 46 insertions, 27 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0301682b8..a3da56c14 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -234,6 +234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '44': {'ext': 'webm', 'width': 854, 'height': 480}, '45': {'ext': 'webm', 'width': 1280, 'height': 720}, '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480}, # 3d videos @@ -785,7 +787,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): s = mobj.group(1) dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) return '/signature/%s' % dec_s - dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) + dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url) dash_doc = self._download_xml( dash_manifest_url, video_id, note='Downloading DASH manifest', @@ -1290,7 +1292,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - more_widget_html = content_html = page for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): match = match.strip() @@ -1310,36 +1311,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): self.report_warning('Youtube gives an alert message: ' + match) # Extract the video ids from the playlist pages - ids = [] - - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - ids.extend(new_ids) - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break + def _entries(): + more_widget_html = content_html = page + for page_num in itertools.count(1): + matches = re.finditer(self._VIDEO_RE, content_html) + # We remove the duplicates and the link with index 0 + # (it's not the first video of the playlist) + new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') + for vid_id in new_ids: + yield self.url_result(vid_id, 'Youtube', video_id=vid_id) + + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: + break - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break + more_widget_html = more['load_more_widget_html'] playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', page, 'title') - url_results = self._ids_to_results(ids) - return self.playlist_result(url_results, playlist_id, playlist_title) + return self.playlist_result(_entries(), playlist_id, playlist_title) def _real_extract(self, url): # Extract playlist id @@ -1399,6 +1400,24 @@ class YoutubeChannelIE(InfoExtractor): channel_id = self._match_id(url) url = self._TEMPLATE_URL % channel_id + + # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) + # Workaround by extracting as a playlist if managed to obtain channel playlist URL + # otherwise fallback on channel by page extraction + channel_page = self._download_webpage( + url + '?view=57', channel_id, + 'Downloading channel page', fatal=False) + channel_playlist_id = self._html_search_meta( + 'channelId', channel_page, 'channel id', default=None) + if not channel_playlist_id: + channel_playlist_id = self._search_regex( + r'data-channel-external-id="([^"]+)"', + channel_page, 'channel id', default=None) + if channel_playlist_id and channel_playlist_id.startswith('UC'): + playlist_id = 'UU' + channel_playlist_id[2:] + return self.url_result( + compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') + channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') autogenerated = re.search(r'''(?x) class="[^"]*?(?: @@ -1487,7 +1506,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): for pagenum in itertools.count(1): url_query = { - 'search_query': query, + 'search_query': query.encode('utf-8'), 'page': pagenum, 'spf': 'navigate', } |