diff options
author | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2013-11-06 21:42:33 +0100 |
---|---|---|
committer | Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com> | 2013-11-06 21:42:33 +0100 |
commit | 4f045eef8fe63bcb925faaa5108a42d00b83f93a (patch) | |
tree | c917deb3b19552732fe6e7905b933476f4995680 | |
parent | 5d7b253ea071e4bf16e99242a84f1b0eab2d6f32 (diff) |
[youtube:channel] Fix the extraction
The page don't include the 'load more' button anymore, now we directly get the 'c4_browse_ajax' pages.
-rw-r--r-- | youtube_dl/extractor/youtube.py | 31 |
1 files changed, 10 insertions, 21 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 74a381fe2..f745b8b14 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1572,7 +1572,6 @@ class YoutubePlaylistIE(InfoExtractor): class YoutubeChannelIE(InfoExtractor): IE_DESC = u'YouTube.com channels' _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" - _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' _MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' @@ -1593,30 +1592,20 @@ class YoutubeChannelIE(InfoExtractor): # Download channel page channel_id = mobj.group(1) video_ids = [] - pagenum = 1 - url = self._TEMPLATE_URL % (channel_id, pagenum) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) + # Download all channel pages using the json-based channel_ajax query + for pagenum in itertools.count(1): + url = self._MORE_PAGES_URL % (pagenum, channel_id) + page = self._download_webpage(url, channel_id, + u'Downloading page #%s' % pagenum) - # Extract video identifiers - ids_in_page = self.extract_videos_from_page(page) - video_ids.extend(ids_in_page) + page = json.loads(page) - # Download any subsequent channel pages using the json-based channel_ajax query - if self._MORE_PAGES_INDICATOR in page: - for pagenum in itertools.count(1): - url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) - - page = json.loads(page) - - ids_in_page = self.extract_videos_from_page(page['content_html']) - video_ids.extend(ids_in_page) + ids_in_page = self.extract_videos_from_page(page['content_html']) + video_ids.extend(ids_in_page) - if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: - break + if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: + break self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) |