diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2013-11-24 15:18:44 +0100 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2013-11-24 15:18:44 +0100 |
commit | e03db0a0773e078d9b677d396ad78362654956af (patch) | |
tree | 17192285892d996eb7b9819e79e04729e166e81a /youtube_dl/extractor | |
parent | a1ee09e815cb413d67cee17ad686224b26182dfb (diff) | |
parent | 267ed0c5d3547c68f1d34203c2ae4b0d826a29d9 (diff) |
Merge branch 'master' into opener-to-ydl
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/bandcamp.py | 53 | ||||
-rw-r--r-- | youtube_dl/extractor/brightcove.py | 18 | ||||
-rw-r--r-- | youtube_dl/extractor/clipfish.py | 53 | ||||
-rw-r--r-- | youtube_dl/extractor/collegehumor.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 10 | ||||
-rw-r--r-- | youtube_dl/extractor/howcast.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/mixcloud.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/niconico.py | 131 | ||||
-rw-r--r-- | youtube_dl/extractor/streamcloud.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/viki.py | 91 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 86 |
12 files changed, 377 insertions, 81 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 02f9e2546..1fbd10bc5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cinemassacre import CinemassacreIE +from .clipfish import ClipfishIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE @@ -98,6 +99,7 @@ from .nba import NBAIE from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE +from .niconico import NiconicoIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE @@ -156,6 +158,7 @@ from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .viki import VikiIE from .vk import VKIE from .wat import WatIE from .websurg import WeBSurgIE @@ -183,6 +186,7 @@ from .youtube import ( YoutubeTruncatedURLIE, YoutubeWatchLaterIE, YoutubeFavouritesIE, + YoutubeHistoryIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 359d4174b..3a32c14c5 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -20,28 +20,6 @@ class BandcampIE(InfoExtractor): u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad" }, u'skip': u'There is a limit of 200 free downloads / month for the test song' - }, { - u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', - u'playlist': [ - { - u'file': u'1353101989.mp3', - u'md5': u'39bc1eded3476e927c724321ddf116cf', - u'info_dict': { - u'title': u'Intro', - } - }, - { - u'file': u'38097443.mp3', - u'md5': u'1a2c32e2691474643e912cc6cd4bffaa', - u'info_dict': { - u'title': u'Kero One - Keep It Alive (Blazo remix)', - } - }, - ], - u'params': { - u'playlistend': 2 - }, - u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' }] def _real_extract(self, url): @@ -56,20 +34,17 @@ class BandcampIE(InfoExtractor): json_code = m_trackinfo.group(1) data = json.loads(json_code) - entries = [] for d in data: formats = [{ 'format_id': 'format_id', 'url': format_url, 'ext': format_id.partition('-')[0] } for format_id, format_url in sorted(d['file'].items())] - entries.append({ + return { 'id': compat_str(d['id']), 'title': d['title'], 'formats': formats, - }) - - return self.playlist_result(entries, title, title) + } else: raise ExtractorError(u'No free songs found') @@ -112,6 +87,30 @@ class BandcampAlbumIE(InfoExtractor): IE_NAME = u'Bandcamp:album' _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)' + _TEST = { + u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', + u'playlist': [ + { + u'file': u'1353101989.mp3', + u'md5': u'39bc1eded3476e927c724321ddf116cf', + u'info_dict': { + u'title': u'Intro', + } + }, + { + u'file': u'38097443.mp3', + u'md5': u'1a2c32e2691474643e912cc6cd4bffaa', + u'info_dict': { + u'title': u'Kero One - Keep It Alive (Blazo remix)', + } + }, + ], + u'params': { + u'playlistend': 2 + }, + u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d8c35465a..66fe0ac9a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -75,16 +75,22 @@ class BrightcoveIE(InfoExtractor): params = {'flashID': object_doc.attrib['id'], 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } - playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey') + def find_param(name): + node = find_xpath_attr(object_doc, './param', 'name', name) + if node is not None: + return node.attrib['value'] + return None + playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: - params['playerKey'] = playerKey.attrib['value'] - videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer') + params['playerKey'] = playerKey + # The three fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: - params['@videoPlayer'] = videoPlayer.attrib['value'] - linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL') + params['@videoPlayer'] = videoPlayer + linkBase = find_param('linkBaseURL') if linkBase is not None: - params['linkBaseURL'] = linkBase.attrib['value'] + params['linkBaseURL'] = linkBase data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py new file mode 100644 index 000000000..95449da3c --- /dev/null +++ b/youtube_dl/extractor/clipfish.py @@ -0,0 +1,53 @@ +import re +import time +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class ClipfishIE(InfoExtractor): + IE_NAME = u'clipfish' + + _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' + _TEST = { + u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/', + u'file': u'4028320.f4v', + u'md5': u'5e38bda8c329fbfb42be0386a3f5a382', + u'info_dict': { + u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect', + u'duration': 399, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % + (video_id, int(time.time()))) + info_xml = self._download_webpage( + info_url, video_id, note=u'Downloading info page') + doc = xml.etree.ElementTree.fromstring(info_xml) + title = doc.find('title').text + video_url = doc.find('filename').text + thumbnail = doc.find('imageurl').text + duration_str = doc.find('duration').text + m = re.match( + r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$', + duration_str) + if m: + duration = ( + (int(m.group('hours')) * 60 * 60) + + (int(m.group('minutes')) * 60) + + (int(m.group('seconds'))) + ) + else: + duration = None + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'duration': duration, + } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 0c29acfb1..b27c1dfc5 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - metaXml = self._download_webpage(xmlUrl, video_id, + mdoc = self._download_xml(xmlUrl, video_id, u'Downloading info XML', u'Unable to download video info XML') - mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] youtubeIdNode = videoNode.find('./youtubeID') @@ -65,11 +63,10 @@ class CollegeHumorIE(InfoExtractor): if next_url.endswith(u'manifest.f4m'): manifest_url = next_url + '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, + adoc = self._download_xml(manifest_url, video_id, u'Downloading XML manifest', u'Unable to download video info XML') - adoc = xml.etree.ElementTree.fromstring(manifestXml) try: video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text except IndexError: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 423e54cea..6ec835f8a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -4,6 +4,7 @@ import re import socket import sys import netrc +import xml.etree.ElementTree from ..utils import ( compat_http_client, @@ -208,6 +209,11 @@ class InfoExtractor(object): """ Returns the data of the page as a string """ return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] + def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'): + """Return the xml as an xml.etree.ElementTree.Element""" + xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def to_screen(self, msg): """Print msg to screen, prefixing it with '[ie_name]'""" self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) @@ -229,12 +235,14 @@ class InfoExtractor(object): self.to_screen(u'Logging in') #Methods for following #608 - def url_result(self, url, ie=None): + def url_result(self, url, ie=None, video_id=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} + if video_id is not None: + video_info['id'] = video_id return video_info def playlist_result(self, entries, playlist_id=None, playlist_title=None): """Returns a playlist""" diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 46954337f..bafc5826f 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor): _TEST = { u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', u'file': u'390161.mp4', - u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138', + u'md5': u'8b743df908c42f60cf6496586c7f12c3', u'info_dict': { u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.", u"title": u"How to Tie a Square Knot Properly" diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a200dcd74..e2baf44d7 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -60,7 +60,7 @@ class MixcloudIE(InfoExtractor): 'title': info['name'], 'url': final_song_url, 'ext': 'mp3', - 'description': info['description'], + 'description': info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], 'uploader_id': info['user']['username'], diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py new file mode 100644 index 000000000..729607ea3 --- /dev/null +++ b/youtube_dl/extractor/niconico.py @@ -0,0 +1,131 @@ +# encoding: utf-8 + +import re +import socket +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_http_client, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, + compat_str, + + ExtractorError, + unified_strdate, +) + + +class NiconicoIE(InfoExtractor): + IE_NAME = u'niconico' + IE_DESC = u'ニコニコ動画' + + _TEST = { + u'url': u'http://www.nicovideo.jp/watch/sm22312215', + u'file': u'sm22312215.mp4', + u'md5': u'd1a75c0823e2f629128c43e1212760f9', + u'info_dict': { + u'title': u'Big Buck Bunny', + u'uploader': u'takuya0301', + u'uploader_id': u'2698420', + u'upload_date': u'20131123', + u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + }, + u'params': { + u'username': u'ydl.niconico@gmail.com', + u'password': u'youtube-dl', + }, + } + + _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' + _NETRC_MACHINE = 'niconico' + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = True + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + # No authentication to be performed + if username is None: + if self._LOGIN_REQUIRED: + raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) + return False + + # Log in + login_form_strs = { + u'mail': username, + u'password': password, + } + # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode + # chokes on unicode + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) + login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + request = compat_urllib_request.Request( + u'https://secure.nicovideo.jp/secure/login', login_data) + login_results = self._download_webpage( + request, u'', note=u'Logging in', errnote=u'Unable to log in') + if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') + return False + return True + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + # Get video webpage. We are not actually interested in it, but need + # the cookies in order to be able to download the info webpage + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id) + + video_info_webpage = self._download_webpage( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, + note=u'Downloading video info page') + + # Get flv info + flv_info_webpage = self._download_webpage( + u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + video_id, u'Downloading flv info') + video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] + + # Start extracting information + video_info = xml.etree.ElementTree.fromstring(video_info_webpage) + video_title = video_info.find('.//title').text + video_extension = video_info.find('.//movie_type').text + video_format = video_extension.upper() + video_thumbnail = video_info.find('.//thumbnail_url').text + video_description = video_info.find('.//description').text + video_uploader_id = video_info.find('.//user_id').text + video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) + video_view_count = video_info.find('.//view_counter').text + video_webpage_url = video_info.find('.//watch_url').text + + # uploader + video_uploader = video_uploader_id + url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id + try: + user_info_webpage = self._download_webpage( + url, video_id, note=u'Downloading user information') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) + else: + user_info = xml.etree.ElementTree.fromstring(user_info_webpage) + video_uploader = user_info.find('.//nickname').text + + return { + 'id': video_id, + 'url': video_real_url, + 'title': video_title, + 'ext': video_extension, + 'format': video_format, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'uploader_id': video_uploader_id, + 'view_count': video_view_count, + 'webpage_url': video_webpage_url, + } diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index d476693ec..9faf3a5e3 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -21,6 +21,7 @@ class StreamcloudIE(InfoExtractor): u'title': u'youtube-dl test video \'/\\ ä ↭', u'duration': 9, }, + u'skip': u'Only available from the EU' } def _real_extract(self, url): diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py new file mode 100644 index 000000000..78d03c079 --- /dev/null +++ b/youtube_dl/extractor/viki.py @@ -0,0 +1,91 @@ +import re + +from ..utils import ( + unified_strdate, +) +from .subtitles import SubtitlesInfoExtractor + + +class VikiIE(SubtitlesInfoExtractor): + IE_NAME = u'viki' + + _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' + _TEST = { + u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14', + u'file': u'1023585v.mp4', + u'md5': u'a21454021c2646f5433514177e2caa5f', + u'info_dict': { + u'title': u'Heirs Episode 14', + u'uploader': u'SBS', + u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', + u'upload_date': u'20131121', + u'age_limit': 13, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + uploader = self._html_search_regex( + r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage, + u'uploader') + if uploader is not None: + uploader = uploader.strip() + + rating_str = self._html_search_regex( + r'<strong>Rating: </strong>\s*([^<]*)<', webpage, + u'rating information', default='').strip() + RATINGS = { + 'G': 0, + 'PG': 10, + 'PG-13': 13, + 'R': 16, + 'NC': 18, + } + age_limit = RATINGS.get(rating_str) + + info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id + info_webpage = self._download_webpage(info_url, video_id) + video_url = self._html_search_regex( + r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL') + + upload_date_str = self._html_search_regex( + r'"created_at":"([^"]+)"', info_webpage, u'upload date') + upload_date = ( + unified_strdate(upload_date_str) + if upload_date_str is not None + else None + ) + + # subtitles + video_subtitles = self.extract_subtitles(video_id, info_webpage) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, info_webpage) + return + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'description': description, + 'thumbnail': thumbnail, + 'age_limit': age_limit, + 'uploader': uploader, + 'subtitles': video_subtitles, + 'upload_date': upload_date, + } + + def _get_available_subtitles(self, video_id, info_webpage): + res = {} + for sturl in re.findall(r'<track src="([^"]+)"/>'): + m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) + if not m: + continue + res[m.group('lang')] = sturl + return res diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 41838237c..64d4c2445 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1510,7 +1510,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): }) return results -class YoutubePlaylistIE(InfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com playlists' _VALID_URL = r"""(?: (?:https?://)? @@ -1526,8 +1526,9 @@ class YoutubePlaylistIE(InfoExtractor): | ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' - _MAX_RESULTS = 50 + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' + _MORE_PAGES_INDICATOR = r'data-link-type="next"' + _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&' IE_NAME = u'youtube:playlist' @classmethod @@ -1535,6 +1536,9 @@ class YoutubePlaylistIE(InfoExtractor): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def _real_initialize(self): + self._login() + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -1548,45 +1552,28 @@ class YoutubePlaylistIE(InfoExtractor): video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) - return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube') + return self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - # Download playlist videos from API - videos = [] + # Extract the video ids from the playlist pages + ids = [] for page_num in itertools.count(1): - start_index = self._MAX_RESULTS * (page_num - 1) + 1 - if start_index >= 1000: - self._downloader.report_warning(u'Max number of results reached') - break - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) + url = self._TEMPLATE_URL % (playlist_id, page_num) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) + # The ids are duplicated + new_ids = orderedSet(re.findall(self._VIDEO_RE, page)) + ids.extend(new_ids) - try: - response = json.loads(page) - except ValueError as err: - raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) - - if 'feed' not in response: - raise ExtractorError(u'Got a malformed response from YouTube API') - playlist_title = response['feed']['title']['$t'] - if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS + if re.search(self._MORE_PAGES_INDICATOR, page) is None: break - for entry in response['feed']['entry']: - index = entry['yt$position']['$t'] - if 'media$group' in entry and 'yt$videoid' in entry['media$group']: - videos.append(( - index, - 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] - )) + playlist_title = self._og_search_title(page) - videos = [v[1] for v in sorted(videos)] - - url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] - return [self.playlist_result(url_results, playlist_id, playlist_title)] + url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + return self.playlist_result(url_results, playlist_id, playlist_title) class YoutubeChannelIE(InfoExtractor): @@ -1640,9 +1627,9 @@ class YoutubeChannelIE(InfoExtractor): self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] - url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls] - return [self.playlist_result(url_entries, channel_id)] + url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_entries, channel_id) class YoutubeUserIE(InfoExtractor): @@ -1706,9 +1693,11 @@ class YoutubeUserIE(InfoExtractor): if len(ids_in_page) < self._GDATA_PAGE_SIZE: break - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] - url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] - return [self.playlist_result(url_results, playlist_title = username)] + url_results = [ + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_results, playlist_title=username) + class YoutubeSearchIE(SearchInfoExtractor): IE_DESC = u'YouTube.com searches' @@ -1749,7 +1738,8 @@ class YoutubeSearchIE(SearchInfoExtractor): if len(video_ids) > n: video_ids = video_ids[:n] - videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] + videos = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): @@ -1809,7 +1799,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_html = info['feed_html'] m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) - feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) + feed_entries.extend( + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in ids) if info['paging'] is None: break return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) @@ -1834,6 +1826,20 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _PAGING_STEP = 100 _PERSONAL_FEED = True +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)' + _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PERSONAL_FEED = True + _PLAYLIST_TITLE = u'Youtube Watch History' + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') + data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') + # The step is actually a ridiculously big number (like 1374343569725646) + self._PAGING_STEP = int(data_paging) + return super(YoutubeHistoryIE, self)._real_extract(url) + class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' |