-rw-r--r--   AUTHORS                            |   1
-rw-r--r--   test/test_utils.py                 |   9
-rw-r--r--   youtube_dl/compat.py               |   6
-rw-r--r--   youtube_dl/extractor/bbc.py        |  47
-rw-r--r--   youtube_dl/extractor/common.py     |  19
-rw-r--r--   youtube_dl/extractor/gdcvault.py   |  33
-rw-r--r--   youtube_dl/extractor/soundcloud.py | 135
-rw-r--r--   youtube_dl/extractor/vidme.py      |  37
-rw-r--r--   youtube_dl/extractor/viewster.py   |   7
-rw-r--r--   youtube_dl/extractor/youtube.py    | 142
-rw-r--r--   youtube_dl/utils.py                |  17
11 files changed, 361 insertions(+), 92 deletions(-)
@@ -135,3 +135,4 @@ Bernhard Minks
 sceext
 Zach Bruggeman
 Tjark Saul
+slangangular
diff --git a/test/test_utils.py b/test/test_utils.py
index 65692a9fb..a759b2da9 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -235,12 +235,21 @@ class TestUtil(unittest.TestCase):
             <node x="a"/>
             <node x="a" y="c" />
             <node x="b" y="d" />
+            <node x="" />
         </root>'''
         doc = xml.etree.ElementTree.fromstring(testxml)
 
+        self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
         self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'n'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'n', 'v'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x'), doc[1])
         self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'b'), doc[3])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'y'), doc[2])
         self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'd'), doc[3])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x', ''), doc[4])
 
     def test_xpath_with_ns(self):
         testxml = '''<root xmlns:media="http://example.com/">
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 0c57c7aeb..e4b9286c0 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -43,6 +43,11 @@ except ImportError:  # Python 2
     import cookielib as compat_cookiejar
 
 try:
+    import http.cookies as compat_cookies
+except ImportError:  # Python 2
+    import Cookie as compat_cookies
+
+try:
     import html.entities as compat_html_entities
 except ImportError:  # Python 2
     import htmlentitydefs as compat_html_entities
@@ -436,6 +441,7 @@ __all__ = [
     'compat_basestring',
     'compat_chr',
     'compat_cookiejar',
+    'compat_cookies',
     'compat_expanduser',
     'compat_get_terminal_size',
     'compat_getenv',
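The compat_cookies alias added above simply exposes the standard cookie-parsing module under one name (http.cookies on Python 3, Cookie on Python 2). A minimal sketch of parsing a serialized Cookie header with it; the header value here is made up for illustration:

try:
    import http.cookies as compat_cookies  # Python 3
except ImportError:  # Python 2
    import Cookie as compat_cookies

# Parse a raw Cookie header into named morsels.
cookies = compat_cookies.SimpleCookie()
cookies.load('api_token=abc123; session=xyz')  # hypothetical header value
print(cookies['api_token'].value)  # -> 'abc123'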
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 01d07c9c0..9a1b6e3dc 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -20,7 +20,9 @@ class BBCCoUkIE(InfoExtractor):
     IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
 
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+    ]
 
     _TESTS = [
         {
@@ -162,6 +164,10 @@ class BBCCoUkIE(InfoExtractor):
         }
     ]
 
+    class MediaSelectionError(Exception):
+        def __init__(self, id):
+            self.id = id
+
     def _extract_asx_playlist(self, connection, programme_id):
         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
@@ -212,8 +218,7 @@ class BBCCoUkIE(InfoExtractor):
     def _extract_medias(self, media_selection):
         error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
         if error is not None:
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
+            raise BBCCoUkIE.MediaSelectionError(error.get('id'))
         return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
 
     def _extract_connections(self, media):
@@ -270,9 +275,23 @@ class BBCCoUkIE(InfoExtractor):
         ]
         return subtitles
 
+    def _raise_extractor_error(self, media_selection_error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+            expected=True)
+
     def _download_media_selector(self, programme_id):
-        return self._download_media_selector_url(
-            self._MEDIASELECTOR_URL % programme_id, programme_id)
+        last_exception = None
+        for mediaselector_url in self._MEDIASELECTOR_URLS:
+            try:
+                return self._download_media_selector_url(
+                    mediaselector_url % programme_id, programme_id)
+            except BBCCoUkIE.MediaSelectionError as e:
+                if e.id == 'notukerror':
+                    last_exception = e
+                    continue
+                self._raise_extractor_error(e)
+        self._raise_extractor_error(last_exception)
 
     def _download_media_selector_url(self, url, programme_id=None):
         try:
@@ -297,7 +316,6 @@ class BBCCoUkIE(InfoExtractor):
                 formats.extend(self._extract_video(media, programme_id))
             elif kind == 'captions':
                 subtitles = self.extract_subtitles(media, programme_id)
-
         return formats, subtitles
 
     def _download_playlist(self, playlist_id):
@@ -426,9 +444,14 @@ class BBCIE(BBCCoUkIE):
     IE_DESC = 'BBC'
     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 
-    # fails with notukerror for some videos
-    # _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
-    _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
+    _MEDIASELECTOR_URLS = [
+        # Provides more formats, namely direct mp4 links, but fails on some videos with
+        # notukerror for non UK (?) users (e.g.
+        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+        # Provides fewer formats, but works everywhere for everybody (hopefully)
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+    ]
 
     _TESTS = [{
         # article with multiple videos embedded with data-media-meta containing
@@ -463,7 +486,7 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.com/news/world-europe-32041533',
         'info_dict': {
             'id': 'p02mprgb',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
             'duration': 47,
             'timestamp': 1427219242,
@@ -523,7 +546,7 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
         'info_dict': {
             'id': 'p018zqqg',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Hyundai Santa Fe Sport: Rock star',
             'description': 'md5:b042a26142c4154a6e472933cf20793d',
             'timestamp': 1368473503,
@@ -538,7 +561,7 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.com/sport/0/football/33653409',
         'info_dict': {
             'id': 'p02xycnp',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
             'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
             'duration': 140,
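The BBC change above turns the single media-selector endpoint into an ordered list that is tried until one succeeds, falling through to the next URL only on the geo-restriction error id 'notukerror'. A stripped-down sketch of that retry pattern, detached from the extractor plumbing (fetch is a placeholder callable and the URLs are dummies):

class MediaSelectionError(Exception):
    def __init__(self, id):
        self.id = id


MEDIASELECTOR_URLS = [
    'http://mediaselector.example/4/%s',  # placeholder endpoints
    'http://mediaselector.example/5/%s',
]


def download_media_selector(programme_id, fetch):
    # Try each endpoint in order; only the geo error falls through.
    last_exception = None
    for url_template in MEDIASELECTOR_URLS:
        try:
            return fetch(url_template % programme_id)
        except MediaSelectionError as e:
            if e.id == 'notukerror':
                last_exception = e
                continue
            raise
    raise last_exception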
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 14b9b4fe2..dc5080504 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,10 +14,12 @@ import xml.etree.ElementTree
 
 from ..compat import (
     compat_cookiejar,
+    compat_cookies,
     compat_HTTPError,
     compat_http_client,
     compat_urllib_error,
     compat_urllib_parse_urlparse,
+    compat_urllib_request,
     compat_urlparse,
     compat_str,
 )
@@ -181,6 +183,7 @@ class InfoExtractor(object):
                     by YoutubeDL if it's missing)
     categories:     A list of categories that the video falls in, for example
                     ["Sports", "Berlin"]
+    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
     is_live:        True, False, or None (=unknown). Whether this video is a
                     live stream that goes on instead of a fixed-length video.
     start_time:     Time in seconds where the reproduction should start, as
@@ -630,6 +633,12 @@ class InfoExtractor(object):
             template % (content_re, property_re),
         ]
 
+    @staticmethod
+    def _meta_regex(prop):
+        return r'''(?isx)<meta
+                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
     def _og_search_property(self, prop, html, name=None, **kargs):
         if name is None:
             name = 'OpenGraph %s' % prop
@@ -660,9 +669,7 @@ class InfoExtractor(object):
         if display_name is None:
             display_name = name
         return self._html_search_regex(
-            r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
-                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+            self._meta_regex(name),
             html, display_name, fatal=fatal, group='content', **kwargs)
 
     def _dc_search_uploader(self, html):
@@ -1069,6 +1076,12 @@ class InfoExtractor(object):
             None, '/', True, False, expire_time, '', None, None, None)
         self._downloader.cookiejar.set_cookie(cookie)
 
+    def _get_cookies(self, url):
+        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+        req = compat_urllib_request.Request(url)
+        self._downloader.cookiejar.add_cookie_header(req)
+        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
     def get_testcases(self, include_onlymatching=False):
         t = getattr(self, '_TEST', None)
         if t:
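The new _get_cookies helper works by letting the downloader's cookiejar write its Cookie header onto a throwaway request and then parsing that header back into individual morsels. Roughly the same idea expressed with plain Python 3 stdlib names (a standalone sketch, not the youtube-dl code itself; the api_token name is only an example):

import http.cookiejar
import http.cookies
import urllib.request


def get_cookies(cookiejar, url):
    # The cookiejar decides which cookies apply to `url` and writes the
    # Cookie header; SimpleCookie then splits it into named morsels.
    req = urllib.request.Request(url)
    cookiejar.add_cookie_header(req)
    return http.cookies.SimpleCookie(req.get_header('Cookie'))


jar = http.cookiejar.CookieJar()
# ... after earlier requests have populated `jar` ...
cookies = get_cookies(jar, 'https://example.com/')
token = cookies['api_token'].value if 'api_token' in cookies else None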
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 43f916412..a6834db43 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -7,7 +7,10 @@ from ..compat import (
     compat_urllib_parse,
     compat_urllib_request,
 )
-from ..utils import remove_end
+from ..utils import (
+    remove_end,
+    HEADRequest,
+)
 
 
 class GDCVaultIE(InfoExtractor):
@@ -73,10 +76,20 @@ class GDCVaultIE(InfoExtractor):
         return video_formats
 
     def _parse_flv(self, xml_description):
-        video_formats = []
+        formats = []
         akamai_url = xml_description.find('./metadata/akamaiHost').text
+        audios = xml_description.find('./metadata/audios')
+        if audios is not None:
+            for audio in audios:
+                formats.append({
+                    'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+                    'play_path': remove_end(audio.get('url'), '.flv'),
+                    'ext': 'flv',
+                    'vcodec': 'none',
+                    'format_id': audio.get('code'),
+                })
         slide_video_path = xml_description.find('./metadata/slideVideo').text
-        video_formats.append({
+        formats.append({
             'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
             'play_path': remove_end(slide_video_path, '.flv'),
             'ext': 'flv',
@@ -86,7 +99,7 @@ class GDCVaultIE(InfoExtractor):
             'format_id': 'slides',
         })
         speaker_video_path = xml_description.find('./metadata/speakerVideo').text
-        video_formats.append({
+        formats.append({
             'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
             'play_path': remove_end(speaker_video_path, '.flv'),
             'ext': 'flv',
@@ -95,7 +108,7 @@ class GDCVaultIE(InfoExtractor):
             'preference': -1,
             'format_id': 'speaker',
         })
-        return video_formats
+        return formats
 
     def _login(self, webpage_url, display_id):
         (username, password) = self._get_login_info()
@@ -133,16 +146,18 @@ class GDCVaultIE(InfoExtractor):
             r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
             start_page, 'url', default=None)
         if direct_url:
-            video_url = 'http://www.gdcvault.com/' + direct_url
             title = self._html_search_regex(
                 r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>',
                 start_page, 'title')
+            video_url = 'http://www.gdcvault.com' + direct_url
+            # resolve the url so that we can detect the correct extension
+            head = self._request_webpage(HEADRequest(video_url), video_id)
+            video_url = head.geturl()
 
             return {
                 'id': video_id,
                 'display_id': display_id,
                 'url': video_url,
-                'ext': 'flv',
                 'title': title,
             }
 
@@ -168,8 +183,8 @@ class GDCVaultIE(InfoExtractor):
         # Fallback to the older format
         xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
 
-        xml_decription_url = xml_root + 'xml/' + xml_name
-        xml_description = self._download_xml(xml_decription_url, display_id)
+        xml_description_url = xml_root + 'xml/' + xml_name
+        xml_description = self._download_xml(xml_description_url, display_id)
 
         video_title = xml_description.find('./metadata/title').text
         video_formats = self._parse_mp4(xml_description)
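The GDC Vault fix issues a HEAD request and reads geturl() from the response, so a redirect to the actual media file reveals the proper extension instead of hard-coding 'flv'. youtube-dl's HEADRequest is a small urllib Request subclass; with the Python 3 stdlib alone the equivalent looks roughly like this (sketch under that assumption):

import urllib.request


def resolve_final_url(url):
    # Follow redirects with a HEAD request; the response's geturl() is the
    # final location, whose path usually carries the real file extension.
    req = urllib.request.Request(url, method='HEAD')
    with urllib.request.urlopen(req) as resp:
        return resp.geturl()

# e.g. a URL that redirects to a file ending in .mp4 lets the caller pick
# the extension from the resolved URL rather than guessing it.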
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 0a6c9fe72..6ce86cbcd 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
     _VALID_URL = r'''(?x)^(?:https?://)?
                     (?:(?:(?:www\.|m\.)?soundcloud\.com/
                             (?P<uploader>[\w\d-]+)/
-                            (?!sets/|(?:likes|tracks)/?(?:$|[?#]))
+                            (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#]))
                             (?P<title>[\w\d-]+)/?
                             (?P<token>[^?]+?)?(?:[?].*)?$)
                        |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -293,60 +293,139 @@ class SoundcloudSetIE(SoundcloudIE):
 
 class SoundcloudUserIE(SoundcloudIE):
-    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:(?:www|m)\.)?soundcloud\.com/
+                            (?P<user>[^/]+)
+                            (?:/
+                                (?P<rsrc>tracks|sets|reposts|likes|spotlight)
+                            )?
+                            /?(?:[?#].*)?$
+                    '''
     IE_NAME = 'soundcloud:user'
     _TESTS = [{
-        'url': 'https://soundcloud.com/the-concept-band',
+        'url': 'https://soundcloud.com/the-akashic-chronicler',
         'info_dict': {
-            'id': '9615865',
-            'title': 'The Royal Concept',
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (All)',
         },
-        'playlist_mincount': 12
+        'playlist_mincount': 112,
     }, {
-        'url': 'https://soundcloud.com/the-concept-band/likes',
+        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
        'info_dict': {
-            'id': '9615865',
-            'title': 'The Royal Concept',
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Tracks)',
         },
-        'playlist_mincount': 1,
+        'playlist_mincount': 50,
     }, {
-        'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
-        'only_matching': True,
+        'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
+        'info_dict': {
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Playlists)',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
+        'info_dict': {
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Reposts)',
+        },
+        'playlist_mincount': 9,
+    }, {
+        'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
+        'info_dict': {
+            'id': '114582580',
+            'title': 'The Akashic Chronicler (Likes)',
+        },
+        'playlist_mincount': 333,
+    }, {
+        'url': 'https://soundcloud.com/grynpyret/spotlight',
+        'info_dict': {
+            'id': '7098329',
+            'title': 'Grynpyret (Spotlight)',
+        },
+        'playlist_mincount': 1,
     }]
 
+    _API_BASE = 'https://api.soundcloud.com'
+    _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+    _BASE_URL_MAP = {
+        'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
+        'tracks': '%s/users/%%s/tracks' % _API_BASE,
+        'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
+        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
+        'likes': '%s/users/%%s/likes' % _API_V2_BASE,
+        'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+    }
+
+    _TITLE_MAP = {
+        'all': 'All',
+        'tracks': 'Tracks',
+        'sets': 'Playlists',
+        'reposts': 'Reposts',
+        'likes': 'Likes',
+        'spotlight': 'Spotlight',
+    }
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         uploader = mobj.group('user')
-        resource = mobj.group('rsrc')
-        if resource is None:
-            resource = 'tracks'
-        elif resource == 'likes':
-            resource = 'favorites'
 
         url = 'http://soundcloud.com/%s/' % uploader
         resolv_url = self._resolv_url(url)
         user = self._download_json(
             resolv_url, uploader, 'Downloading user info')
-        base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource)
+
+        resource = mobj.group('rsrc') or 'all'
+        base_url = self._BASE_URL_MAP[resource] % user['id']
+
+        next_href = None
 
         entries = []
         for i in itertools.count():
-            data = compat_urllib_parse.urlencode({
-                'offset': i * 50,
-                'limit': 50,
-                'client_id': self._CLIENT_ID,
-            })
-            new_entries = self._download_json(
-                base_url + data, uploader, 'Downloading track page %s' % (i + 1))
-            if len(new_entries) == 0:
+            if not next_href:
+                data = compat_urllib_parse.urlencode({
+                    'offset': i * 50,
+                    'limit': 50,
+                    'client_id': self._CLIENT_ID,
+                    'linked_partitioning': '1',
+                    'representation': 'speedy',
+                })
+                next_href = base_url + '?' + data
+
+            response = self._download_json(
+                next_href, uploader, 'Downloading track page %s' % (i + 1))
+
+            collection = response['collection']
+
+            if not collection:
                 self.to_screen('%s: End page received' % uploader)
                 break
-            entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries)
+
+            def resolve_permalink_url(candidates):
+                for cand in candidates:
+                    if isinstance(cand, dict):
+                        permalink_url = cand.get('permalink_url')
+                        if permalink_url and permalink_url.startswith('http'):
+                            return permalink_url
+
+            for e in collection:
+                permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+                if permalink_url:
+                    entries.append(self.url_result(permalink_url))
+
+            if 'next_href' in response:
+                next_href = response['next_href']
+                if not next_href:
+                    break
+            else:
+                next_href = None
 
         return {
             '_type': 'playlist',
             'id': compat_str(user['id']),
-            'title': user['username'],
+            'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
             'entries': entries,
         }
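The rewritten SoundcloudUserIE pager asks for linked_partitioning responses, i.e. objects of the form {'collection': [...], 'next_href': ...}, and keeps following next_href until an empty page comes back, rebuilding an offset URL whenever no cursor is returned. The core loop reduced to a generic sketch, where fetch_json and base_url are placeholders:

import itertools


def paginate(fetch_json, base_url, page_size=50):
    # Yields items from {'collection': [...]} pages, preferring the
    # 'next_href' cursor and falling back to offset-based paging.
    next_href = None
    for i in itertools.count():
        if not next_href:
            next_href = '%s?offset=%d&limit=%d&linked_partitioning=1' % (
                base_url, i * page_size, page_size)
        response = fetch_json(next_href)
        collection = response.get('collection')
        if not collection:
            break  # empty page: done
        for item in collection:
            yield item
        next_href = response.get('next_href')  # None -> rebuild from offset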
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index e0b55078b..157bb74fe 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -22,6 +22,27 @@ class VidmeIE(InfoExtractor):
             'timestamp': 1406313244,
             'upload_date': '20140725',
             'thumbnail': 're:^https?://.*\.jpg',
+            'view_count': int,
+            'like_count': int,
+        },
+    }, {
+        # tests uploader field
+        'url': 'https://vid.me/4Iib',
+        'info_dict': {
+            'id': '4Iib',
+            'ext': 'mp4',
+            'title': 'The Carver',
+            'description': 'md5:e9c24870018ae8113be936645b93ba3c',
+            'duration': 97.859999999999999,
+            'timestamp': 1433203629,
+            'upload_date': '20150602',
+            'uploader': 'Thomas',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            'skip_download': True,
         },
     }, {
         # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
@@ -40,16 +61,23 @@ class VidmeIE(InfoExtractor):
         title = self._og_search_title(webpage)
         description = self._og_search_description(webpage, default='')
         thumbnail = self._og_search_thumbnail(webpage)
-        timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False))
-        width = int_or_none(self._og_search_property('video:width', webpage, fatal=False))
-        height = int_or_none(self._og_search_property('video:height', webpage, fatal=False))
+        timestamp = int_or_none(self._og_search_property(
+            'updated_time', webpage, fatal=False))
+        width = int_or_none(self._og_search_property(
+            'video:width', webpage, fatal=False))
+        height = int_or_none(self._og_search_property(
+            'video:height', webpage, fatal=False))
         duration = float_or_none(self._html_search_regex(
             r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
         view_count = str_to_int(self._html_search_regex(
-            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
+            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?',
+            webpage, 'view count', fatal=False))
         like_count = str_to_int(self._html_search_regex(
             r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
             webpage, 'like count', fatal=False))
+        uploader = self._html_search_regex(
+            'class="video_author_username"[^>]*>([^<]+)',
+            webpage, 'uploader', default=None)
 
         return {
             'id': video_id,
@@ -63,4 +91,5 @@ class VidmeIE(InfoExtractor):
             'duration': duration,
             'view_count': view_count,
             'like_count': like_count,
+            'uploader': uploader,
         }
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index 6ef36290b..cda02ba24 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -5,11 +5,13 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_request,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
 )
 from ..utils import (
     determine_ext,
     int_or_none,
     parse_iso8601,
+    HEADRequest,
 )
@@ -62,7 +64,6 @@ class ViewsterIE(InfoExtractor):
     }]
 
     _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
-    _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA=='
 
     def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
         request = compat_urllib_request.Request(url)
@@ -72,6 +73,10 @@ class ViewsterIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        # Get 'api_token' cookie
+        self._request_webpage(HEADRequest(url), video_id)
+        cookies = self._get_cookies(url)
+        self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value)
 
         info = self._download_json(
             'https://public-api.viewster.com/search/%s' % video_id,
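Instead of a hard-coded _AUTH_TOKEN, Viewster now performs one throwaway request so the site sets its api_token cookie, then reads that cookie back and URL-decodes it. Given a SimpleCookie as returned by a _get_cookies-style helper, the decoding step looks roughly like this (sketch; cookie name as in the diff, everything else assumed):

import urllib.parse


def auth_token_from_cookies(cookies):
    # The stored value is percent-encoded, so unquote it before use.
    morsel = cookies.get('api_token')
    return urllib.parse.unquote(morsel.value) if morsel else None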
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0e411bfb6..67a1df9a0 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -33,9 +33,11 @@ from ..utils import (
     int_or_none,
     orderedSet,
     parse_duration,
+    smuggle_url,
     str_to_int,
     unescapeHTML,
     unified_strdate,
+    unsmuggle_url,
     uppercase_escape,
     ISO3166Utils,
 )
@@ -329,6 +331,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'upload_date': '20121002',
             'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
             'categories': ['Science & Technology'],
+            'tags': ['youtube-dl'],
             'like_count': int,
             'dislike_count': int,
             'start_time': 1,
@@ -343,7 +346,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'ext': 'mp4',
             'upload_date': '20120506',
             'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
-            'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+            'description': 'md5:782e8651347686cba06e58f71ab51773',
+            'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+                     'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+                     'iconic ep', 'iconic', 'love', 'it'],
             'uploader': 'Icona Pop',
             'uploader_id': 'IconaPop',
         }
@@ -558,6 +564,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'format': '135',  # bestvideo
             }
         },
+        {
+            # Multifeed videos (multiple cameras), URL is for Main Camera
+            'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+            'info_dict': {
+                'id': 'jqWvoWXjCVs',
+                'title': 'teamPGP: Rocket League Noob Stream',
+                'description': 'md5:dc7872fb300e143831327f1bae3af010',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': 'jqWvoWXjCVs',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }, {
+                'info_dict': {
+                    'id': '6h8e8xoXJzg',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }, {
+                'info_dict': {
+                    'id': 'PUOgX5z9xZw',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }, {
+                'info_dict': {
+                    'id': 'teuwxikvS5k',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (zim)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                },
+            }],
+            'params': {
+                'skip_download': True,
+            },
+        }
     ]
 
     def __init__(self, *args, **kwargs):
@@ -889,6 +948,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         return formats
 
     def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
         proto = (
             'http' if self._downloader.params.get('prefer_insecure', False)
             else 'https')
@@ -1005,6 +1066,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     '"token" parameter not in video info for unknown reason',
                     video_id=video_id)
 
+        # title
+        if 'title' in video_info:
+            video_title = video_info['title'][0]
+        else:
+            self._downloader.report_warning('Unable to extract video title')
+            video_title = '_'
+
+        # description
+        video_description = get_element_by_id("eow-description", video_webpage)
+        if video_description:
+            video_description = re.sub(r'''(?x)
+                <a\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    title="([^"]+)"\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    class="yt-uix-redirect-link"\s*>
+                [^<]+
+                </a>
+            ''', r'\1', video_description)
+            video_description = clean_html(video_description)
+        else:
+            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
+            if fd_mobj:
+                video_description = unescapeHTML(fd_mobj.group(1))
+            else:
+                video_description = ''
+
+        if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+            if not self._downloader.params.get('noplaylist'):
+                entries = []
+                feed_ids = []
+                multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
+                for feed in multifeed_metadata_list.split(','):
+                    feed_data = compat_parse_qs(feed)
+                    entries.append({
+                        '_type': 'url_transparent',
+                        'ie_key': 'Youtube',
+                        'url': smuggle_url(
+                            '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+                            {'force_singlefeed': True}),
+                        'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+                    })
+                    feed_ids.append(feed_data['id'][0])
+                self.to_screen(
+                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+                    % (', '.join(feed_ids), video_id))
+                return self.playlist_result(entries, video_id, video_title, video_description)
+            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
         if 'view_count' in video_info:
             view_count = int(video_info['view_count'][0])
         else:
@@ -1030,13 +1140,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         else:
             self._downloader.report_warning('unable to extract uploader nickname')
 
-        # title
-        if 'title' in video_info:
-            video_title = video_info['title'][0]
-        else:
-            self._downloader.report_warning('Unable to extract video title')
-            video_title = '_'
-
         # thumbnail image
         # We try first to get a high quality image:
         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
@@ -1072,25 +1175,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         else:
             video_categories = None
 
-        # description
-        video_description = get_element_by_id("eow-description", video_webpage)
-        if video_description:
-            video_description = re.sub(r'''(?x)
-                <a\s+
-                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
-                    title="([^"]+)"\s+
-                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
-                    class="yt-uix-redirect-link"\s*>
-                [^<]+
-                </a>
-            ''', r'\1', video_description)
-            video_description = clean_html(video_description)
-        else:
-            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
-            if fd_mobj:
-                video_description = unescapeHTML(fd_mobj.group(1))
-            else:
-                video_description = ''
+        video_tags = [
+            unescapeHTML(m.group('content'))
+            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
 
         def _extract_count(count_name):
             return str_to_int(self._search_regex(
@@ -1260,6 +1347,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'thumbnail': video_thumbnail,
             'description': video_description,
             'categories': video_categories,
+            'tags': video_tags,
             'subtitles': video_subtitles,
             'automatic_captions': automatic_captions,
             'duration': video_duration,
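The multifeed handling re-queues each camera feed as a separate YouTube URL and uses smuggle_url()/unsmuggle_url() to pass a force_singlefeed hint along with it, so the re-entered extractor does not expand the feeds again. A simplified re-implementation of the smuggling idea only; youtube-dl's own version lives in utils.py and differs in the fragment name and encoding details:

import json
import urllib.parse


def smuggle_url(url, data):
    # Hide extra data in the URL fragment so it survives the round trip.
    return url + '#__smuggle=' + urllib.parse.quote(json.dumps(data))


def unsmuggle_url(smug_url, default=None):
    if '#__smuggle=' not in smug_url:
        return smug_url, default
    url, _, payload = smug_url.partition('#__smuggle=')
    return url, json.loads(urllib.parse.unquote(payload))


url, data = unsmuggle_url(
    smuggle_url('https://www.youtube.com/watch?v=jqWvoWXjCVs',
                {'force_singlefeed': True}), {})
assert data == {'force_singlefeed': True}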
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index ae813099d..78dc2b449 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -139,21 +139,24 @@ def write_json_file(obj, fn):
 
 if sys.version_info >= (2, 7):
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
         """ Find the xpath xpath[@key=val] """
         assert re.match(r'^[a-zA-Z-]+$', key)
-        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
-        expr = xpath + "[@%s='%s']" % (key, val)
+        if val:
+            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
+        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
         return node.find(expr)
 else:
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
         # .//node does not match if a node is a direct child of . !
         if isinstance(xpath, compat_str):
             xpath = xpath.encode('ascii')
 
         for f in node.findall(xpath):
-            if f.attrib.get(key) == val:
+            if key not in f.attrib:
+                continue
+            if val is None or f.attrib.get(key) == val:
                 return f
         return None
@@ -576,11 +579,9 @@ class ContentTooShortError(Exception):
     download is too small for what the server announced first, indicating
     the connection was probably interrupted.
     """
-    # Both in bytes
-    downloaded = None
-    expected = None
 
     def __init__(self, downloaded, expected):
+        # Both in bytes
         self.downloaded = downloaded
         self.expected = expected
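With val now optional, find_xpath_attr() can match on attribute presence alone, which is exactly what the new test cases in test_utils.py exercise. A small usage sketch against the patched helper:

import xml.etree.ElementTree as etree
from youtube_dl.utils import find_xpath_attr

doc = etree.fromstring('<root><node/><node x="a"/><node x="" y="c"/></root>')

find_xpath_attr(doc, './/node', 'x')       # first <node> that has an x attribute
find_xpath_attr(doc, './/node', 'x', 'a')  # first <node> with x="a"
find_xpath_attr(doc, './/node', 'x', '')   # empty attribute values match too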