aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--AUTHORS1
-rw-r--r--test/test_utils.py9
-rw-r--r--youtube_dl/compat.py6
-rw-r--r--youtube_dl/extractor/bbc.py47
-rw-r--r--youtube_dl/extractor/common.py19
-rw-r--r--youtube_dl/extractor/gdcvault.py33
-rw-r--r--youtube_dl/extractor/soundcloud.py135
-rw-r--r--youtube_dl/extractor/vidme.py37
-rw-r--r--youtube_dl/extractor/viewster.py7
-rw-r--r--youtube_dl/extractor/youtube.py142
-rw-r--r--youtube_dl/utils.py17
11 files changed, 361 insertions, 92 deletions
diff --git a/AUTHORS b/AUTHORS
index e75e9885d..aa6b88cc0 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -135,3 +135,4 @@ Bernhard Minks
sceext
Zach Bruggeman
Tjark Saul
+slangangular
diff --git a/test/test_utils.py b/test/test_utils.py
index 65692a9fb..a759b2da9 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -235,12 +235,21 @@ class TestUtil(unittest.TestCase):
<node x="a"/>
<node x="a" y="c" />
<node x="b" y="d" />
+ <node x="" />
</root>'''
doc = xml.etree.ElementTree.fromstring(testxml)
+ self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
+ self.assertEqual(find_xpath_attr(doc, './/node', 'n'), None)
+ self.assertEqual(find_xpath_attr(doc, './/node', 'n', 'v'), None)
+ self.assertEqual(find_xpath_attr(doc, './/node', 'x'), doc[1])
self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
+ self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'b'), doc[3])
+ self.assertEqual(find_xpath_attr(doc, './/node', 'y'), doc[2])
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
+ self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'd'), doc[3])
+ self.assertEqual(find_xpath_attr(doc, './/node', 'x', ''), doc[4])
def test_xpath_with_ns(self):
testxml = '''<root xmlns:media="http://example.com/">
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 0c57c7aeb..e4b9286c0 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -43,6 +43,11 @@ except ImportError: # Python 2
import cookielib as compat_cookiejar
try:
+ import http.cookies as compat_cookies
+except ImportError: # Python 2
+ import Cookie as compat_cookies
+
+try:
import html.entities as compat_html_entities
except ImportError: # Python 2
import htmlentitydefs as compat_html_entities
@@ -436,6 +441,7 @@ __all__ = [
'compat_basestring',
'compat_chr',
'compat_cookiejar',
+ 'compat_cookies',
'compat_expanduser',
'compat_get_terminal_size',
'compat_getenv',
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 01d07c9c0..9a1b6e3dc 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -20,7 +20,9 @@ class BBCCoUkIE(InfoExtractor):
IE_DESC = 'BBC iPlayer'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
- _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s'
+ _MEDIASELECTOR_URLS = [
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+ ]
_TESTS = [
{
@@ -162,6 +164,10 @@ class BBCCoUkIE(InfoExtractor):
}
]
+ class MediaSelectionError(Exception):
+ def __init__(self, id):
+ self.id = id
+
def _extract_asx_playlist(self, connection, programme_id):
asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
return [ref.get('href') for ref in asx.findall('./Entry/ref')]
@@ -212,8 +218,7 @@ class BBCCoUkIE(InfoExtractor):
def _extract_medias(self, media_selection):
error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
if error is not None:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
+ raise BBCCoUkIE.MediaSelectionError(error.get('id'))
return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
def _extract_connections(self, media):
@@ -270,9 +275,23 @@ class BBCCoUkIE(InfoExtractor):
]
return subtitles
+ def _raise_extractor_error(self, media_selection_error):
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+ expected=True)
+
def _download_media_selector(self, programme_id):
- return self._download_media_selector_url(
- self._MEDIASELECTOR_URL % programme_id, programme_id)
+ last_exception = None
+ for mediaselector_url in self._MEDIASELECTOR_URLS:
+ try:
+ return self._download_media_selector_url(
+ mediaselector_url % programme_id, programme_id)
+ except BBCCoUkIE.MediaSelectionError as e:
+ if e.id == 'notukerror':
+ last_exception = e
+ continue
+ self._raise_extractor_error(e)
+ self._raise_extractor_error(last_exception)
def _download_media_selector_url(self, url, programme_id=None):
try:
@@ -297,7 +316,6 @@ class BBCCoUkIE(InfoExtractor):
formats.extend(self._extract_video(media, programme_id))
elif kind == 'captions':
subtitles = self.extract_subtitles(media, programme_id)
-
return formats, subtitles
def _download_playlist(self, playlist_id):
@@ -426,9 +444,14 @@ class BBCIE(BBCCoUkIE):
IE_DESC = 'BBC'
_VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
- # fails with notukerror for some videos
- # _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s'
- _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s'
+ _MEDIASELECTOR_URLS = [
+ # Provides more formats, namely direct mp4 links, but fails on some videos with
+ # notukerror for non UK (?) users (e.g.
+ # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+ 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+ # Provides fewer formats, but works everywhere for everybody (hopefully)
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+ ]
_TESTS = [{
# article with multiple videos embedded with data-media-meta containing
@@ -463,7 +486,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/news/world-europe-32041533',
'info_dict': {
'id': 'p02mprgb',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
'duration': 47,
'timestamp': 1427219242,
@@ -523,7 +546,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
'info_dict': {
'id': 'p018zqqg',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Hyundai Santa Fe Sport: Rock star',
'description': 'md5:b042a26142c4154a6e472933cf20793d',
'timestamp': 1368473503,
@@ -538,7 +561,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
'description': 'md5:398fca0e2e701c609d726e034fa1fc89',
'duration': 140,
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 14b9b4fe2..dc5080504 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,10 +14,12 @@ import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
+ compat_cookies,
compat_HTTPError,
compat_http_client,
compat_urllib_error,
compat_urllib_parse_urlparse,
+ compat_urllib_request,
compat_urlparse,
compat_str,
)
@@ -181,6 +183,7 @@ class InfoExtractor(object):
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
+ tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
start_time: Time in seconds where the reproduction should start, as
@@ -630,6 +633,12 @@ class InfoExtractor(object):
template % (content_re, property_re),
]
+ @staticmethod
+ def _meta_regex(prop):
+ return r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
@@ -660,9 +669,7 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ self._meta_regex(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -1069,6 +1076,12 @@ class InfoExtractor(object):
None, '/', True, False, expire_time, '', None, None, None)
self._downloader.cookiejar.set_cookie(cookie)
+ def _get_cookies(self, url):
+ """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+ req = compat_urllib_request.Request(url)
+ self._downloader.cookiejar.add_cookie_header(req)
+ return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
def get_testcases(self, include_onlymatching=False):
t = getattr(self, '_TEST', None)
if t:
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 43f916412..a6834db43 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -7,7 +7,10 @@ from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
-from ..utils import remove_end
+from ..utils import (
+ remove_end,
+ HEADRequest,
+)
class GDCVaultIE(InfoExtractor):
@@ -73,10 +76,20 @@ class GDCVaultIE(InfoExtractor):
return video_formats
def _parse_flv(self, xml_description):
- video_formats = []
+ formats = []
akamai_url = xml_description.find('./metadata/akamaiHost').text
+ audios = xml_description.find('./metadata/audios')
+ if audios is not None:
+ for audio in audios:
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(audio.get('url'), '.flv'),
+ 'ext': 'flv',
+ 'vcodec': 'none',
+ 'format_id': audio.get('code'),
+ })
slide_video_path = xml_description.find('./metadata/slideVideo').text
- video_formats.append({
+ formats.append({
'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
'play_path': remove_end(slide_video_path, '.flv'),
'ext': 'flv',
@@ -86,7 +99,7 @@ class GDCVaultIE(InfoExtractor):
'format_id': 'slides',
})
speaker_video_path = xml_description.find('./metadata/speakerVideo').text
- video_formats.append({
+ formats.append({
'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
'play_path': remove_end(speaker_video_path, '.flv'),
'ext': 'flv',
@@ -95,7 +108,7 @@ class GDCVaultIE(InfoExtractor):
'preference': -1,
'format_id': 'speaker',
})
- return video_formats
+ return formats
def _login(self, webpage_url, display_id):
(username, password) = self._get_login_info()
@@ -133,16 +146,18 @@ class GDCVaultIE(InfoExtractor):
r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
start_page, 'url', default=None)
if direct_url:
- video_url = 'http://www.gdcvault.com/' + direct_url
title = self._html_search_regex(
r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>',
start_page, 'title')
+ video_url = 'http://www.gdcvault.com' + direct_url
+ # resolve the url so that we can detect the correct extension
+ head = self._request_webpage(HEADRequest(video_url), video_id)
+ video_url = head.geturl()
return {
'id': video_id,
'display_id': display_id,
'url': video_url,
- 'ext': 'flv',
'title': title,
}
@@ -168,8 +183,8 @@ class GDCVaultIE(InfoExtractor):
# Fallback to the older format
xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
- xml_decription_url = xml_root + 'xml/' + xml_name
- xml_description = self._download_xml(xml_decription_url, display_id)
+ xml_description_url = xml_root + 'xml/' + xml_name
+ xml_description = self._download_xml(xml_description_url, display_id)
video_title = xml_description.find('./metadata/title').text
video_formats = self._parse_mp4(xml_description)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 0a6c9fe72..6ce86cbcd 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
_VALID_URL = r'''(?x)^(?:https?://)?
(?:(?:(?:www\.|m\.)?soundcloud\.com/
(?P<uploader>[\w\d-]+)/
- (?!sets/|(?:likes|tracks)/?(?:$|[?#]))
+ (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -293,60 +293,139 @@ class SoundcloudSetIE(SoundcloudIE):
class SoundcloudUserIE(SoundcloudIE):
- _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:(?:www|m)\.)?soundcloud\.com/
+ (?P<user>[^/]+)
+ (?:/
+ (?P<rsrc>tracks|sets|reposts|likes|spotlight)
+ )?
+ /?(?:[?#].*)?$
+ '''
IE_NAME = 'soundcloud:user'
_TESTS = [{
- 'url': 'https://soundcloud.com/the-concept-band',
+ 'url': 'https://soundcloud.com/the-akashic-chronicler',
'info_dict': {
- 'id': '9615865',
- 'title': 'The Royal Concept',
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (All)',
},
- 'playlist_mincount': 12
+ 'playlist_mincount': 112,
}, {
- 'url': 'https://soundcloud.com/the-concept-band/likes',
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
'info_dict': {
- 'id': '9615865',
- 'title': 'The Royal Concept',
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Tracks)',
},
- 'playlist_mincount': 1,
+ 'playlist_mincount': 50,
}, {
- 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
- 'only_matching': True,
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Playlists)',
+ },
+ 'playlist_mincount': 3,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Reposts)',
+ },
+ 'playlist_mincount': 9,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Likes)',
+ },
+ 'playlist_mincount': 333,
+ }, {
+ 'url': 'https://soundcloud.com/grynpyret/spotlight',
+ 'info_dict': {
+ 'id': '7098329',
+ 'title': 'Grynpyret (Spotlight)',
+ },
+ 'playlist_mincount': 1,
}]
+ _API_BASE = 'https://api.soundcloud.com'
+ _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+ _BASE_URL_MAP = {
+ 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
+ 'tracks': '%s/users/%%s/tracks' % _API_BASE,
+ 'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
+ 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
+ 'likes': '%s/users/%%s/likes' % _API_V2_BASE,
+ 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+ }
+
+ _TITLE_MAP = {
+ 'all': 'All',
+ 'tracks': 'Tracks',
+ 'sets': 'Playlists',
+ 'reposts': 'Reposts',
+ 'likes': 'Likes',
+ 'spotlight': 'Spotlight',
+ }
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group('user')
- resource = mobj.group('rsrc')
- if resource is None:
- resource = 'tracks'
- elif resource == 'likes':
- resource = 'favorites'
url = 'http://soundcloud.com/%s/' % uploader
resolv_url = self._resolv_url(url)
user = self._download_json(
resolv_url, uploader, 'Downloading user info')
- base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource)
+
+ resource = mobj.group('rsrc') or 'all'
+ base_url = self._BASE_URL_MAP[resource] % user['id']
+
+ next_href = None
entries = []
for i in itertools.count():
- data = compat_urllib_parse.urlencode({
- 'offset': i * 50,
- 'limit': 50,
- 'client_id': self._CLIENT_ID,
- })
- new_entries = self._download_json(
- base_url + data, uploader, 'Downloading track page %s' % (i + 1))
- if len(new_entries) == 0:
+ if not next_href:
+ data = compat_urllib_parse.urlencode({
+ 'offset': i * 50,
+ 'limit': 50,
+ 'client_id': self._CLIENT_ID,
+ 'linked_partitioning': '1',
+ 'representation': 'speedy',
+ })
+ next_href = base_url + '?' + data
+
+ response = self._download_json(
+ next_href, uploader, 'Downloading track page %s' % (i + 1))
+
+ collection = response['collection']
+
+ if not collection:
self.to_screen('%s: End page received' % uploader)
break
- entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries)
+
+ def resolve_permalink_url(candidates):
+ for cand in candidates:
+ if isinstance(cand, dict):
+ permalink_url = cand.get('permalink_url')
+ if permalink_url and permalink_url.startswith('http'):
+ return permalink_url
+
+ for e in collection:
+ permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+ if permalink_url:
+ entries.append(self.url_result(permalink_url))
+
+ if 'next_href' in response:
+ next_href = response['next_href']
+ if not next_href:
+ break
+ else:
+ next_href = None
return {
'_type': 'playlist',
'id': compat_str(user['id']),
- 'title': user['username'],
+ 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
'entries': entries,
}
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index e0b55078b..157bb74fe 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -22,6 +22,27 @@ class VidmeIE(InfoExtractor):
'timestamp': 1406313244,
'upload_date': '20140725',
'thumbnail': 're:^https?://.*\.jpg',
+ 'view_count': int,
+ 'like_count': int,
+ },
+ }, {
+ # tests uploader field
+ 'url': 'https://vid.me/4Iib',
+ 'info_dict': {
+ 'id': '4Iib',
+ 'ext': 'mp4',
+ 'title': 'The Carver',
+ 'description': 'md5:e9c24870018ae8113be936645b93ba3c',
+ 'duration': 97.859999999999999,
+ 'timestamp': 1433203629,
+ 'upload_date': '20150602',
+ 'uploader': 'Thomas',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
},
}, {
# From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
@@ -40,16 +61,23 @@ class VidmeIE(InfoExtractor):
title = self._og_search_title(webpage)
description = self._og_search_description(webpage, default='')
thumbnail = self._og_search_thumbnail(webpage)
- timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False))
- width = int_or_none(self._og_search_property('video:width', webpage, fatal=False))
- height = int_or_none(self._og_search_property('video:height', webpage, fatal=False))
+ timestamp = int_or_none(self._og_search_property(
+ 'updated_time', webpage, fatal=False))
+ width = int_or_none(self._og_search_property(
+ 'video:width', webpage, fatal=False))
+ height = int_or_none(self._og_search_property(
+ 'video:height', webpage, fatal=False))
duration = float_or_none(self._html_search_regex(
r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
view_count = str_to_int(self._html_search_regex(
- r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
+ r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?',
+ webpage, 'view count', fatal=False))
like_count = str_to_int(self._html_search_regex(
r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
webpage, 'like count', fatal=False))
+ uploader = self._html_search_regex(
+ 'class="video_author_username"[^>]*>([^<]+)',
+ webpage, 'uploader', default=None)
return {
'id': video_id,
@@ -63,4 +91,5 @@ class VidmeIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'like_count': like_count,
+ 'uploader': uploader,
}
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index 6ef36290b..cda02ba24 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -5,11 +5,13 @@ from .common import InfoExtractor
from ..compat import (
compat_urllib_request,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
)
from ..utils import (
determine_ext,
int_or_none,
parse_iso8601,
+ HEADRequest,
)
@@ -62,7 +64,6 @@ class ViewsterIE(InfoExtractor):
}]
_ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
- _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA=='
def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
request = compat_urllib_request.Request(url)
@@ -72,6 +73,10 @@ class ViewsterIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ # Get 'api_token' cookie
+ self._request_webpage(HEADRequest(url), video_id)
+ cookies = self._get_cookies(url)
+ self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value)
info = self._download_json(
'https://public-api.viewster.com/search/%s' % video_id,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0e411bfb6..67a1df9a0 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -33,9 +33,11 @@ from ..utils import (
int_or_none,
orderedSet,
parse_duration,
+ smuggle_url,
str_to_int,
unescapeHTML,
unified_strdate,
+ unsmuggle_url,
uppercase_escape,
ISO3166Utils,
)
@@ -329,6 +331,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20121002',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
'like_count': int,
'dislike_count': int,
'start_time': 1,
@@ -343,7 +346,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
- 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+ 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+ 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+ 'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
}
@@ -558,6 +564,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'format': '135', # bestvideo
}
},
+ {
+ # Multifeed videos (multiple cameras), URL is for Main Camera
+ 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'title': 'teamPGP: Rocket League Noob Stream',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '6h8e8xoXJzg',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'PUOgX5z9xZw',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'teuwxikvS5k',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (zim)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
+ }
]
def __init__(self, *args, **kwargs):
@@ -889,6 +948,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return formats
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
proto = (
'http' if self._downloader.params.get('prefer_insecure', False)
else 'https')
@@ -1005,6 +1066,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'"token" parameter not in video info for unknown reason',
video_id=video_id)
+ # title
+ if 'title' in video_info:
+ video_title = video_info['title'][0]
+ else:
+ self._downloader.report_warning('Unable to extract video title')
+ video_title = '_'
+
+ # description
+ video_description = get_element_by_id("eow-description", video_webpage)
+ if video_description:
+ video_description = re.sub(r'''(?x)
+ <a\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ title="([^"]+)"\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ class="yt-uix-redirect-link"\s*>
+ [^<]+
+ </a>
+ ''', r'\1', video_description)
+ video_description = clean_html(video_description)
+ else:
+ fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
+ if fd_mobj:
+ video_description = unescapeHTML(fd_mobj.group(1))
+ else:
+ video_description = ''
+
+ if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+ if not self._downloader.params.get('noplaylist'):
+ entries = []
+ feed_ids = []
+ multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
+ for feed in multifeed_metadata_list.split(','):
+ feed_data = compat_parse_qs(feed)
+ entries.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ 'url': smuggle_url(
+ '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+ {'force_singlefeed': True}),
+ 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+ })
+ feed_ids.append(feed_data['id'][0])
+ self.to_screen(
+ 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+ % (', '.join(feed_ids), video_id))
+ return self.playlist_result(entries, video_id, video_title, video_description)
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
if 'view_count' in video_info:
view_count = int(video_info['view_count'][0])
else:
@@ -1030,13 +1140,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
self._downloader.report_warning('unable to extract uploader nickname')
- # title
- if 'title' in video_info:
- video_title = video_info['title'][0]
- else:
- self._downloader.report_warning('Unable to extract video title')
- video_title = '_'
-
# thumbnail image
# We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
@@ -1072,25 +1175,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
video_categories = None
- # description
- video_description = get_element_by_id("eow-description", video_webpage)
- if video_description:
- video_description = re.sub(r'''(?x)
- <a\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- title="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- class="yt-uix-redirect-link"\s*>
- [^<]+
- </a>
- ''', r'\1', video_description)
- video_description = clean_html(video_description)
- else:
- fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
- if fd_mobj:
- video_description = unescapeHTML(fd_mobj.group(1))
- else:
- video_description = ''
+ video_tags = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
def _extract_count(count_name):
return str_to_int(self._search_regex(
@@ -1260,6 +1347,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'thumbnail': video_thumbnail,
'description': video_description,
'categories': video_categories,
+ 'tags': video_tags,
'subtitles': video_subtitles,
'automatic_captions': automatic_captions,
'duration': video_duration,
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index ae813099d..78dc2b449 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -139,21 +139,24 @@ def write_json_file(obj, fn):
if sys.version_info >= (2, 7):
- def find_xpath_attr(node, xpath, key, val):
+ def find_xpath_attr(node, xpath, key, val=None):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z-]+$', key)
- assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
- expr = xpath + "[@%s='%s']" % (key, val)
+ if val:
+ assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
+ expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
return node.find(expr)
else:
- def find_xpath_attr(node, xpath, key, val):
+ def find_xpath_attr(node, xpath, key, val=None):
# Here comes the crazy part: In 2.6, if the xpath is a unicode,
# .//node does not match if a node is a direct child of . !
if isinstance(xpath, compat_str):
xpath = xpath.encode('ascii')
for f in node.findall(xpath):
- if f.attrib.get(key) == val:
+ if key not in f.attrib:
+ continue
+ if val is None or f.attrib.get(key) == val:
return f
return None
@@ -576,11 +579,9 @@ class ContentTooShortError(Exception):
download is too small for what the server announced first, indicating
the connection was probably interrupted.
"""
- # Both in bytes
- downloaded = None
- expected = None
def __init__(self, downloaded, expected):
+ # Both in bytes
self.downloaded = downloaded
self.expected = expected