aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/extractor/__init__.py2
-rw-r--r--youtube_dl/extractor/canalc2.py43
-rw-r--r--youtube_dl/extractor/common.py3
-rw-r--r--youtube_dl/extractor/crunchyroll.py76
-rw-r--r--youtube_dl/extractor/dailymotion.py17
-rw-r--r--youtube_dl/extractor/eagleplatform.py2
-rw-r--r--youtube_dl/extractor/imdb.py29
-rw-r--r--youtube_dl/extractor/lynda.py15
-rw-r--r--youtube_dl/extractor/rte.py12
-rw-r--r--youtube_dl/extractor/twitch.py34
-rw-r--r--youtube_dl/extractor/twitter.py122
-rw-r--r--youtube_dl/extractor/vidme.py10
-rw-r--r--youtube_dl/extractor/viewster.py7
-rw-r--r--youtube_dl/extractor/vimeo.py12
-rw-r--r--youtube_dl/extractor/vine.py66
-rw-r--r--youtube_dl/extractor/youtube.py121
16 files changed, 388 insertions, 183 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 462717b1e..bd6eb6ae0 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -690,7 +690,7 @@ from .twitch import (
TwitchBookmarksIE,
TwitchStreamIE,
)
-from .twitter import TwitterCardIE
+from .twitter import TwitterCardIE, TwitterIE
from .ubu import UbuIE
from .udemy import (
UdemyIE,
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index c4fefefe4..f6a1ff381 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -4,38 +4,53 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import parse_duration
class Canalc2IE(InfoExtractor):
IE_NAME = 'canalc2.tv'
- _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+ 'url': 'http://www.canalc2.tv/video/12163',
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
- 'ext': 'mp4',
- 'title': 'Terrasses du Numérique'
+ 'ext': 'flv',
+ 'title': 'Terrasses du Numérique',
+ 'duration': 122,
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
}
}
def _real_extract(self, url):
- video_id = re.match(self._VALID_URL, url).group('id')
- # We need to set the voir field for getting the file name
- url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- file_name = self._search_regex(
- r"so\.addVariable\('file','(.*?)'\);",
- webpage, 'file name')
- video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
+ video_url = self._search_regex(
+ r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P<file>.+?)\2',
+ webpage, 'video_url', group='file')
+ formats = [{'url': video_url}]
+ if video_url.startswith('rtmp://'):
+ rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
+ formats[0].update({
+ 'url': rtmp.group('url'),
+ 'ext': 'flv',
+ 'app': rtmp.group('app'),
+ 'play_path': rtmp.group('play_path'),
+ 'page_url': url,
+ })
title = self._html_search_regex(
- r'class="evenement8">(.*?)</a>', webpage, 'title')
+ r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title')
+ duration = parse_duration(self._search_regex(
+ r'id=["\']video_duree["\'][^>]*>([^<]+)',
+ webpage, 'duration', fatal=False))
return {
'id': video_id,
- 'ext': 'mp4',
- 'url': video_url,
'title': title,
+ 'duration': duration,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index a0c4af92f..6169fbbeb 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -172,6 +172,7 @@ class InfoExtractor(object):
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
+ repost_count: Number of reposts of the video
average_rating: Average rating give by users, the scale used depends on the webpage
comment_count: Number of comments on the video
comments: A list of comments, each with one or more of the following
@@ -645,7 +646,7 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))'
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
% {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 95952bc29..cecd0c784 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -32,6 +32,26 @@ from ..aes import (
class CrunchyrollBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'crunchyroll'
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ login_url = 'https://www.crunchyroll.com/?a=formhandler'
+ data = urlencode_postdata({
+ 'formname': 'RpcApiUser_Login',
+ 'name': username,
+ 'password': password,
+ })
+ login_request = compat_urllib_request.Request(login_url, data)
+ login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ self._download_webpage(login_request, None, False, 'Wrong login info')
+
+ def _real_initialize(self):
+ self._login()
+
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
else compat_urllib_request.Request(url_or_request))
@@ -46,10 +66,22 @@ class CrunchyrollBaseIE(InfoExtractor):
return super(CrunchyrollBaseIE, self)._download_webpage(
request, video_id, note, errnote, fatal, tries, timeout, encoding)
+ @staticmethod
+ def _add_skip_wall(url):
+ parsed_url = compat_urlparse.urlparse(url)
+ qs = compat_urlparse.parse_qs(parsed_url.query)
+ # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message:
+ # > This content may be inappropriate for some people.
+ # > Are you sure you want to continue?
+ # since it's not disabled by default in crunchyroll account's settings.
+ # See https://github.com/rg3/youtube-dl/issues/7202.
+ qs['skip_wall'] = ['1']
+ return compat_urlparse.urlunparse(
+ parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
class CrunchyrollIE(CrunchyrollBaseIE):
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
- _NETRC_MACHINE = 'crunchyroll'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
'info_dict': {
@@ -81,10 +113,13 @@ class CrunchyrollIE(CrunchyrollBaseIE):
# rtmp
'skip_download': True,
},
-
}, {
'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
'only_matching': True,
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium available
+ 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617',
+ 'only_matching': True,
}]
_FORMAT_IDS = {
@@ -94,24 +129,6 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'1080': ('80', '108'),
}
- def _login(self):
- (username, password) = self._get_login_info()
- if username is None:
- return
- self.report_login()
- login_url = 'https://www.crunchyroll.com/?a=formhandler'
- data = urlencode_postdata({
- 'formname': 'RpcApiUser_Login',
- 'name': username,
- 'password': password,
- })
- login_request = compat_urllib_request.Request(login_url, data)
- login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- self._download_webpage(login_request, None, False, 'Wrong login info')
-
- def _real_initialize(self):
- self._login()
-
def _decrypt_subtitles(self, data, iv, id):
data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
@@ -254,7 +271,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
else:
webpage_url = 'http://www.' + mobj.group('url')
- webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
+ webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage')
note_m = self._html_search_regex(
r'<div class="showmedia-trailer-notice">(.+?)</div>',
webpage, 'trailer-notice', default='')
@@ -352,7 +369,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
IE_NAME = "crunchyroll:playlist"
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
@@ -361,12 +378,25 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
},
'playlist_count': 13,
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium available
+ 'url': 'http://www.crunchyroll.com/cosplay-complex-ova',
+ 'info_dict': {
+ 'id': 'cosplay-complex-ova',
+ 'title': 'Cosplay Complex OVA'
+ },
+ 'playlist_count': 3,
+ 'skip': 'Georestricted',
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14
+ 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
+ 'only_matching': True,
}]
def _real_extract(self, url):
show_id = self._match_id(url)
- webpage = self._download_webpage(url, show_id)
+ webpage = self._download_webpage(self._add_skip_wall(url), show_id)
title = self._html_search_regex(
r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
webpage, 'title')
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 80a05cfee..9cd9ff17d 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -96,6 +96,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'uploader': 'HotWaves1012',
'age_limit': 18,
}
+ },
+ # geo-restricted, player v5
+ {
+ 'url': 'http://www.dailymotion.com/video/xhza0o',
+ 'only_matching': True,
}
]
@@ -124,6 +129,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if player_v5:
player = self._parse_json(player_v5, video_id)
metadata = player['metadata']
+
+ self._check_error(metadata)
+
formats = []
for quality, media_list in metadata['qualities'].items():
for media in media_list:
@@ -201,9 +209,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'video info', flags=re.MULTILINE),
video_id)
- if info.get('error') is not None:
- msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
- raise ExtractorError(msg, expected=True)
+ self._check_error(info)
formats = []
for (key, format_id) in self._FORMATS:
@@ -246,6 +252,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'duration': info['duration']
}
+ def _check_error(self, info):
+ if info.get('error') is not None:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True)
+
def _get_subtitles(self, video_id, webpage):
try:
sub_list = self._download_webpage(
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
index e529b9b96..7bbf617d4 100644
--- a/youtube_dl/extractor/eagleplatform.py
+++ b/youtube_dl/extractor/eagleplatform.py
@@ -87,7 +87,7 @@ class EaglePlatformIE(InfoExtractor):
m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
formats = self._extract_m3u8_formats(
m3u8_url, video_id,
- 'mp4', entry_protocol='m3u8_native')
+ 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
mp4_url = self._get_video_url(
# Secure mp4 URL is constructed according to Player.prototype.mp4 from
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 4bb574cf3..02e1e428e 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -4,8 +4,8 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urlparse,
+from ..utils import (
+ qualities,
)
@@ -30,24 +30,33 @@ class ImdbIE(InfoExtractor):
descr = self._html_search_regex(
r'(?s)<span itemprop="description">(.*?)</span>',
webpage, 'description', fatal=False)
- available_formats = re.findall(
- r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
- flags=re.MULTILINE)
+ player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
+ player_page = self._download_webpage(
+ player_url, video_id, 'Downloading player page')
+ # the player page contains the info for the default format, we have to
+ # fetch other pages for the rest of the formats
+ extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
+ format_pages = [
+ self._download_webpage(
+ f_url, video_id, 'Downloading info for %s format' % f_name)
+ for f_url, f_name in extra_formats]
+ format_pages.append(player_page)
+
+ quality = qualities(['SD', '480p', '720p'])
formats = []
- for f_id, f_path in available_formats:
- f_path = f_path.strip()
- format_page = self._download_webpage(
- compat_urlparse.urljoin(url, f_path),
- 'Downloading info for %s format' % f_id)
+ for format_page in format_pages:
json_data = self._search_regex(
r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
format_page, 'json data', flags=re.DOTALL)
info = json.loads(json_data)
format_info = info['videoPlayerObject']['video']
+ f_id = format_info['ffname']
formats.append({
'format_id': f_id,
'url': format_info['videoInfoList'][0]['videoUrl'],
+ 'quality': quality(f_id),
})
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index 378117270..5c973e75c 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -140,13 +140,14 @@ class LyndaIE(LyndaBaseIE):
prioritized_streams = video_json.get('PrioritizedStreams')
if prioritized_streams:
- formats.extend([
- {
- 'url': video_url,
- 'width': int_or_none(format_id),
- 'format_id': format_id,
- } for format_id, video_url in prioritized_streams['0'].items()
- ])
+ for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
+ formats.extend([
+ {
+ 'url': video_url,
+ 'width': int_or_none(format_id),
+ 'format_id': '%s-%s' % (prioritized_stream_id, format_id),
+ } for format_id, video_url in prioritized_stream.items()
+ ])
self._check_formats(formats, video_id)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
index 427c70866..d9cfbf180 100644
--- a/youtube_dl/extractor/rte.py
+++ b/youtube_dl/extractor/rte.py
@@ -9,16 +9,16 @@ from ..utils import (
class RteIE(InfoExtractor):
- _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://www.rte.ie/player/de/show/10363114/',
+ 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/',
'info_dict': {
- 'id': '10363114',
+ 'id': '10478715',
'ext': 'mp4',
- 'title': 'One News',
+ 'title': 'Watch iWitness online',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'The One O\'Clock News followed by Weather.',
- 'duration': 436.844,
+ 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.',
+ 'duration': 60.046,
},
'params': {
'skip_download': 'f4m fails with --test atm'
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 023911c41..3ec08b674 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -15,6 +15,7 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
int_or_none,
parse_duration,
@@ -27,8 +28,7 @@ class TwitchBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitch.tv'
_USHER_BASE = 'http://usher.twitch.tv'
- _LOGIN_URL = 'https://secure.twitch.tv/login'
- _LOGIN_POST_URL = 'https://passport.twitch.tv/authentications/new'
+ _LOGIN_URL = 'http://www.twitch.tv/login'
_NETRC_MACHINE = 'twitch'
def _handle_error(self, response):
@@ -61,26 +61,28 @@ class TwitchBaseIE(InfoExtractor):
if username is None:
return
- login_page = self._download_webpage(
+ login_page, handle = self._download_webpage_handle(
self._LOGIN_URL, None, 'Downloading login page')
login_form = self._hidden_inputs(login_page)
login_form.update({
- 'login': username.encode('utf-8'),
- 'password': password.encode('utf-8'),
+ 'username': username,
+ 'password': password,
})
+ redirect_url = handle.geturl()
+
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
- 'post url', default=self._LOGIN_POST_URL, group='url')
+ 'post url', default=redirect_url, group='url')
if not post_url.startswith('http'):
- post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+ post_url = compat_urlparse.urljoin(redirect_url, post_url)
request = compat_urllib_request.Request(
- post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- request.add_header('Referer', self._LOGIN_URL)
+ post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8'))
+ request.add_header('Referer', redirect_url)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
@@ -238,14 +240,24 @@ class TwitchVodIE(TwitchItemBaseIE):
def _real_extract(self, url):
item_id = self._match_id(url)
+
info = self._download_info(self._ITEM_SHORTCUT, item_id)
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % self._ITEM_TYPE)
+
formats = self._extract_m3u8_formats(
- '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true'
- % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
+ '%s/vod/%s?%s' % (
+ self._USHER_BASE, item_id,
+ compat_urllib_parse.urlencode({
+ 'allow_source': 'true',
+ 'allow_spectre': 'true',
+ 'player': 'twitchweb',
+ 'nauth': access_token['token'],
+ 'nauthsig': access_token['sig'],
+ })),
item_id, 'mp4')
+
self._prefer_source(formats)
info['formats'] = formats
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index 1aaa06305..9d3e46b94 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -6,23 +7,51 @@ from .common import InfoExtractor
from ..compat import compat_urllib_request
from ..utils import (
float_or_none,
- unescapeHTML,
+ xpath_text,
+ remove_end,
)
class TwitterCardIE(InfoExtractor):
+ IE_NAME = 'twitter:card'
_VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
- _TEST = {
- 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
- 'md5': 'a74f50b310c83170319ba16de6955192',
- 'info_dict': {
- 'id': '560070183650213889',
- 'ext': 'mp4',
- 'title': 'TwitterCard',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'duration': 30.033,
+ _TESTS = [
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+ 'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4',
+ 'info_dict': {
+ 'id': '560070183650213889',
+ 'ext': 'mp4',
+ 'title': 'TwitterCard',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 30.033,
+ }
},
- }
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
+ 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
+ 'info_dict': {
+ 'id': '623160978427936768',
+ 'ext': 'mp4',
+ 'title': 'TwitterCard',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 80.155,
+ },
+ },
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
+ 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
+ 'info_dict': {
+ 'id': 'dq4Oj5quskI',
+ 'ext': 'mp4',
+ 'title': 'Ubuntu 11.10 Overview',
+ 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
+ 'upload_date': '20111013',
+ 'uploader': 'OMG! Ubuntu!',
+ 'uploader_id': 'omgubuntu',
+ },
+ }
+ ]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -40,10 +69,24 @@ class TwitterCardIE(InfoExtractor):
request.add_header('User-Agent', user_agent)
webpage = self._download_webpage(request, video_id)
- config = self._parse_json(
- unescapeHTML(self._search_regex(
- r'data-player-config="([^"]+)"', webpage, 'data player config')),
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+ webpage, 'youtube iframe', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube')
+
+ config = self._parse_json(self._html_search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'data player config'),
video_id)
+ if 'playlist' not in config:
+ if 'vmapUrl' in config:
+ vmap_data = self._download_xml(config['vmapUrl'], video_id)
+ video_url = xpath_text(vmap_data, './/MediaFile').strip()
+ formats.append({
+ 'url': video_url,
+ })
+ break # same video regardless of UA
+ continue
video_url = config['playlist'][0]['source']
@@ -70,3 +113,54 @@ class TwitterCardIE(InfoExtractor):
'duration': duration,
'formats': formats,
}
+
+
+class TwitterIE(InfoExtractor):
+ IE_NAME = 'twitter'
+ _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
+ _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'
+
+ _TEST = {
+ 'url': 'https://twitter.com/freethenipple/status/643211948184596480',
+ 'md5': '31cd83a116fc41f99ae3d909d4caf6a0',
+ 'info_dict': {
+ 'id': '643211948184596480',
+ 'ext': 'mp4',
+ 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 12.922,
+ 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
+ 'uploader': 'FREE THE NIPPLE',
+ 'uploader_id': 'freethenipple',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ user_id = mobj.group('user_id')
+ twid = mobj.group('id')
+
+ webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)
+
+ username = remove_end(self._og_search_title(webpage), ' on Twitter')
+
+ title = self._og_search_description(webpage).strip('').replace('\n', ' ')
+
+ # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
+ mobj = re.match(r'“(.*)\s+(https?://[^ ]+)”', title)
+ title, short_url = mobj.groups()
+
+ card_id = self._search_regex(
+ r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url')
+ card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'TwitterCard',
+ 'uploader_id': user_id,
+ 'uploader': username,
+ 'url': card_url,
+ 'webpage_url': url,
+ 'description': '%s on Twitter: "%s %s"' % (username, title, short_url),
+ 'title': username + ' - ' + title,
+ }
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index 078d283b2..382517a4a 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -93,6 +93,10 @@ class VidmeIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # nsfw, user-disabled
+ 'url': 'https://vid.me/dzGJ',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -114,6 +118,12 @@ class VidmeIE(InfoExtractor):
video = response['video']
+ if video.get('state') == 'user-disabled':
+ raise ExtractorError(
+ 'Vidme said: This video has been suspended either due to a copyright claim, '
+ 'or for violating the terms of use.',
+ expected=True)
+
formats = [{
'format_id': f.get('type'),
'url': f['uri'],
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index 632e57fb4..7cf930d69 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -131,10 +131,11 @@ class ViewsterIE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id='hds'))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
+ m3u8_formats = self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id='hls',
- fatal=False # m3u8 sometimes fail
- ))
+ fatal=False) # m3u8 sometimes fail
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
else:
format_id = media.get('Bitrate')
f = {
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index fa1b22049..0f84656c0 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -286,7 +286,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
try:
try:
config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage, 'config URL')
+ r' data-config-url="(.+?)"', webpage,
+ 'config URL', default=None)
+ if not config_url:
+ # Sometimes new react-based page is served instead of old one that require
+ # different config URL extraction approach (see
+ # https://github.com/rg3/youtube-dl/pull/7209)
+ vimeo_clip_page_config = self._search_regex(
+ r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage,
+ 'vimeo clip page config')
+ config_url = self._parse_json(
+ vimeo_clip_page_config, video_id)['player']['config_url']
config_json = self._download_webpage(config_url, video_id)
config = json.loads(config_json)
except RegexNotFoundError:
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index c733a48fa..be72f3147 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -1,10 +1,14 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
class VineIE(InfoExtractor):
@@ -17,10 +21,12 @@ class VineIE(InfoExtractor):
'ext': 'mp4',
'title': 'Chicken.',
'alt_title': 'Vine by Jack Dorsey',
- 'description': 'Chicken.',
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
'uploader_id': '76',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
'url': 'https://vine.co/v/MYxVapFvz2z',
@@ -29,11 +35,13 @@ class VineIE(InfoExtractor):
'id': 'MYxVapFvz2z',
'ext': 'mp4',
'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14',
- 'alt_title': 'Vine by Luna',
- 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14',
+ 'alt_title': 'Vine by Mars Ruiz',
'upload_date': '20140815',
- 'uploader': 'Luna',
+ 'uploader': 'Mars Ruiz',
'uploader_id': '1102363502380728320',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
'url': 'https://vine.co/v/bxVjBbZlPUH',
@@ -43,14 +51,33 @@ class VineIE(InfoExtractor):
'ext': 'mp4',
'title': '#mw3 #ac130 #killcam #angelofdeath',
'alt_title': 'Vine by Z3k3',
- 'description': '#mw3 #ac130 #killcam #angelofdeath',
'upload_date': '20130430',
'uploader': 'Z3k3',
'uploader_id': '936470460173008896',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
}, {
'url': 'https://vine.co/oembed/MYxVapFvz2z.json',
'only_matching': True,
+ }, {
+ 'url': 'https://vine.co/v/e192BnZnZ9V',
+ 'info_dict': {
+ 'id': 'e192BnZnZ9V',
+ 'ext': 'mp4',
+ 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2',
+ 'alt_title': 'Vine by Pimry_zaa',
+ 'upload_date': '20150705',
+ 'uploader': 'Pimry_zaa',
+ 'uploader_id': '1135760698325307392',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -65,25 +92,26 @@ class VineIE(InfoExtractor):
formats = [{
'format_id': '%(format)s-%(rate)s' % f,
- 'vcodec': f['format'],
- 'quality': f['rate'],
+ 'vcodec': f.get('format'),
+ 'quality': f.get('rate'),
'url': f['videoUrl'],
- } for f in data['videoUrls']]
+ } for f in data['videoUrls'] if f.get('videoUrl')]
self._sort_formats(formats)
+ username = data.get('username')
+
return {
'id': video_id,
- 'title': self._og_search_title(webpage),
- 'alt_title': self._og_search_description(webpage, default=None),
- 'description': data['description'],
- 'thumbnail': data['thumbnailUrl'],
- 'upload_date': unified_strdate(data['created']),
- 'uploader': data['username'],
- 'uploader_id': data['userIdStr'],
- 'like_count': data['likes']['count'],
- 'comment_count': data['comments']['count'],
- 'repost_count': data['reposts']['count'],
+ 'title': data.get('description') or self._og_search_title(webpage),
+ 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None),
+ 'thumbnail': data.get('thumbnailUrl'),
+ 'upload_date': unified_strdate(data.get('created')),
+ 'uploader': username,
+ 'uploader_id': data.get('userIdStr'),
+ 'like_count': int_or_none(data.get('likes', {}).get('count')),
+ 'comment_count': int_or_none(data.get('comments', {}).get('count')),
+ 'repost_count': int_or_none(data.get('reposts', {}).get('count')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index b252e36e1..08e821362 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return
+class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
+ # Extract the video ids from the playlist pages
+ def _entries(self, page, playlist_id):
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ for video_id, video_title in self.extract_videos_from_page(content_html):
+ yield self.url_result(
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
+
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
+ more_widget_html = more['load_more_widget_html']
+
+ def extract_videos_from_page(self, page):
+ ids_in_page = []
+ titles_in_page = []
+ for mobj in re.finditer(self._VIDEO_RE, page):
+ # The link with index 0 is not the first video of the playlist (not sure if still actual)
+ if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+ continue
+ video_id = mobj.group('id')
+ video_title = unescapeHTML(mobj.group('title'))
+ if video_title:
+ video_title = video_title.strip()
+ try:
+ idx = ids_in_page.index(video_id)
+ if video_title and not titles_in_page[idx]:
+ titles_in_page[idx] = video_title
+ except ValueError:
+ ids_in_page.append(video_id)
+ titles_in_page.append(video_title)
+ return zip(ids_in_page, titles_in_page)
+
+
class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^
@@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
-class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com playlists'
_VALID_URL = r"""(?x)(?:
(?:https?://)?
@@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
)"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
else:
self.report_warning('Youtube gives an alert message: ' + match)
- # Extract the video ids from the playlist pages
- def _entries():
- more_widget_html = content_html = page
- for page_num in itertools.count(1):
- matches = re.finditer(self._VIDEO_RE, content_html)
- # We remove the duplicates and the link with index 0
- # (it's not the first video of the playlist)
- new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
- for vid_id in new_ids:
- yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
-
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
-
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
-
playlist_title = self._html_search_regex(
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
page, 'title')
- return self.playlist_result(_entries(), playlist_id, playlist_title)
+ return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
def _real_extract(self, url):
# Extract playlist id
@@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self._extract_playlist(playlist_id)
-class YoutubeChannelIE(InfoExtractor):
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com channels'
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+ _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
IE_NAME = 'youtube:channel'
_TESTS = [{
'note': 'paginated channel',
@@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor):
}
}]
- @staticmethod
- def extract_videos_from_page(page):
- ids_in_page = []
- titles_in_page = []
- for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
- video_id = mobj.group('id')
- video_title = unescapeHTML(mobj.group('title'))
- try:
- idx = ids_in_page.index(video_id)
- if video_title and not titles_in_page[idx]:
- titles_in_page[idx] = video_title
- except ValueError:
- ids_in_page.append(video_id)
- titles_in_page.append(video_title)
- return zip(ids_in_page, titles_in_page)
-
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor):
for video_id, video_title in self.extract_videos_from_page(channel_page)]
return self.playlist_result(entries, channel_id)
- def _entries():
- more_widget_html = content_html = channel_page
- for pagenum in itertools.count(1):
-
- for video_id, video_title in self.extract_videos_from_page(content_html):
- yield self.url_result(
- video_id, 'Youtube', video_id=video_id,
- video_title=video_title)
-
- mobj = re.search(
- r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
- more_widget_html)
- if not mobj:
- break
-
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), channel_id,
- 'Downloading page #%s' % (pagenum + 1),
- transform_source=uppercase_escape)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
-
- return self.playlist_result(_entries(), channel_id)
+ return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
class YoutubeUserIE(YoutubeChannelIE):