aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/aenetworks.py206
-rw-r--r--youtube_dl/extractor/amp.py18
-rw-r--r--youtube_dl/extractor/animeondemand.py110
-rw-r--r--youtube_dl/extractor/appletrailers.py53
-rw-r--r--youtube_dl/extractor/ard.py16
-rw-r--r--youtube_dl/extractor/arte.py1
-rw-r--r--youtube_dl/extractor/brightcove.py33
-rw-r--r--youtube_dl/extractor/buzzfeed.py32
-rw-r--r--youtube_dl/extractor/cbsinteractive.py5
-rw-r--r--youtube_dl/extractor/cliprs.py73
-rw-r--r--youtube_dl/extractor/common.py44
-rw-r--r--youtube_dl/extractor/ctv.py30
-rw-r--r--youtube_dl/extractor/ctvnews.py65
-rw-r--r--youtube_dl/extractor/dailymotion.py15
-rw-r--r--youtube_dl/extractor/daum.py20
-rw-r--r--youtube_dl/extractor/dcn.py47
-rw-r--r--youtube_dl/extractor/eagleplatform.py8
-rw-r--r--youtube_dl/extractor/extractors.py32
-rw-r--r--youtube_dl/extractor/facebook.py38
-rw-r--r--youtube_dl/extractor/flipagram.py115
-rw-r--r--youtube_dl/extractor/francetv.py28
-rw-r--r--youtube_dl/extractor/fusion.py35
-rw-r--r--youtube_dl/extractor/gametrailers.py62
-rw-r--r--youtube_dl/extractor/generic.py188
-rw-r--r--youtube_dl/extractor/hrti.py202
-rw-r--r--youtube_dl/extractor/iqiyi.py385
-rw-r--r--youtube_dl/extractor/kaltura.py75
-rw-r--r--youtube_dl/extractor/kamcord.py71
-rw-r--r--youtube_dl/extractor/kuwo.py7
-rw-r--r--youtube_dl/extractor/la7.py85
-rw-r--r--youtube_dl/extractor/leeco.py150
-rw-r--r--youtube_dl/extractor/lynda.py2
-rw-r--r--youtube_dl/extractor/m6.py35
-rw-r--r--youtube_dl/extractor/meta.py73
-rw-r--r--youtube_dl/extractor/metacafe.py89
-rw-r--r--youtube_dl/extractor/mgtv.py3
-rw-r--r--youtube_dl/extractor/mitele.py115
-rw-r--r--youtube_dl/extractor/mixcloud.py4
-rw-r--r--youtube_dl/extractor/msn.py122
-rw-r--r--youtube_dl/extractor/nationalgeographic.py15
-rw-r--r--youtube_dl/extractor/nbc.py157
-rw-r--r--youtube_dl/extractor/nick.py5
-rw-r--r--youtube_dl/extractor/ninecninemedia.py55
-rw-r--r--youtube_dl/extractor/onet.py172
-rw-r--r--youtube_dl/extractor/onionstudios.py59
-rw-r--r--youtube_dl/extractor/pbs.py7
-rw-r--r--youtube_dl/extractor/periscope.py7
-rw-r--r--youtube_dl/extractor/pladform.py2
-rw-r--r--youtube_dl/extractor/polskieradio.py99
-rw-r--r--youtube_dl/extractor/pornhub.py30
-rw-r--r--youtube_dl/extractor/prosiebensat1.py167
-rw-r--r--youtube_dl/extractor/radiocanada.py44
-rw-r--r--youtube_dl/extractor/rai.py286
-rw-r--r--youtube_dl/extractor/rds.py27
-rw-r--r--youtube_dl/extractor/roosterteeth.py148
-rw-r--r--youtube_dl/extractor/rtvnh.py34
-rw-r--r--youtube_dl/extractor/sandia.py100
-rw-r--r--youtube_dl/extractor/sixplay.py64
-rw-r--r--youtube_dl/extractor/skynewsarabia.py2
-rw-r--r--youtube_dl/extractor/skysports.py33
-rw-r--r--youtube_dl/extractor/slideshare.py5
-rw-r--r--youtube_dl/extractor/sohu.py16
-rw-r--r--youtube_dl/extractor/spiegel.py22
-rw-r--r--youtube_dl/extractor/srmediathek.py7
-rw-r--r--youtube_dl/extractor/stitcher.py2
-rw-r--r--youtube_dl/extractor/svt.py89
-rw-r--r--youtube_dl/extractor/telecinco.py85
-rw-r--r--youtube_dl/extractor/tf1.py2
-rw-r--r--youtube_dl/extractor/theplatform.py108
-rw-r--r--youtube_dl/extractor/threeqsdn.py9
-rw-r--r--youtube_dl/extractor/toutv.py71
-rw-r--r--youtube_dl/extractor/tvp.py59
-rw-r--r--youtube_dl/extractor/tweakers.py49
-rw-r--r--youtube_dl/extractor/twitch.py2
-rw-r--r--youtube_dl/extractor/urplay.py67
-rw-r--r--youtube_dl/extractor/vidbit.py84
-rw-r--r--youtube_dl/extractor/vimeo.py138
-rw-r--r--youtube_dl/extractor/vine.py9
-rw-r--r--youtube_dl/extractor/vk.py43
-rw-r--r--youtube_dl/extractor/vrt.py61
-rw-r--r--youtube_dl/extractor/xnxx.py22
-rw-r--r--youtube_dl/extractor/xtube.py49
-rw-r--r--youtube_dl/extractor/xuite.py30
-rw-r--r--youtube_dl/extractor/yahoo.py8
-rw-r--r--youtube_dl/extractor/youku.py9
-rw-r--r--youtube_dl/extractor/youtube.py100
86 files changed, 3758 insertions, 1593 deletions
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index 1bbfe2641..8f53050c9 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -2,41 +2,33 @@ from __future__ import unicode_literals
import re
-from .common import InfoExtractor
+from .theplatform import ThePlatformIE
from ..utils import (
smuggle_url,
update_url_query,
unescapeHTML,
+ extract_attributes,
+ get_element_by_attribute,
)
+from ..compat import (
+ compat_urlparse,
+)
+
+class AENetworksBaseIE(ThePlatformIE):
+ _THEPLATFORM_KEY = 'crazyjava'
+ _THEPLATFORM_SECRET = 's3cr3t'
-class AENetworksIE(InfoExtractor):
+
+class AENetworksIE(AENetworksBaseIE):
IE_NAME = 'aenetworks'
IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
- _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
-
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)'
_TESTS = [{
- 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
- 'info_dict': {
- 'id': 'g12m5Gyt3fdR',
- 'ext': 'mp4',
- 'title': "Bet You Didn't Know: Valentine's Day",
- 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
- 'timestamp': 1375819729,
- 'upload_date': '20130806',
- 'uploader': 'AENE-NEW',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- 'add_ie': ['ThePlatform'],
- 'expected_warnings': ['JSON-LD'],
- }, {
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
'md5': '8ff93eb073449f151d6b90c0ae1ef0c7',
'info_dict': {
- 'id': 'eg47EERs_JsZ',
+ 'id': '22253814',
'ext': 'mp4',
'title': 'Winter Is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
@@ -46,42 +38,168 @@ class AENetworksIE(InfoExtractor):
},
'add_ie': ['ThePlatform'],
}, {
- 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry',
+ 'url': 'http://www.history.com/shows/ancient-aliens/season-1',
+ 'info_dict': {
+ 'id': '71889446852',
+ },
+ 'playlist_mincount': 5,
+ }, {
+ 'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
+ 'info_dict': {
+ 'id': 'SERIES4317',
+ 'title': 'Atlanta Plastic',
+ },
+ 'playlist_mincount': 2,
+ }, {
+ 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
'only_matching': True
}, {
- 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage',
+ 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
'only_matching': True
}, {
- 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients',
+ 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6',
+ 'only_matching': True
+ }, {
+ 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
'only_matching': True
}]
+ _DOMAIN_TO_REQUESTOR_ID = {
+ 'history.com': 'HISTORY',
+ 'aetv.com': 'AETV',
+ 'mylifetime.com': 'LIFETIME',
+ 'fyi.tv': 'FYI',
+ }
def _real_extract(self, url):
- page_type, video_id = re.match(self._VALID_URL, url).groups()
+ domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups()
+ display_id = show_path or movie_display_id
+ webpage = self._download_webpage(url, display_id)
+ if show_path:
+ url_parts = show_path.split('/')
+ url_parts_len = len(url_parts)
+ if url_parts_len == 1:
+ entries = []
+ for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
+ entries.append(self.url_result(
+ compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
+ return self.playlist_result(
+ entries, self._html_search_meta('aetn:SeriesId', webpage),
+ self._html_search_meta('aetn:SeriesTitle', webpage))
+ elif url_parts_len == 2:
+ entries = []
+ for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage):
+ episode_attributes = extract_attributes(episode_item)
+ episode_url = compat_urlparse.urljoin(
+ url, episode_attributes['data-canonical'])
+ entries.append(self.url_result(
+ episode_url, 'AENetworks',
+ episode_attributes['data-videoid']))
+ return self.playlist_result(
+ entries, self._html_search_meta('aetn:SeasonId', webpage))
+
+ query = {
+ 'mbr': 'true',
+ 'assetTypes': 'medium_video_s3'
+ }
+ video_id = self._html_search_meta('aetn:VideoID', webpage)
+ media_url = self._search_regex(
+ r"media_url\s*=\s*'([^']+)'", webpage, 'video url')
+ theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+ r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
+ info = self._parse_theplatform_metadata(theplatform_metadata)
+ if theplatform_metadata.get('AETN$isBehindWall'):
+ requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
+ resource = '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"><channel><title>%s</title><item><title>%s</title><guid>%s</guid><media:rating scheme="urn:v-chip">%s</media:rating></item></channel></rss>' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating'])
+ query['auth'] = self._extract_mvpd_auth(
+ url, video_id, requestor_id, resource)
+ info.update(self._search_json_ld(webpage, video_id, fatal=False))
+ media_url = update_url_query(media_url, query)
+ media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET)
+ formats, subtitles = self._extract_theplatform_smil(media_url, video_id)
+ self._sort_formats(formats)
+ info.update({
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ return info
- webpage = self._download_webpage(url, video_id)
- video_url_re = [
- r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id,
- r"media_url\s*=\s*'([^']+)'"
- ]
- video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url'))
- query = {'mbr': 'true'}
- if page_type == 'shows':
- query['assetTypes'] = 'medium_video_s3'
- if 'switch=hds' in video_url:
- query['switch'] = 'hls'
+class HistoryTopicIE(AENetworksBaseIE):
+ IE_NAME = 'history:topic'
+ IE_DESC = 'History.com Topic'
+ _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?'
+ _TESTS = [{
+ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
+ 'info_dict': {
+ 'id': '40700995724',
+ 'ext': 'mp4',
+ 'title': "Bet You Didn't Know: Valentine's Day",
+ 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
+ 'timestamp': 1375819729,
+ 'upload_date': '20130806',
+ 'uploader': 'AENE-NEW',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'add_ie': ['ThePlatform'],
+ }, {
+ 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos',
+ 'info_dict':
+ {
+ 'id': 'world-war-i-history',
+ 'title': 'World War I History',
+ },
+ 'playlist_mincount': 24,
+ }, {
+ 'url': 'http://www.history.com/topics/world-war-i-history/videos',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches',
+ 'only_matching': True,
+ }]
- info = self._search_json_ld(webpage, video_id, fatal=False)
- info.update({
+ def theplatform_url_result(self, theplatform_url, video_id, query):
+ return {
'_type': 'url_transparent',
+ 'id': video_id,
'url': smuggle_url(
- update_url_query(video_url, query),
+ update_url_query(theplatform_url, query),
{
'sig': {
- 'key': 'crazyjava',
- 'secret': 's3cr3t'},
+ 'key': self._THEPLATFORM_KEY,
+ 'secret': self._THEPLATFORM_SECRET,
+ },
'force_smil_url': True
}),
- })
- return info
+ 'ie_key': 'ThePlatform',
+ }
+
+ def _real_extract(self, url):
+ topic_id, video_display_id = re.match(self._VALID_URL, url).groups()
+ if video_display_id:
+ webpage = self._download_webpage(url, video_display_id)
+ release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups()
+ release_url = unescapeHTML(release_url)
+
+ return self.theplatform_url_result(
+ release_url, video_id, {
+ 'mbr': 'true',
+ 'switch': 'hls'
+ })
+ else:
+ webpage = self._download_webpage(url, topic_id)
+ entries = []
+ for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage):
+ video_attributes = extract_attributes(episode_item)
+ entries.append(self.theplatform_url_result(
+ video_attributes['data-release-url'], video_attributes['data-id'], {
+ 'mbr': 'true',
+ 'switch': 'hls'
+ }))
+ return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage))
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index 8545681be..e8e40126b 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -5,6 +5,8 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
+ mimetype2ext,
+ determine_ext,
)
@@ -50,21 +52,25 @@ class AMPIE(InfoExtractor):
if isinstance(media_content, dict):
media_content = [media_content]
for media_data in media_content:
- media = media_data['@attributes']
- media_type = media['type']
- if media_type in ('video/f4m', 'application/f4m+xml'):
+ media = media_data.get('@attributes', {})
+ media_url = media.get('url')
+ if not media_url:
+ continue
+ ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
+ if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
- media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
+ media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
video_id, f4m_id='hds', fatal=False))
- elif media_type == 'application/x-mpegURL':
+ elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
+ media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
formats.append({
'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
'url': media['url'],
'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')),
+ 'ext': ext,
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py
index 9b01e38f5..9e28f2579 100644
--- a/youtube_dl/extractor/animeondemand.py
+++ b/youtube_dl/extractor/animeondemand.py
@@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor):
_APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
_NETRC_MACHINE = 'animeondemand'
_TESTS = [{
+ # jap, OmU
'url': 'https://www.anime-on-demand.de/anime/161',
'info_dict': {
'id': '161',
@@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor):
},
'playlist_mincount': 4,
}, {
- # Film wording is used instead of Episode
+ # Film wording is used instead of Episode, ger/jap, Dub/OmU
'url': 'https://www.anime-on-demand.de/anime/39',
'only_matching': True,
}, {
- # Episodes without titles
+ # Episodes without titles, jap, OmU
'url': 'https://www.anime-on-demand.de/anime/162',
'only_matching': True,
}, {
# ger/jap, Dub/OmU, account required
'url': 'https://www.anime-on-demand.de/anime/169',
'only_matching': True,
+ }, {
+ # Full length film, non-series, ger/jap, Dub/OmU, account required
+ 'url': 'https://www.anime-on-demand.de/anime/185',
+ 'only_matching': True,
}]
def _login(self):
@@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor):
entries = []
- for num, episode_html in enumerate(re.findall(
- r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1):
- episodebox_title = self._search_regex(
- (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
- r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
- episode_html, 'episodebox title', default=None, group='title')
- if not episodebox_title:
- continue
-
- episode_number = int(self._search_regex(
- r'(?:Episode|Film)\s*(\d+)',
- episodebox_title, 'episode number', default=num))
- episode_title = self._search_regex(
- r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
- episodebox_title, 'episode title', default=None)
-
- video_id = 'episode-%d' % episode_number
-
- common_info = {
- 'id': video_id,
- 'series': anime_title,
- 'episode': episode_title,
- 'episode_number': episode_number,
- }
-
+ def extract_info(html, video_id, num=None):
+ title, description = [None] * 2
formats = []
for input_ in re.findall(
- r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html):
+ r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html):
attributes = extract_attributes(input_)
playlist_urls = []
for playlist_key in ('data-playlist', 'data-otherplaylist'):
@@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor):
format_id_list.append(lang)
if kind:
format_id_list.append(kind)
- if not format_id_list:
+ if not format_id_list and num is not None:
format_id_list.append(compat_str(num))
format_id = '-'.join(format_id_list)
format_note = ', '.join(filter(None, (kind, lang_note)))
@@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor):
})
formats.extend(file_formats)
- if formats:
- self._sort_formats(formats)
+ return {
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ }
+
+ def extract_entries(html, video_id, common_info, num=None):
+ info = extract_info(html, video_id, num)
+
+ if info['formats']:
+ self._sort_formats(info['formats'])
f = common_info.copy()
- f.update({
- 'title': title,
- 'description': description,
- 'formats': formats,
- })
+ f.update(info)
entries.append(f)
- # Extract teaser only when full episode is not available
- if not formats:
+ # Extract teaser/trailer only when full episode is not available
+ if not info['formats']:
m = re.search(
- r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<',
- episode_html)
+ r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<',
+ html)
if m:
f = common_info.copy()
f.update({
- 'id': '%s-teaser' % f['id'],
+ 'id': '%s-%s' % (f['id'], m.group('kind').lower()),
'title': m.group('title'),
'url': compat_urlparse.urljoin(url, m.group('href')),
})
entries.append(f)
+ def extract_episodes(html):
+ for num, episode_html in enumerate(re.findall(
+ r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1):
+ episodebox_title = self._search_regex(
+ (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
+ r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
+ episode_html, 'episodebox title', default=None, group='title')
+ if not episodebox_title:
+ continue
+
+ episode_number = int(self._search_regex(
+ r'(?:Episode|Film)\s*(\d+)',
+ episodebox_title, 'episode number', default=num))
+ episode_title = self._search_regex(
+ r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
+ episodebox_title, 'episode title', default=None)
+
+ video_id = 'episode-%d' % episode_number
+
+ common_info = {
+ 'id': video_id,
+ 'series': anime_title,
+ 'episode': episode_title,
+ 'episode_number': episode_number,
+ }
+
+ extract_entries(episode_html, video_id, common_info)
+
+ def extract_film(html, video_id):
+ common_info = {
+ 'id': anime_id,
+ 'title': anime_title,
+ 'description': anime_description,
+ }
+ extract_entries(html, video_id, common_info)
+
+ extract_episodes(webpage)
+
+ if not entries:
+ extract_film(webpage, anime_id)
+
return self.playlist_result(entries, anime_id, anime_title, anime_description)
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index be40f85b4..a6801f3d4 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -7,6 +7,8 @@ from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
int_or_none,
+ parse_duration,
+ unified_strdate,
)
@@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor):
_TESTS = [{
'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
'info_dict': {
- 'id': 'manofsteel',
+ 'id': '5111',
+ 'title': 'Man of Steel',
},
'playlist': [
{
@@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor):
'id': 'blackthorn',
},
'playlist_mincount': 2,
+ 'expected_warnings': ['Unable to download JSON metadata'],
+ }, {
+ # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
+ 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
+ 'info_dict': {
+ 'id': '15881',
+ 'title': 'Kung Fu Panda 3',
+ },
+ 'playlist_mincount': 4,
}, {
'url': 'http://trailers.apple.com/ca/metropole/autrui/',
'only_matching': True,
@@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor):
movie = mobj.group('movie')
uploader_id = mobj.group('company')
+ webpage = self._download_webpage(url, movie)
+ film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
+ film_data = self._download_json(
+ 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
+ film_id, fatal=False)
+
+ if film_data:
+ entries = []
+ for clip in film_data.get('clips', []):
+ clip_title = clip['title']
+
+ formats = []
+ for version, version_data in clip.get('versions', {}).items():
+ for size, size_data in version_data.get('sizes', {}).items():
+ src = size_data.get('src')
+ if not src:
+ continue
+ formats.append({
+ 'format_id': '%s-%s' % (version, size),
+ 'url': re.sub(r'_(\d+p.mov)', r'_h\1', src),
+ 'width': int_or_none(size_data.get('width')),
+ 'height': int_or_none(size_data.get('height')),
+ 'language': version[:2],
+ })
+ self._sort_formats(formats)
+
+ entries.append({
+ 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
+ 'formats': formats,
+ 'title': clip_title,
+ 'thumbnail': clip.get('screen') or clip.get('thumb'),
+ 'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
+ 'upload_date': unified_strdate(clip.get('posted')),
+ 'uploader_id': uploader_id,
+ })
+
+ page_data = film_data.get('page', {})
+ return self.playlist_result(entries, film_id, page_data.get('movie_title'))
+
playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
def fix_html(s):
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index fd45b3e42..13a06396d 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -13,6 +13,7 @@ from ..utils import (
parse_duration,
unified_strdate,
xpath_text,
+ update_url_query,
)
from ..compat import compat_etree_fromstring
@@ -34,6 +35,7 @@ class ARDMediathekIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
+ 'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
@@ -44,6 +46,7 @@ class ARDMediathekIE(InfoExtractor):
'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
'duration': 5252,
},
+ 'skip': 'HTTP Error 404: Not Found',
}, {
# audio
'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
@@ -55,6 +58,7 @@ class ARDMediathekIE(InfoExtractor):
'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
'duration': 3240,
},
+ 'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
'only_matching': True,
@@ -113,11 +117,14 @@ class ARDMediathekIE(InfoExtractor):
continue
if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
- stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
- video_id, preference=-1, f4m_id='hds', fatal=False))
+ update_url_query(stream_url, {
+ 'hdcore': '3.1.1',
+ 'plugin': 'aasp-3.1.1.69.124'
+ }),
+ video_id, f4m_id='hds', fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False))
+ stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
if server and server.startswith('rtmp'):
f = {
@@ -231,7 +238,8 @@ class ARDIE(InfoExtractor):
'title': 'Die Story im Ersten: Mission unter falscher Flagge',
'upload_date': '20140804',
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'skip': 'HTTP Error 404: Not Found',
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 049f1fa9e..e0c5c1804 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -419,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
'info_dict': {
'id': 'PL-013263',
'title': 'Areva & Uramin',
+ 'description': 'md5:a1dc0312ce357c262259139cfd48c9bf',
},
'playlist_mincount': 6,
}, {
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index ef560b592..57ce0c174 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -90,6 +90,7 @@ class BrightcoveLegacyIE(InfoExtractor):
'description': 'md5:363109c02998fee92ec02211bd8000df',
'uploader': 'National Ballet of Canada',
},
+ 'skip': 'Video gone',
},
{
# test flv videos served by akamaihd.net
@@ -108,7 +109,7 @@ class BrightcoveLegacyIE(InfoExtractor):
},
},
{
- # playlist test
+ # playlist with 'videoList'
# from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
'info_dict': {
@@ -117,6 +118,15 @@ class BrightcoveLegacyIE(InfoExtractor):
},
'playlist_mincount': 7,
},
+ {
+ # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965)
+ 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
+ 'info_dict': {
+ 'id': '1522758701001',
+ 'title': 'Lesson 08',
+ },
+ 'playlist_mincount': 10,
+ },
]
FLV_VCODECS = {
1: 'SORENSON',
@@ -298,13 +308,19 @@ class BrightcoveLegacyIE(InfoExtractor):
info_url, player_key, 'Downloading playlist information')
json_data = json.loads(playlist_info)
- if 'videoList' not in json_data:
+ if 'videoList' in json_data:
+ playlist_info = json_data['videoList']
+ playlist_dto = playlist_info['mediaCollectionDTO']
+ elif 'playlistTabs' in json_data:
+ playlist_info = json_data['playlistTabs']
+ playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0]
+ else:
raise ExtractorError('Empty playlist')
- playlist_info = json_data['videoList']
- videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+
+ videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']]
return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
- playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+ playlist_title=playlist_dto['displayName'])
def _extract_video_info(self, video_info):
video_id = compat_str(video_info['id'])
@@ -585,6 +601,13 @@ class BrightcoveNewIE(InfoExtractor):
'format_id': build_format_id('rtmp'),
})
formats.append(f)
+
+ errors = json_data.get('errors')
+ if not formats and errors:
+ error = errors[0]
+ raise ExtractorError(
+ error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+
self._sort_formats(formats)
subtitles = {}
diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py
index df503ecc0..75fa92d7c 100644
--- a/youtube_dl/extractor/buzzfeed.py
+++ b/youtube_dl/extractor/buzzfeed.py
@@ -5,6 +5,7 @@ import json
import re
from .common import InfoExtractor
+from .facebook import FacebookIE
class BuzzFeedIE(InfoExtractor):
@@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor):
'info_dict': {
'id': 'aVCR29aE_OQ',
'ext': 'mp4',
+ 'title': 'Angry Ram destroys a punching bag..',
+ 'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
'upload_date': '20141024',
'uploader_id': 'Buddhanz1',
- 'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl',
- 'uploader': 'Buddhanz',
- 'title': 'Angry Ram destroys a punching bag',
+ 'uploader': 'Angry Ram',
}
}]
}, {
@@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor):
'info_dict': {
'id': 'mVmBL8B-In0',
'ext': 'mp4',
+ 'title': 're:Munchkin the Teddy Bear gets her exercise',
+ 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
'upload_date': '20141124',
'uploader_id': 'CindysMunchkin',
- 'description': 're:© 2014 Munchkin the',
'uploader': 're:^Munchkin the',
- 'title': 're:Munchkin the Teddy Bear gets her exercise',
},
}]
+ }, {
+ 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
+ 'info_dict': {
+ 'id': 'the-most-adorable-crash-landing-ever',
+ 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
+ 'description': 'This gosling knows how to stick a landing.',
+ },
+ 'playlist': [{
+ 'md5': '763ca415512f91ca62e4621086900a23',
+ 'info_dict': {
+ 'id': '971793786185728',
+ 'ext': 'mp4',
+ 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
+ 'uploader': 'Calgary Outdoor Centre-University of Calgary',
+ },
+ }],
+ 'add_ie': ['Facebook'],
}]
def _real_extract(self, url):
@@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor):
continue
entries.append(self.url_result(video['url']))
+ facebook_url = FacebookIE._extract_url(webpage)
+ if facebook_url:
+ entries.append(self.url_result(facebook_url))
+
return {
'_type': 'playlist',
'id': playlist_id,
diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py
index 0011c3029..821db20b2 100644
--- a/youtube_dl/extractor/cbsinteractive.py
+++ b/youtube_dl/extractor/cbsinteractive.py
@@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE):
media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId'])
formats, subtitles = [], {}
- if site == 'cnet':
- formats, subtitles = self._extract_theplatform_smil(
- self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id)
for (fkey, vid) in vdata['files'].items():
if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
continue
@@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE):
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
self._sort_formats(formats)
- info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id)
+ info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id)
info.update({
'id': video_id,
'display_id': display_id,
diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py
index 4f9320ea5..d55b26d59 100644
--- a/youtube_dl/extractor/cliprs.py
+++ b/youtube_dl/extractor/cliprs.py
@@ -1,16 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- float_or_none,
- int_or_none,
- parse_iso8601,
-)
+from .onet import OnetBaseIE
-class ClipRsIE(InfoExtractor):
+class ClipRsIE(OnetBaseIE):
_VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'
_TEST = {
'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
@@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
+ display_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+ mvp_id = self._search_mvp_id(webpage)
- response = self._download_json(
- 'http://qi.ckm.onetapi.pl/', video_id,
- query={
- 'body[id]': video_id,
- 'body[jsonrpc]': '2.0',
- 'body[method]': 'get_asset_detail',
- 'body[params][ID_Publikacji]': video_id,
- 'body[params][Service]': 'www.onet.pl',
- 'content-type': 'application/jsonp',
- 'x-onet-app': 'player.front.onetapi.pl',
- })
+ info_dict = self._extract_from_id(mvp_id, webpage)
+ info_dict['display_id'] = display_id
- error = response.get('error')
- if error:
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
-
- video = response['result'].get('0')
-
- formats = []
- for _, formats_dict in video['formats'].items():
- if not isinstance(formats_dict, dict):
- continue
- for format_id, format_list in formats_dict.items():
- if not isinstance(format_list, list):
- continue
- for f in format_list:
- if not f.get('url'):
- continue
- formats.append({
- 'url': f['url'],
- 'format_id': format_id,
- 'height': int_or_none(f.get('vertical_resolution')),
- 'width': int_or_none(f.get('horizontal_resolution')),
- 'abr': float_or_none(f.get('audio_bitrate')),
- 'vbr': float_or_none(f.get('video_bitrate')),
- })
- self._sort_formats(formats)
-
- meta = video.get('meta', {})
-
- title = self._og_search_title(webpage, default=None) or meta['title']
- description = self._og_search_description(webpage, default=None) or meta.get('description')
- duration = meta.get('length') or meta.get('lenght')
- timestamp = parse_iso8601(meta.get('addDate'), ' ')
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'timestamp': timestamp,
- 'formats': formats,
- }
+ return info_dict
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 661889593..df546da27 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,6 +44,7 @@ from ..utils import (
sanitized_Request,
unescapeHTML,
unified_strdate,
+ unified_timestamp,
url_basename,
xpath_element,
xpath_text,
@@ -163,6 +164,7 @@ class InfoExtractor(object):
* "height" (optional, int)
* "resolution" (optional, string "{width}x{height"},
deprecated)
+ * "filesize" (optional, int)
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
@@ -751,10 +753,12 @@ class InfoExtractor(object):
return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
+ if not isinstance(name, (list, tuple)):
+ name = [name]
if display_name is None:
- display_name = name
+ display_name = name[0]
return self._html_search_regex(
- self._meta_regex(name),
+ [self._meta_regex(n) for n in name],
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -803,15 +807,17 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
- def _search_json_ld(self, html, video_id, **kwargs):
+ def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
json_ld = self._search_regex(
r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
html, 'JSON-LD', group='json_ld', **kwargs)
if not json_ld:
return {}
- return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
+ return self._json_ld(
+ json_ld, video_id, fatal=kwargs.get('fatal', True),
+ expected_type=expected_type)
- def _json_ld(self, json_ld, video_id, fatal=True):
+ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
if isinstance(json_ld, compat_str):
json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
if not json_ld:
@@ -819,6 +825,8 @@ class InfoExtractor(object):
info = {}
if json_ld.get('@context') == 'http://schema.org':
item_type = json_ld.get('@type')
+ if expected_type is not None and expected_type != item_type:
+ return info
if item_type == 'TVEpisode':
info.update({
'episode': unescapeHTML(json_ld.get('name')),
@@ -837,6 +845,19 @@ class InfoExtractor(object):
'title': unescapeHTML(json_ld.get('headline')),
'description': unescapeHTML(json_ld.get('articleBody')),
})
+ elif item_type == 'VideoObject':
+ info.update({
+ 'url': json_ld.get('contentUrl'),
+ 'title': unescapeHTML(json_ld.get('name')),
+ 'description': unescapeHTML(json_ld.get('description')),
+ 'thumbnail': json_ld.get('thumbnailUrl'),
+ 'duration': parse_duration(json_ld.get('duration')),
+ 'timestamp': unified_timestamp(json_ld.get('uploadDate')),
+ 'filesize': float_or_none(json_ld.get('contentSize')),
+ 'tbr': int_or_none(json_ld.get('bitrate')),
+ 'width': int_or_none(json_ld.get('width')),
+ 'height': int_or_none(json_ld.get('height')),
+ })
return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod
@@ -878,7 +899,11 @@ class InfoExtractor(object):
f['ext'] = determine_ext(f['url'])
if isinstance(field_preference, (list, tuple)):
- return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+ return tuple(
+ f.get(field)
+ if f.get(field) is not None
+ else ('' if field == 'format_id' else -1)
+ for field in field_preference)
preference = f.get('preference')
if preference is None:
@@ -1781,6 +1806,13 @@ class InfoExtractor(object):
def _mark_watched(self, *args, **kwargs):
raise NotImplementedError('This method must be implemented by subclasses')
+ def geo_verification_headers(self):
+ headers = {}
+ geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+ if geo_verification_proxy:
+ headers['Ytdl-request-proxy'] = geo_verification_proxy
+ return headers
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py
new file mode 100644
index 000000000..5807fbac9
--- /dev/null
+++ b/youtube_dl/extractor/ctv.py
@@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class CTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)'
+ _TESTS = [{
+ 'url': 'http://www.ctv.ca/video/player?vid=706966',
+ 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0',
+ 'info_dict': {
+ 'id': '706966',
+ 'ext': 'mp4',
+ 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'',
+ 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.',
+ 'upload_date': '20150919',
+ 'timestamp': 1442624700,
+ },
+ 'expected_warnings': ['HTTP Error 404'],
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'url': '9c9media:ctv_web:%s' % video_id,
+ 'ie_key': 'NineCNineMedia',
+ }
diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py
new file mode 100644
index 000000000..1023b6130
--- /dev/null
+++ b/youtube_dl/extractor/ctvnews.py
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import orderedSet
+
+
+class CTVNewsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
+ _TESTS = [{
+ 'url': 'http://www.ctvnews.ca/video?clipId=901995',
+ 'md5': '10deb320dc0ccb8d01d34d12fc2ea672',
+ 'info_dict': {
+ 'id': '901995',
+ 'ext': 'mp4',
+ 'title': 'Extended: \'That person cannot be me\' Johnson says',
+ 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
+ 'timestamp': 1467286284,
+ 'upload_date': '20160630',
+ }
+ }, {
+ 'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224',
+ 'info_dict':
+ {
+ 'id': '1.2966224',
+ },
+ 'playlist_mincount': 19,
+ }, {
+ 'url': 'http://www.ctvnews.ca/video?binId=1.2876780',
+ 'info_dict':
+ {
+ 'id': '1.2876780',
+ },
+ 'playlist_mincount': 100,
+ }, {
+ 'url': 'http://www.ctvnews.ca/1.810401',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ page_id = self._match_id(url)
+
+ def ninecninemedia_url_result(clip_id):
+ return {
+ '_type': 'url_transparent',
+ 'id': clip_id,
+ 'url': '9c9media:ctvnews_web:%s' % clip_id,
+ 'ie_key': 'NineCNineMedia',
+ }
+
+ if page_id.isdigit():
+ return ninecninemedia_url_result(page_id)
+ else:
+ webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={
+ 'ot': 'example.AjaxPageLayout.ot',
+ 'maxItemsPerPage': 1000000,
+ })
+ entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
+ re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
+ return self.playlist_result(entries, page_id)
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 2e6226ea0..1f92823b7 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -16,6 +16,7 @@ from ..utils import (
sanitized_Request,
str_to_int,
unescapeHTML,
+ mimetype2ext,
)
@@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
}
]
+ @staticmethod
+ def _extract_urls(webpage):
+ # Look for embedded Dailymotion player
+ matches = re.findall(
+ r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+ return list(map(lambda m: unescapeHTML(m[1]), matches))
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
type_ = media.get('type')
if type_ == 'application/vnd.lumberjack.manifest':
continue
- ext = determine_ext(media_url)
- if type_ == 'application/x-mpegURL' or ext == 'm3u8':
+ ext = mimetype2ext(type_) or determine_ext(media_url)
+ if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', preference=-1,
m3u8_id='hls', fatal=False))
- elif type_ == 'application/f4m' or ext == 'f4m':
+ elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
else:
f = {
'url': media_url,
'format_id': 'http-%s' % quality,
+ 'ext': ext,
}
m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
if m:
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index 86024a745..b5c310ccb 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -66,22 +66,32 @@ class DaumIE(InfoExtractor):
'view_count': int,
'comment_count': int,
},
+ }, {
+ # Requires dte_type=WEB (#9972)
+ 'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU',
+ 'md5': 'a8917742069a4dd442516b86e7d66529',
+ 'info_dict': {
+ 'id': 's3794Uf1NZeZ1qMpGpeqeRU',
+ 'ext': 'mp4',
+ 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611',
+ 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회',
+ 'upload_date': '20160611',
+ },
}]
def _real_extract(self, url):
video_id = compat_urllib_parse_unquote(self._match_id(url))
- query = compat_urllib_parse_urlencode({'vid': video_id})
movie_data = self._download_json(
- 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query,
- video_id, 'Downloading video formats info')
+ 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json',
+ video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'})
# For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid
if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id):
return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id)
info = self._download_xml(
- 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
- 'Downloading video info')
+ 'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id,
+ 'Downloading video info', query={'vid': video_id})
formats = []
for format_el in movie_data['output_list']['output_list']:
diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py
index 5deff5f30..efb8585e8 100644
--- a/youtube_dl/extractor/dcn.py
+++ b/youtube_dl/extractor/dcn.py
@@ -20,7 +20,7 @@ from ..utils import (
class DCNIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
def _real_extract(self, url):
show_id, video_id, season_id = re.match(self._VALID_URL, url).groups()
@@ -55,30 +55,32 @@ class DCNBaseIE(InfoExtractor):
'is_live': is_live,
}
- def _extract_video_formats(self, webpage, video_id, entry_protocol):
+ def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol):
formats = []
- m3u8_url = self._html_search_regex(
- r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False)
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None))
-
- rtsp_url = self._search_regex(
- r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
- if rtsp_url:
- formats.append({
- 'url': rtsp_url,
- 'format_id': 'rtsp',
- })
-
+ format_url_base = 'http' + self._html_search_regex(
+ [
+ r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8',
+ r'<a[^>]+href="rtsp(://[^"]+)"'
+ ], webpage, 'format url')
+ # TODO: Current DASH formats are broken - $Time$ pattern in
+ # <SegmentTemplate> not implemented yet
+ # formats.extend(self._extract_mpd_formats(
+ # format_url_base + '/manifest.mpd',
+ # video_id, mpd_id='dash', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ format_url_base + '/playlist.m3u8', video_id, 'mp4',
+ m3u8_entry_protocol, m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ format_url_base + '/manifest.f4m',
+ video_id, f4m_id='hds', fatal=False))
self._sort_formats(formats)
return formats
class DCNVideoIE(DCNBaseIE):
IE_NAME = 'dcn:video'
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
+ _TESTS = [{
'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',
'info_dict':
{
@@ -94,7 +96,10 @@ class DCNVideoIE(DCNBaseIE):
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -120,7 +125,7 @@ class DCNVideoIE(DCNBaseIE):
class DCNLiveIE(DCNBaseIE):
IE_NAME = 'dcn:live'
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -147,7 +152,7 @@ class DCNLiveIE(DCNBaseIE):
class DCNSeasonIE(InfoExtractor):
IE_NAME = 'dcn:season'
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
_TEST = {
'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',
'info_dict':
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
index 113a4966f..12d28d3b9 100644
--- a/youtube_dl/extractor/eagleplatform.py
+++ b/youtube_dl/extractor/eagleplatform.py
@@ -51,6 +51,14 @@ class EaglePlatformIE(InfoExtractor):
}]
@staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1',
+ webpage)
+ if mobj is not None:
+ return mobj.group('url')
+
+ @staticmethod
def _handle_error(response):
status = int_or_none(response.get('status', 200))
if status != 200:
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index b1b04f2fc..864c9af68 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -20,7 +20,10 @@ from .adobetv import (
AdobeTVVideoIE,
)
from .adultswim import AdultSwimIE
-from .aenetworks import AENetworksIE
+from .aenetworks import (
+ AENetworksIE,
+ HistoryTopicIE,
+)
from .afreecatv import AfreecaTVIE
from .aftonbladet import AftonbladetIE
from .airmozilla import AirMozillaIE
@@ -136,9 +139,9 @@ from .chirbit import (
ChirbitProfileIE,
)
from .cinchcast import CinchcastIE
-from .cliprs import ClipRsIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
+from .cliprs import ClipRsIE
from .clipsyndicate import ClipsyndicateIE
from .closertotruth import CloserToTruthIE
from .cloudy import CloudyIE
@@ -168,6 +171,8 @@ from .crunchyroll import (
)
from .cspan import CSpanIE
from .ctsnews import CtsNewsIE
+from .ctv import CTVIE
+from .ctvnews import CTVNewsIE
from .cultureunplugged import CultureUnpluggedIE
from .cwtv import CWTVIE
from .dailymail import DailyMailIE
@@ -251,6 +256,7 @@ from .fivemin import FiveMinIE
from .fivetv import FiveTVIE
from .fktv import FKTVIE
from .flickr import FlickrIE
+from .flipagram import FlipagramIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
from .formula1 import Formula1IE
@@ -276,6 +282,7 @@ from .freespeech import FreespeechIE
from .freevideo import FreeVideoIE
from .funimation import FunimationIE
from .funnyordie import FunnyOrDieIE
+from .fusion import FusionIE
from .gameinformer import GameInformerIE
from .gamekings import GamekingsIE
from .gameone import (
@@ -285,7 +292,6 @@ from .gameone import (
from .gamersyde import GamersydeIE
from .gamespot import GameSpotIE
from .gamestar import GameStarIE
-from .gametrailers import GametrailersIE
from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
@@ -321,6 +327,10 @@ from .hotnewhiphop import HotNewHipHopIE
from .hotstar import HotStarIE
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
+from .hrti import (
+ HRTiIE,
+ HRTiPlaylistIE,
+)
from .huffpost import HuffPostIE
from .hypem import HypemIE
from .iconosquare import IconosquareIE
@@ -359,6 +369,7 @@ from .jove import JoveIE
from .jwplatform import JWPlatformIE
from .jpopsukitv import JpopsukiIE
from .kaltura import KalturaIE
+from .kamcord import KamcordIE
from .kanalplay import KanalPlayIE
from .kankan import KankanIE
from .karaoketv import KaraoketvIE
@@ -423,6 +434,7 @@ from .makerschannel import MakersChannelIE
from .makertv import MakerTVIE
from .matchtv import MatchTVIE
from .mdr import MDRIE
+from .meta import METAIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mgoon import MgoonIE
@@ -455,6 +467,7 @@ from .motherless import MotherlessIE
from .motorsport import MotorsportIE
from .movieclips import MovieClipsIE
from .moviezine import MoviezineIE
+from .msn import MSNIE
from .mtv import (
MTVIE,
MTVServicesEmbeddedIE,
@@ -481,7 +494,6 @@ from .nbc import (
NBCNewsIE,
NBCSportsIE,
NBCSportsVPlayerIE,
- MSNBCIE,
)
from .ndr import (
NDRIE,
@@ -523,6 +535,7 @@ from .nick import (
NickDeIE,
)
from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .ninecninemedia import NineCNineMediaIE
from .ninegag import NineGagIE
from .noco import NocoIE
from .normalboots import NormalbootsIE
@@ -570,6 +583,10 @@ from .nytimes import (
from .nuvid import NuvidIE
from .odnoklassniki import OdnoklassnikiIE
from .oktoberfesttv import OktoberfestTVIE
+from .onet import (
+ OnetIE,
+ OnetChannelIE,
+)
from .onionstudios import OnionStudiosIE
from .ooyala import (
OoyalaIE,
@@ -608,6 +625,7 @@ from .pluralsight import (
PluralsightCourseIE,
)
from .podomatic import PodomaticIE
+from .polskieradio import PolskieRadioIE
from .porn91 import Porn91IE
from .pornhd import PornHdIE
from .pornhub import (
@@ -662,6 +680,7 @@ from .rice import RICEIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .rockstargames import RockstarGamesIE
+from .roosterteeth import RoosterTeethIE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtbf import RTBFIE
@@ -706,10 +725,12 @@ from .shahid import ShahidIE
from .shared import SharedIE
from .sharesix import ShareSixIE
from .sina import SinaIE
+from .sixplay import SixPlayIE
from .skynewsarabia import (
SkyNewsArabiaIE,
SkyNewsArabiaArticleIE,
)
+from .skysports import SkySportsIE
from .slideshare import SlideshareIE
from .slutload import SlutloadIE
from .smotri import (
@@ -891,6 +912,7 @@ from .udn import UDNEmbedIE
from .digiteka import DigitekaIE
from .unistra import UnistraIE
from .urort import UrortIE
+from .urplay import URPlayIE
from .usatoday import USATodayIE
from .ustream import UstreamIE, UstreamChannelIE
from .ustudio import (
@@ -917,6 +939,7 @@ from .vice import (
ViceIE,
ViceShowIE,
)
+from .vidbit import VidbitIE
from .viddler import ViddlerIE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
@@ -1050,6 +1073,7 @@ from .youtube import (
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSearchURLIE,
+ YoutubeSharedVideoIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index f5bbd39d2..cdb093262 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -129,6 +129,21 @@ class FacebookIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
+ if mobj is not None:
+ return mobj.group('url')
+
+ # Facebook API embed
+ # see https://developers.facebook.com/docs/plugins/embedded-video-player
+ mobj = re.search(r'''(?x)<div[^>]+
+ class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
+ data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage)
+ if mobj is not None:
+ return mobj.group('url')
+
def _login(self):
(useremail, password) = self._get_login_info()
if useremail is None:
@@ -204,12 +219,25 @@ class FacebookIE(InfoExtractor):
BEFORE = '{swf.addParam(param[0], param[1]);});'
AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
- m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)
- if m:
- swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"')
+ PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER)
+
+ for m in re.findall(PATTERN, webpage):
+ swf_params = m.replace('\\\\', '\\').replace('\\"', '"')
data = dict(json.loads(swf_params))
params_raw = compat_urllib_parse_unquote(data['params'])
- video_data = json.loads(params_raw)['video_data']
+ video_data_candidate = json.loads(params_raw)['video_data']
+ for _, f in video_data_candidate.items():
+ if not f:
+ continue
+ if isinstance(f, dict):
+ f = [f]
+ if not isinstance(f, list):
+ continue
+ if f[0].get('video_id') == video_id:
+ video_data = video_data_candidate
+ break
+ if video_data:
+ break
def video_data_list2dict(video_data):
ret = {}
@@ -239,6 +267,8 @@ class FacebookIE(InfoExtractor):
formats = []
for format_id, f in video_data.items():
+ if f and isinstance(f, dict):
+ f = [f]
if not f or not isinstance(f, list):
continue
for quality in ('sd', 'hd'):
diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py
new file mode 100644
index 000000000..acb6133ff
--- /dev/null
+++ b/youtube_dl/extractor/flipagram.py
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ float_or_none,
+ try_get,
+ unified_timestamp,
+)
+
+
+class FlipagramIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://flipagram.com/f/nyvTSJMKId',
+ 'md5': '888dcf08b7ea671381f00fab74692755',
+ 'info_dict': {
+ 'id': 'nyvTSJMKId',
+ 'ext': 'mp4',
+ 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
+ 'description': 'md5:d55e32edc55261cae96a41fa85ff630e',
+ 'duration': 35.571,
+ 'timestamp': 1461244995,
+ 'upload_date': '20160421',
+ 'uploader': 'kitty juria',
+ 'uploader_id': 'sjuria101',
+ 'creator': 'kitty juria',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'comments': list,
+ 'formats': 'mincount:2',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_data = self._parse_json(
+ self._search_regex(
+ r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'),
+ video_id)
+
+ flipagram = video_data['flipagram']
+ video = flipagram['video']
+
+ json_ld = self._search_json_ld(webpage, video_id, default=False)
+ title = json_ld.get('title') or flipagram['captionText']
+ description = json_ld.get('description') or flipagram.get('captionText')
+
+ formats = [{
+ 'url': video['url'],
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'filesize': int_or_none(video_data.get('size')),
+ }]
+
+ preview_url = try_get(
+ flipagram, lambda x: x['music']['track']['previewUrl'], compat_str)
+ if preview_url:
+ formats.append({
+ 'url': preview_url,
+ 'ext': 'm4a',
+ 'vcodec': 'none',
+ })
+
+ self._sort_formats(formats)
+
+ counts = flipagram.get('counts', {})
+ user = flipagram.get('user', {})
+ video_data = flipagram.get('video', {})
+
+ thumbnails = [{
+ 'url': self._proto_relative_url(cover['url']),
+ 'width': int_or_none(cover.get('width')),
+ 'height': int_or_none(cover.get('height')),
+ 'filesize': int_or_none(cover.get('size')),
+ } for cover in flipagram.get('covers', []) if cover.get('url')]
+
+ # Note that this only retrieves comments that are initially loaded.
+ # For videos with large amounts of comments, most won't be retrieved.
+ comments = []
+ for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []):
+ text = comment.get('comment')
+ if not text or not isinstance(text, list):
+ continue
+ comments.append({
+ 'author': comment.get('user', {}).get('name'),
+ 'author_id': comment.get('user', {}).get('username'),
+ 'id': comment.get('id'),
+ 'text': text[0],
+ 'timestamp': unified_timestamp(comment.get('created')),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': float_or_none(flipagram.get('duration'), 1000),
+ 'thumbnails': thumbnails,
+ 'timestamp': unified_timestamp(flipagram.get('iso8601Created')),
+ 'uploader': user.get('name'),
+ 'uploader_id': user.get('username'),
+ 'creator': user.get('name'),
+ 'view_count': int_or_none(counts.get('plays')),
+ 'like_count': int_or_none(counts.get('likes')),
+ 'repost_count': int_or_none(counts.get('reflips')),
+ 'comment_count': int_or_none(counts.get('comments')),
+ 'comments': comments,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index ad94e31f3..7653975e3 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -14,7 +14,10 @@ from ..utils import (
parse_duration,
determine_ext,
)
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+ DailymotionIE,
+ DailymotionCloudIE,
+)
class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -188,6 +191,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # Dailymotion embed
+ 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html',
+ 'md5': 'ee7f1828f25a648addc90cb2687b1f12',
+ 'info_dict': {
+ 'id': 'x4iiko0',
+ 'ext': 'mp4',
+ 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
+ 'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
+ 'timestamp': 1467011958,
+ 'upload_date': '20160627',
+ 'uploader': 'France Inter',
+ 'uploader_id': 'x2q2ez',
+ },
+ 'add_ie': ['Dailymotion'],
}]
def _real_extract(self, url):
@@ -197,7 +215,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
if dmcloud_url:
- return self.url_result(dmcloud_url, 'DailymotionCloud')
+ return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key())
+
+ dailymotion_urls = DailymotionIE._extract_urls(webpage)
+ if dailymotion_urls:
+ return self.playlist_result([
+ self.url_result(dailymotion_url, DailymotionIE.ie_key())
+ for dailymotion_url in dailymotion_urls])
video_id, catalogue = self._search_regex(
(r'id-video=([^@]+@[^"]+)',
diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py
new file mode 100644
index 000000000..b4ab4cbb7
--- /dev/null
+++ b/youtube_dl/extractor/fusion.py
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class FusionIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/',
+ 'info_dict': {
+ 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P',
+ 'ext': 'mp4',
+ 'title': 'U.S. and Panamanian forces work together to stop a vessel smuggling drugs',
+ 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7',
+ 'duration': 140.0,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Ooyala'],
+ }, {
+ 'url': 'http://fusion.net/video/201781',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ ooyala_code = self._search_regex(
+ r'data-video-id=(["\'])(?P<code>.+?)\1',
+ webpage, 'ooyala code', group='code')
+
+ return OoyalaIE._build_url_result(ooyala_code)
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py
deleted file mode 100644
index 1e7948ab8..000000000
--- a/youtube_dl/extractor/gametrailers.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- parse_age_limit,
- url_basename,
-)
-
-
-class GametrailersIE(InfoExtractor):
- _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)'
-
- _TEST = {
- 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review',
- 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a',
- 'info_dict': {
- 'id': '2983958',
- 'ext': 'mp4',
- 'display_id': '116437-Just-Cause-3-Review',
- 'title': 'Just Cause 3 - Review',
- 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?',
- },
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- title = self._html_search_regex(
- r'<title>(.+?)\|', webpage, 'title').strip()
- embed_url = self._proto_relative_url(
- self._search_regex(
- r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage,
- 'embed url'),
- scheme='http:')
- video_id = url_basename(embed_url)
- embed_page = self._download_webpage(embed_url, video_id)
- embed_vars_json = self._search_regex(
- r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page,
- 'embed vars')
- info = self._parse_json(embed_vars_json, video_id)
-
- formats = []
- for media in info['media']:
- if media['mediaPurpose'] == 'play':
- formats.append({
- 'url': media['uri'],
- 'height': media['height'],
- 'width:': media['width'],
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': info.get('thumbUri'),
- 'description': self._og_search_description(webpage),
- 'duration': int_or_none(info.get('videoLengthInSeconds')),
- 'age_limit': parse_age_limit(info.get('audienceRating')),
- }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 4aa24061c..cddd1a817 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -49,7 +49,10 @@ from .pornhub import PornHubIE
from .xhamster import XHamsterEmbedIE
from .tnaflix import TNAFlixNetworkEmbedIE
from .vimeo import VimeoIE
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+ DailymotionIE,
+ DailymotionCloudIE,
+)
from .onionstudios import OnionStudiosIE
from .viewlift import ViewLiftEmbedIE
from .screenwavemedia import ScreenwaveMediaIE
@@ -64,6 +67,9 @@ from .liveleak import LiveLeakIE
from .threeqsdn import ThreeQSDNIE
from .theplatform import ThePlatformIE
from .vessel import VesselIE
+from .kaltura import KalturaIE
+from .eagleplatform import EaglePlatformIE
+from .facebook import FacebookIE
class GenericIE(InfoExtractor):
@@ -920,6 +926,24 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Kaltura'],
},
+ {
+ # Kaltura embedded via quoted entry_id
+ 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures',
+ 'info_dict': {
+ 'id': '0_utuok90b',
+ 'ext': 'mp4',
+ 'title': '06_matthew_brender_raj_dutt',
+ 'timestamp': 1466638791,
+ 'upload_date': '20160622',
+ },
+ 'add_ie': ['Kaltura'],
+ 'expected_warnings': [
+ 'Could not send HEAD request'
+ ],
+ 'params': {
+ 'skip_download': True,
+ }
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -1091,12 +1115,17 @@ class GenericIE(InfoExtractor):
# Dailymotion Cloud video
{
'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
- 'md5': '49444254273501a64675a7e68c502681',
+ 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',
'info_dict': {
- 'id': '5585de919473990de4bee11b',
+ 'id': 'x2uy8t3',
'ext': 'mp4',
- 'title': 'Le débat',
+ 'title': 'Sauvons les abeilles ! - Le débat',
+ 'description': 'md5:d9082128b1c5277987825d684939ca26',
'thumbnail': 're:^https?://.*\.jpe?g$',
+ 'timestamp': 1434970506,
+ 'upload_date': '20150622',
+ 'uploader': 'Public Sénat',
+ 'uploader_id': 'xa9gza',
}
},
# OnionStudios embed
@@ -1220,6 +1249,102 @@ class GenericIE(InfoExtractor):
'uploader': 'www.hudl.com',
},
},
+ # twitter:player embed
+ {
+ 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/',
+ 'md5': 'a3e0df96369831de324f0778e126653c',
+ 'info_dict': {
+ 'id': '4909620399001',
+ 'ext': 'mp4',
+ 'title': 'What Do Black Holes Sound Like?',
+ 'description': 'what do black holes sound like',
+ 'upload_date': '20160524',
+ 'uploader_id': '29913724001',
+ 'timestamp': 1464107587,
+ 'uploader': 'TheAtlantic',
+ },
+ 'add_ie': ['BrightcoveLegacy'],
+ },
+ # Facebook <iframe> embed
+ {
+ 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
+ 'md5': 'fbcde74f534176ecb015849146dd3aee',
+ 'info_dict': {
+ 'id': '599637780109885',
+ 'ext': 'mp4',
+ 'title': 'Facebook video #599637780109885',
+ },
+ },
+ # Facebook API embed
+ {
+ 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
+ 'md5': 'a47372ee61b39a7b90287094d447d94e',
+ 'info_dict': {
+ 'id': '10153467542406923',
+ 'ext': 'mp4',
+ 'title': 'Facebook video #10153467542406923',
+ },
+ },
+ # Wordpress "YouTube Video Importer" plugin
+ {
+ 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/',
+ 'md5': 'd16797741b560b485194eddda8121b48',
+ 'info_dict': {
+ 'id': 'HNTXWDXV9Is',
+ 'ext': 'mp4',
+ 'title': 'Blue Devils Drumline Stanford lot 2016',
+ 'upload_date': '20160627',
+ 'uploader_id': 'GENOCIDE8GENERAL10',
+ 'uploader': 'cylus cyrus',
+ },
+ },
+ {
+ # video stored on custom kaltura server
+ 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv',
+ 'md5': '537617d06e64dfed891fa1593c4b30cc',
+ 'info_dict': {
+ 'id': '0_1iotm5bh',
+ 'ext': 'mp4',
+ 'title': 'Elecciones británicas: 5 lecciones para Rajoy',
+ 'description': 'md5:435a89d68b9760b92ce67ed227055f16',
+ 'uploader_id': 'videos.expansion@el-mundo.net',
+ 'upload_date': '20150429',
+ 'timestamp': 1430303472,
+ },
+ 'add_ie': ['Kaltura'],
+ },
+ {
+ # Non-standard Vimeo embed
+ 'url': 'https://openclassrooms.com/courses/understanding-the-web',
+ 'md5': '64d86f1c7d369afd9a78b38cbb88d80a',
+ 'info_dict': {
+ 'id': '148867247',
+ 'ext': 'mp4',
+ 'title': 'Understanding the web - Teaser',
+ 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.',
+ 'upload_date': '20151214',
+ 'uploader': 'OpenClassrooms',
+ 'uploader_id': 'openclassrooms',
+ },
+ 'add_ie': ['Vimeo'],
+ },
+ # {
+ # # TODO: find another test
+ # # http://schema.org/VideoObject
+ # 'url': 'https://flipagram.com/f/nyvTSJMKId',
+ # 'md5': '888dcf08b7ea671381f00fab74692755',
+ # 'info_dict': {
+ # 'id': 'nyvTSJMKId',
+ # 'ext': 'mp4',
+ # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
+ # 'description': '#love for cats.',
+ # 'timestamp': 1461244995,
+ # 'upload_date': '20160421',
+ # },
+ # 'params': {
+ # 'force_generic_extractor': True,
+ # },
+ # }
]
def report_following_redirect(self, new_url):
@@ -1576,12 +1701,16 @@ class GenericIE(InfoExtractor):
if matches:
return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
- # Look for embedded Dailymotion player
- matches = re.findall(
- r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+ # Look for Wordpress "YouTube Video Importer" plugin
+ matches = re.findall(r'''(?x)<div[^>]+
+ class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+ data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
if matches:
- return _playlist_from_matches(
- matches, lambda m: unescapeHTML(m[1]))
+ return _playlist_from_matches(matches, lambda m: m[-1])
+
+ matches = DailymotionIE._extract_urls(webpage)
+ if matches:
+ return _playlist_from_matches(matches)
# Look for embedded Dailymotion playlist player (#3822)
m = re.search(
@@ -1718,10 +1847,9 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
# Look for embedded Facebook player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'Facebook')
+ facebook_url = FacebookIE._extract_url(webpage)
+ if facebook_url is not None:
+ return self.url_result(facebook_url, 'Facebook')
# Look for embedded VK player
mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@@ -1903,18 +2031,14 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or
- re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
- if mobj is not None:
- return self.url_result(smuggle_url(
- 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(),
- {'source_url': url}), 'Kaltura')
+ kaltura_url = KalturaIE._extract_url(webpage)
+ if kaltura_url:
+ return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
# Look for Eagle.Platform embeds
- mobj = re.search(
- r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'EaglePlatform')
+ eagleplatform_url = EaglePlatformIE._extract_url(webpage)
+ if eagleplatform_url:
+ return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
# Look for ClipYou (uses Eagle.Platform) embeds
mobj = re.search(
@@ -2060,6 +2184,24 @@ class GenericIE(InfoExtractor):
'uploader': video_uploader,
}
+ # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser
+ embed_url = self._html_search_meta('twitter:player', webpage, default=None)
+ if embed_url:
+ return self.url_result(embed_url)
+
+ # Looking for http://schema.org/VideoObject
+ json_ld = self._search_json_ld(
+ webpage, video_id, default=None, expected_type='VideoObject')
+ if json_ld and json_ld.get('url'):
+ info_dict.update({
+ 'title': video_title or info_dict['title'],
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'age_limit': age_limit
+ })
+ info_dict.update(json_ld)
+ return info_dict
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py
new file mode 100644
index 000000000..656ce6d05
--- /dev/null
+++ b/youtube_dl/extractor/hrti.py
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ int_or_none,
+ parse_age_limit,
+ sanitized_Request,
+ try_get,
+)
+
+
+class HRTiBaseIE(InfoExtractor):
+ """
+ Base Information Extractor for Croatian Radiotelevision
+ video on demand site https://hrti.hrt.hr
+ Reverse engineered from the JavaScript app in app.min.js
+ """
+ _NETRC_MACHINE = 'hrti'
+
+ _APP_LANGUAGE = 'hr'
+ _APP_VERSION = '1.1'
+ _APP_PUBLICATION_ID = 'all_in_one'
+ _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'
+
+ def _initialize_api(self):
+ init_data = {
+ 'application_publication_id': self._APP_PUBLICATION_ID
+ }
+
+ uuid = self._download_json(
+ self._API_URL, None, note='Downloading uuid',
+ errnote='Unable to download uuid',
+ data=json.dumps(init_data).encode('utf-8'))['uuid']
+
+ app_data = {
+ 'uuid': uuid,
+ 'application_publication_id': self._APP_PUBLICATION_ID,
+ 'application_version': self._APP_VERSION
+ }
+
+ req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
+ req.get_method = lambda: 'PUT'
+
+ resources = self._download_json(
+ req, None, note='Downloading session information',
+ errnote='Unable to download session information')
+
+ self._session_id = resources['session_id']
+
+ modules = resources['modules']
+
+ self._search_url = modules['vod_catalog']['resources']['search']['uri'].format(
+ language=self._APP_LANGUAGE,
+ application_id=self._APP_PUBLICATION_ID)
+
+ self._login_url = (modules['user']['resources']['login']['uri'] +
+ '/format/json').format(session_id=self._session_id)
+
+ self._logout_url = modules['user']['resources']['logout']['uri']
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ # TODO: figure out authentication with cookies
+ if username is None or password is None:
+ self.raise_login_required()
+
+ auth_data = {
+ 'username': username,
+ 'password': password,
+ }
+
+ try:
+ auth_info = self._download_json(
+ self._login_url, None, note='Logging in', errnote='Unable to log in',
+ data=json.dumps(auth_data).encode('utf-8'))
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406:
+ auth_info = self._parse_json(e.cause.read().encode('utf-8'), None)
+ else:
+ raise
+
+ error_message = auth_info.get('error', {}).get('message')
+ if error_message:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, error_message),
+ expected=True)
+
+ self._token = auth_info['secure_streaming_token']
+
+ def _real_initialize(self):
+ self._initialize_api()
+ self._login()
+
+
+class HRTiIE(HRTiBaseIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ hrti:(?P<short_id>[0-9]+)|
+ https?://
+ hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?
+ )
+ '''
+ _TESTS = [{
+ 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd',
+ 'info_dict': {
+ 'id': '2181385',
+ 'display_id': 'republika-dokumentarna-serija-16-hd',
+ 'ext': 'mp4',
+ 'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)',
+ 'description': 'md5:48af85f620e8e0e1df4096270568544f',
+ 'duration': 2922,
+ 'view_count': int,
+ 'average_rating': int,
+ 'episode_number': int,
+ 'season_number': int,
+ 'age_limit': 12,
+ },
+ 'skip': 'Requires account credentials',
+ }, {
+ 'url': 'https://hrti.hrt.hr/#/video/show/2181385/',
+ 'only_matching': True,
+ }, {
+ 'url': 'hrti:2181385',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('short_id') or mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ video = self._download_json(
+ '%s/video_id/%s/format/json' % (self._search_url, video_id),
+ display_id, 'Downloading video metadata JSON')['video'][0]
+
+ title_info = video['title']
+ title = title_info['title_long']
+
+ movie = video['video_assets']['movie'][0]
+ m3u8_url = movie['url'].format(TOKEN=self._token)
+ formats = self._extract_m3u8_formats(
+ m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ self._sort_formats(formats)
+
+ description = clean_html(title_info.get('summary_long'))
+ age_limit = parse_age_limit(video.get('parental_control', {}).get('rating'))
+ view_count = int_or_none(video.get('views'))
+ average_rating = int_or_none(video.get('user_rating'))
+ duration = int_or_none(movie.get('duration'))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'average_rating': average_rating,
+ 'age_limit': age_limit,
+ 'formats': formats,
+ }
+
+
+class HRTiPlaylistIE(HRTiBaseIE):
+ _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?'
+ _TESTS = [{
+ 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena',
+ 'info_dict': {
+ 'id': '212',
+ 'title': 'ekumena',
+ },
+ 'playlist_mincount': 8,
+ 'skip': 'Requires account credentials',
+ }, {
+ 'url': 'https://hrti.hrt.hr/#/video/list/category/212/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ category_id = mobj.group('id')
+ display_id = mobj.group('display_id') or category_id
+
+ response = self._download_json(
+ '%s/category_id/%s/format/json' % (self._search_url, category_id),
+ display_id, 'Downloading video metadata JSON')
+
+ video_ids = try_get(
+ response, lambda x: x['video_listings'][0]['alternatives'][0]['list'],
+ list) or [video['id'] for video in response.get('videos', []) if video.get('id')]
+
+ entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids]
+
+ return self.playlist_result(entries, category_id, display_id)
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index ddcb3c916..01c7b3042 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -3,28 +3,22 @@ from __future__ import unicode_literals
import hashlib
import itertools
-import math
-import os
-import random
import re
import time
-import uuid
from .common import InfoExtractor
from ..compat import (
- compat_parse_qs,
compat_str,
compat_urllib_parse_urlencode,
- compat_urllib_parse_urlparse,
)
from ..utils import (
+ clean_html,
decode_packed_codes,
+ get_element_by_id,
+ get_element_by_attribute,
ExtractorError,
ohdave_rsa_encrypt,
remove_start,
- sanitized_Request,
- urlencode_postdata,
- url_basename,
)
@@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
- 'md5': '2cb594dc2781e6c941a110d8f358118b',
+ # MD5 checksum differs on my machine and Travis CI
'info_dict': {
'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+ 'ext': 'mp4',
'title': '美国德州空中惊现奇异云团 酷似UFO',
- 'ext': 'f4v',
}
}, {
'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+ 'md5': '667171934041350c5de3f5015f7f1152',
'info_dict': {
'id': 'e3f585b550a280af23c98b6cb2be19fb',
- 'title': '名侦探柯南第752集',
- },
- 'playlist': [{
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }, {
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }, {
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }, {
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }, {
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }, {
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }, {
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }, {
- 'info_dict': {
- 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
- 'ext': 'f4v',
- 'title': '名侦探柯南第752集',
- },
- }],
- 'params': {
- 'skip_download': True,
+ 'ext': 'mp4',
+ 'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇',
},
+ 'skip': 'Geo-restricted to China',
}, {
'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
'only_matching': True,
@@ -250,22 +195,10 @@ class IqiyiIE(InfoExtractor):
'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
'info_dict': {
'id': 'f3cf468b39dddb30d676f89a91200dc1',
+ 'ext': 'mp4',
'title': '泰坦尼克号',
},
- 'playlist': [{
- 'info_dict': {
- 'id': 'f3cf468b39dddb30d676f89a91200dc1_part1',
- 'ext': 'f4v',
- 'title': '泰坦尼克号',
- },
- }, {
- 'info_dict': {
- 'id': 'f3cf468b39dddb30d676f89a91200dc1_part2',
- 'ext': 'f4v',
- 'title': '泰坦尼克号',
- },
- }],
- 'expected_warnings': ['Needs a VIP account for full video'],
+ 'skip': 'Geo-restricted to China',
}, {
'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
'info_dict': {
@@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor):
'only_matching': True,
}]
- _FORMATS_MAP = [
- ('1', 'h6'),
- ('2', 'h5'),
- ('3', 'h4'),
- ('4', 'h3'),
- ('5', 'h2'),
- ('10', 'h1'),
- ]
-
- AUTH_API_ERRORS = {
- # No preview available (不允许试看鉴权失败)
- 'Q00505': 'This video requires a VIP account',
- # End of preview time (试看结束鉴权失败)
- 'Q00506': 'Needs a VIP account for full video',
+ _FORMATS_MAP = {
+ '96': 1, # 216p, 240p
+ '1': 2, # 336p, 360p
+ '2': 3, # 480p, 504p
+ '21': 4, # 504p
+ '4': 5, # 720p
+ '17': 5, # 720p
+ '5': 6, # 1072p, 1080p
+ '18': 7, # 1080p
}
def _real_initialize(self):
@@ -352,177 +280,23 @@ class IqiyiIE(InfoExtractor):
return True
- def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
- auth_params = {
- # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as
- 'version': '2.0',
- 'platform': 'b6c13e26323c537d',
- 'aid': tvid,
- 'tvid': tvid,
- 'uid': '',
- 'deviceId': _uuid,
- 'playType': 'main', # XXX: always main?
- 'filename': os.path.splitext(url_basename(api_video_url))[0],
- }
+ def get_raw_data(self, tvid, video_id):
+ tm = int(time.time() * 1000)
- qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query)
- for key, val in qd_items.items():
- auth_params[key] = val[0]
-
- auth_req = sanitized_Request(
- 'http://api.vip.iqiyi.com/services/ckn.action',
- urlencode_postdata(auth_params))
- # iQiyi server throws HTTP 405 error without the following header
- auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- auth_result = self._download_json(
- auth_req, video_id,
- note='Downloading video authentication JSON',
- errnote='Unable to download video authentication JSON')
-
- code = auth_result.get('code')
- msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code
- if code == 'Q00506':
- if do_report_warning:
- self.report_warning(msg)
- return False
- if 'data' not in auth_result:
- if msg is not None:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True)
- raise ExtractorError('Unexpected error from Iqiyi auth API')
-
- return auth_result['data']
-
- def construct_video_urls(self, data, video_id, _uuid, tvid):
- def do_xor(x, y):
- a = y % 3
- if a == 1:
- return x ^ 121
- if a == 2:
- return x ^ 72
- return x ^ 103
-
- def get_encode_code(l):
- a = 0
- b = l.split('-')
- c = len(b)
- s = ''
- for i in range(c - 1, -1, -1):
- a = do_xor(int(b[c - i - 1], 16), i)
- s += chr(a)
- return s[::-1]
-
- def get_path_key(x, format_id, segment_index):
- mg = ')(*&^flash@#$%a'
- tm = self._download_json(
- 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
- note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
- )['t']
- t = str(int(math.floor(int(tm) / (600.0))))
- return md5_text(t + mg + x)
-
- video_urls_dict = {}
- need_vip_warning_report = True
- for format_item in data['vp']['tkl'][0]['vs']:
- if 0 < int(format_item['bid']) <= 10:
- format_id = self.get_format(format_item['bid'])
- else:
- continue
-
- video_urls = []
-
- video_urls_info = format_item['fs']
- if not format_item['fs'][0]['l'].startswith('/'):
- t = get_encode_code(format_item['fs'][0]['l'])
- if t.endswith('mp4'):
- video_urls_info = format_item['flvs']
-
- for segment_index, segment in enumerate(video_urls_info):
- vl = segment['l']
- if not vl.startswith('/'):
- vl = get_encode_code(vl)
- is_vip_video = '/vip/' in vl
- filesize = segment['b']
- base_url = data['vp']['du'].split('/')
- if not is_vip_video:
- key = get_path_key(
- vl.split('/')[-1].split('.')[0], format_id, segment_index)
- base_url.insert(-1, key)
- base_url = '/'.join(base_url)
- param = {
- 'su': _uuid,
- 'qyid': uuid.uuid4().hex,
- 'client': '',
- 'z': '',
- 'bt': '',
- 'ct': '',
- 'tn': str(int(time.time()))
- }
- api_video_url = base_url + vl
- if is_vip_video:
- api_video_url = api_video_url.replace('.f4v', '.hml')
- auth_result = self._authenticate_vip_video(
- api_video_url, video_id, tvid, _uuid, need_vip_warning_report)
- if auth_result is False:
- need_vip_warning_report = False
- break
- param.update({
- 't': auth_result['t'],
- # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
- 'cid': 'afbe8fd3d73448c9',
- 'vid': video_id,
- 'QY00001': auth_result['u'],
- })
- api_video_url += '?' if '?' not in api_video_url else '&'
- api_video_url += compat_urllib_parse_urlencode(param)
- js = self._download_json(
- api_video_url, video_id,
- note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
- video_url = js['l']
- video_urls.append(
- (video_url, filesize))
-
- video_urls_dict[format_id] = video_urls
- return video_urls_dict
-
- def get_format(self, bid):
- matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
- return matched_format_ids[0] if len(matched_format_ids) else None
-
- def get_bid(self, format_id):
- matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
- return matched_bids[0] if len(matched_bids) else None
-
- def get_raw_data(self, tvid, video_id, enc_key, _uuid):
- tm = str(int(time.time()))
- tail = tm + tvid
- param = {
- 'key': 'fvip',
- 'src': md5_text('youtube-dl'),
- 'tvId': tvid,
+ key = 'd5fb4bd9d50c4be6948c97edd7254b0e'
+ sc = md5_text(compat_str(tm) + key + tvid)
+ params = {
+ 'tvid': tvid,
'vid': video_id,
- 'vinfo': 1,
- 'tm': tm,
- 'enc': md5_text(enc_key + tail),
- 'qyid': _uuid,
- 'tn': random.random(),
- # In iQiyi's flash player, um is set to 1 if there's a logged user
- # Some 1080P formats are only available with a logged user.
- # Here force um=1 to trick the iQiyi server
- 'um': 1,
- 'authkey': md5_text(md5_text('') + tail),
- 'k_tag': 1,
+ 'src': '76f90cbd92f94a2e925d83e8ccd22cb7',
+ 'sc': sc,
+ 't': tm,
}
- api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
- compat_urllib_parse_urlencode(param)
- raw_data = self._download_json(api_url, video_id)
- return raw_data
-
- def get_enc_key(self, video_id):
- # TODO: automatic key extraction
- # last update at 2016-01-22 for Zombie::bite
- enc_key = '4a1caba4b4465345366f28da7c117d20'
- return enc_key
+ return self._download_json(
+ 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id),
+ video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='),
+ query=params, headers=self.geo_verification_headers())
def _extract_playlist(self, webpage):
PAGE_SIZE = 50
@@ -571,58 +345,41 @@ class IqiyiIE(InfoExtractor):
r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
video_id = self._search_regex(
r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
- _uuid = uuid.uuid4().hex
-
- enc_key = self.get_enc_key(video_id)
-
- raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
-
- if raw_data['code'] != 'A000000':
- raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
-
- data = raw_data['data']
-
- title = data['vi']['vn']
-
- # generate video_urls_dict
- video_urls_dict = self.construct_video_urls(
- data, video_id, _uuid, tvid)
-
- # construct info
- entries = []
- for format_id in video_urls_dict:
- video_urls = video_urls_dict[format_id]
- for i, video_url_info in enumerate(video_urls):
- if len(entries) < i + 1:
- entries.append({'formats': []})
- entries[i]['formats'].append(
- {
- 'url': video_url_info[0],
- 'filesize': video_url_info[-1],
- 'format_id': format_id,
- 'preference': int(self.get_bid(format_id))
- }
- )
-
- for i in range(len(entries)):
- self._sort_formats(entries[i]['formats'])
- entries[i].update(
- {
- 'id': '%s_part%d' % (video_id, i + 1),
- 'title': title,
- }
- )
-
- if len(entries) > 1:
- info = {
- '_type': 'multi_video',
- 'id': video_id,
- 'title': title,
- 'entries': entries,
- }
- else:
- info = entries[0]
- info['id'] = video_id
- info['title'] = title
-
- return info
+
+ formats = []
+ for _ in range(5):
+ raw_data = self.get_raw_data(tvid, video_id)
+
+ if raw_data['code'] != 'A00000':
+ if raw_data['code'] == 'A00111':
+ self.raise_geo_restricted()
+ raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+ data = raw_data['data']
+
+ for stream in data['vidl']:
+ if 'm3utx' not in stream:
+ continue
+ vd = compat_str(stream['vd'])
+ formats.append({
+ 'url': stream['m3utx'],
+ 'format_id': vd,
+ 'ext': 'mp4',
+ 'preference': self._FORMATS_MAP.get(vd, -1),
+ 'protocol': 'm3u8_native',
+ })
+
+ if formats:
+ break
+
+ self._sleep(5, video_id)
+
+ self._sort_formats(formats)
+ title = (get_element_by_id('widget-videotitle', webpage) or
+ clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index a65697ff5..1729f5bfb 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -6,7 +6,6 @@ import base64
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse_urlencode,
compat_urlparse,
compat_parse_qs,
)
@@ -15,6 +14,7 @@ from ..utils import (
ExtractorError,
int_or_none,
unsmuggle_url,
+ smuggle_url,
)
@@ -34,7 +34,8 @@ class KalturaIE(InfoExtractor):
)(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
)
'''
- _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
+ _SERVICE_URL = 'http://cdnapi.kaltura.com'
+ _SERVICE_BASE = '/api_v3/index.php'
_TESTS = [
{
'url': 'kaltura:269692:1_1jc2y3e4',
@@ -64,16 +65,50 @@ class KalturaIE(InfoExtractor):
}
]
- def _kaltura_api_call(self, video_id, actions, *args, **kwargs):
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = (
+ re.search(
+ r"""(?xs)
+ kWidget\.(?:thumb)?[Ee]mbed\(
+ \{.*?
+ (?P<q1>['\"])wid(?P=q1)\s*:\s*
+ (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?
+ (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*
+ (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),
+ """, webpage) or
+ re.search(
+ r'''(?xs)
+ (?P<q1>["\'])
+ (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?
+ (?P=q1).*?
+ (?:
+ entry_?[Ii]d|
+ (?P<q2>["\'])entry_?[Ii]d(?P=q2)
+ )\s*:\s*
+ (?P<q3>["\'])(?P<id>.+?)(?P=q3)
+ ''', webpage))
+ if mobj:
+ embed_info = mobj.groupdict()
+ url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
+ escaped_pid = re.escape(embed_info['partner_id'])
+ service_url = re.search(
+ r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
+ webpage)
+ if service_url:
+ url = smuggle_url(url, {'service_url': service_url.group(1)})
+ return url
+
+ def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):
params = actions[0]
if len(actions) > 1:
for i, a in enumerate(actions[1:], start=1):
for k, v in a.items():
params['%d:%s' % (i, k)] = v
- query = compat_urllib_parse_urlencode(params)
- url = self._API_BASE + query
- data = self._download_json(url, video_id, *args, **kwargs)
+ data = self._download_json(
+ (service_url or self._SERVICE_URL) + self._SERVICE_BASE,
+ video_id, query=params, *args, **kwargs)
status = data if len(actions) == 1 else data[0]
if status.get('objectType') == 'KalturaAPIException':
@@ -82,7 +117,7 @@ class KalturaIE(InfoExtractor):
return data
- def _get_kaltura_signature(self, video_id, partner_id):
+ def _get_kaltura_signature(self, video_id, partner_id, service_url=None):
actions = [{
'apiVersion': '3.1',
'expiry': 86400,
@@ -92,10 +127,10 @@ class KalturaIE(InfoExtractor):
'widgetId': '_%s' % partner_id,
}]
return self._kaltura_api_call(
- video_id, actions, note='Downloading Kaltura signature')['ks']
+ video_id, actions, service_url, note='Downloading Kaltura signature')['ks']
- def _get_video_info(self, video_id, partner_id):
- signature = self._get_kaltura_signature(video_id, partner_id)
+ def _get_video_info(self, video_id, partner_id, service_url=None):
+ signature = self._get_kaltura_signature(video_id, partner_id, service_url)
actions = [
{
'action': 'null',
@@ -118,7 +153,7 @@ class KalturaIE(InfoExtractor):
},
]
return self._kaltura_api_call(
- video_id, actions, note='Downloading video info JSON')
+ video_id, actions, service_url, note='Downloading video info JSON')
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -127,7 +162,7 @@ class KalturaIE(InfoExtractor):
partner_id, entry_id = mobj.group('partner_id', 'id')
ks = None
if partner_id and entry_id:
- info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ info, flavor_assets = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url'))
else:
path, query = mobj.group('path', 'query')
if not path and not query:
@@ -175,12 +210,17 @@ class KalturaIE(InfoExtractor):
unsigned_url += '?referrer=%s' % referrer
return unsigned_url
+ data_url = info['dataUrl']
+ if '/flvclipper/' in data_url:
+ data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
+
formats = []
for f in flavor_assets:
# Continue if asset is not ready
if f['status'] != 2:
continue
- video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
+ video_url = sign_url(
+ '%s/flavorId/%s' % (data_url, f['id']))
formats.append({
'format_id': '%(fileExt)s-%(bitrate)s' % f,
'ext': f.get('fileExt'),
@@ -193,9 +233,12 @@ class KalturaIE(InfoExtractor):
'width': int_or_none(f.get('width')),
'url': video_url,
})
- m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ if '/playManifest/' in data_url:
+ m3u8_url = sign_url(data_url.replace(
+ 'format/url', 'format/applehttp'))
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, entry_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
self._check_formats(formats, entry_id)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py
new file mode 100644
index 000000000..b50120d98
--- /dev/null
+++ b/youtube_dl/extractor/kamcord.py
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ qualities,
+)
+
+
+class KamcordIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://www.kamcord.com/v/hNYRduDgWb4',
+ 'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c',
+ 'info_dict': {
+ 'id': 'hNYRduDgWb4',
+ 'ext': 'mp4',
+ 'title': 'Drinking Madness',
+ 'uploader': 'jacksfilms',
+ 'uploader_id': '3044562',
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video = self._parse_json(
+ self._search_regex(
+ r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)',
+ webpage, 'video'),
+ video_id)['video']
+
+ title = video['title']
+
+ formats = self._extract_m3u8_formats(
+ video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native')
+ self._sort_formats(formats)
+
+ uploader = video.get('user', {}).get('username')
+ uploader_id = video.get('user', {}).get('id')
+
+ view_count = int_or_none(video.get('viewCount'))
+ like_count = int_or_none(video.get('heartCount'))
+ comment_count = int_or_none(video.get('messageCount'))
+
+ preference_key = qualities(('small', 'medium', 'large'))
+
+ thumbnails = [{
+ 'url': thumbnail_url,
+ 'id': thumbnail_id,
+ 'preference': preference_key(thumbnail_id),
+ } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items()
+ if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'comment_count': comment_count,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index 0221fb919..b1d460599 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -26,11 +26,6 @@ class KuwoBaseIE(InfoExtractor):
def _get_formats(self, song_id, tolerate_ip_deny=False):
formats = []
for file_format in self._FORMATS:
- headers = {}
- cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
- if cn_verification_proxy:
- headers['Ytdl-request-proxy'] = cn_verification_proxy
-
query = {
'format': file_format['ext'],
'br': file_format.get('br', ''),
@@ -42,7 +37,7 @@ class KuwoBaseIE(InfoExtractor):
song_url = self._download_webpage(
'http://antiserver.kuwo.cn/anti.s',
song_id, note='Download %s url info' % file_format['format'],
- query=query, headers=headers,
+ query=query, headers=self.geo_verification_headers(),
)
if song_url == 'IPDeny' and not tolerate_ip_deny:
diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py
index b08f6e3c9..da5a5de4a 100644
--- a/youtube_dl/extractor/la7.py
+++ b/youtube_dl/extractor/la7.py
@@ -1,60 +1,65 @@
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- parse_duration,
+ js_to_json,
+ smuggle_url,
)
class LA7IE(InfoExtractor):
- IE_NAME = 'la7.tv'
- _VALID_URL = r'''(?x)
- https?://(?:www\.)?la7\.tv/
- (?:
- richplayer/\?assetid=|
- \?contentId=
- )
- (?P<id>[0-9]+)'''
-
- _TEST = {
- 'url': 'http://www.la7.tv/richplayer/?assetid=50355319',
- 'md5': 'ec7d1f0224d20ba293ab56cf2259651f',
+ IE_NAME = 'la7.it'
+ _VALID_URL = r'''(?x)(https?://)?(?:
+ (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/|
+ tg\.la7\.it/repliche-tgla7\?id=
+ )(?P<id>.+)'''
+
+ _TESTS = [{
+ # 'src' is a plain URL
+ 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
+ 'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
'info_dict': {
- 'id': '50355319',
+ 'id': 'inccool8-02-10-2015-163722',
'ext': 'mp4',
- 'title': 'IL DIVO',
- 'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti e Flavio Bucci',
- 'duration': 6254,
+ 'title': 'Inc.Cool8',
+ 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
+ 'thumbnail': 're:^https?://.*',
+ 'uploader_id': 'kdla7pillole@iltrovatore.it',
+ 'timestamp': 1443814869,
+ 'upload_date': '20151002',
},
- 'skip': 'Blocked in the US',
- }
+ }, {
+ # 'src' is a dictionary
+ 'url': 'http://tg.la7.it/repliche-tgla7?id=189080',
+ 'md5': '6b0d8888d286e39870208dfeceaf456b',
+ 'info_dict': {
+ 'id': '189080',
+ 'ext': 'mp4',
+ 'title': 'TG LA7',
+ },
+ }, {
+ 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id
- doc = self._download_xml(xml_url, video_id)
-
- video_title = doc.find('title').text
- description = doc.find('description').text
- duration = parse_duration(doc.find('duration').text)
- thumbnail = doc.find('img').text
- view_count = int(doc.find('views').text)
- prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:')
+ webpage = self._download_webpage(url, video_id)
- formats = [{
- 'format': vnode.find('quality').text,
- 'tbr': int(vnode.find('quality').text),
- 'url': vnode.find('fms').text.strip().replace('mp4:', prefix),
- } for vnode in doc.findall('.//videos/video')]
- self._sort_formats(formats)
+ player_data = self._parse_json(
+ self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'),
+ video_id, transform_source=js_to_json)
return {
+ '_type': 'url_transparent',
+ 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], {
+ 'service_url': 'http://kdam.iltrovatore.it',
+ }),
'id': video_id,
- 'title': video_title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- 'view_count': view_count,
+ 'title': player_data['title'],
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': player_data.get('poster'),
+ 'ie_key': 'Kaltura',
}
diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py
index 63f581cd9..e9cc9aa59 100644
--- a/youtube_dl/extractor/leeco.py
+++ b/youtube_dl/extractor/leeco.py
@@ -20,9 +20,10 @@ from ..utils import (
int_or_none,
orderedSet,
parse_iso8601,
- sanitized_Request,
str_or_none,
url_basename,
+ urshift,
+ update_url_query,
)
@@ -74,15 +75,11 @@ class LeIE(InfoExtractor):
'only_matching': True,
}]
- @staticmethod
- def urshift(val, n):
- return val >> n if val >= 0 else (val + 0x100000000) >> n
-
# ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
def ror(self, param1, param2):
_loc3_ = 0
while _loc3_ < param2:
- param1 = self.urshift(param1, 1) + ((param1 & 1) << 31)
+ param1 = urshift(param1, 1) + ((param1 & 1) << 31)
_loc3_ += 1
return param1
@@ -93,6 +90,10 @@ class LeIE(InfoExtractor):
_loc3_ = self.ror(_loc3_, _loc2_ % 17)
return _loc3_
+ # reversed from http://jstatic.letvcdn.com/sdk/player.js
+ def get_mms_key(self, time):
+ return self.ror(time, 8) ^ 185025305
+
# see M3U8Encryption class in KLetvPlayer.swf
@staticmethod
def decrypt_m3u8(encrypted_data):
@@ -113,28 +114,7 @@ class LeIE(InfoExtractor):
return bytes(_loc7_)
- def _real_extract(self, url):
- media_id = self._match_id(url)
- page = self._download_webpage(url, media_id)
- params = {
- 'id': media_id,
- 'platid': 1,
- 'splatid': 101,
- 'format': 1,
- 'tkey': self.calc_time_key(int(time.time())),
- 'domain': 'www.le.com'
- }
- play_json_req = sanitized_Request(
- 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params)
- )
- cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
- if cn_verification_proxy:
- play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
- play_json = self._download_json(
- play_json_req,
- media_id, 'Downloading playJson data')
-
+ def _check_errors(self, play_json):
# Check for errors
playstatus = play_json['playstatus']
if playstatus['status'] == 0:
@@ -145,43 +125,99 @@ class LeIE(InfoExtractor):
msg = 'Generic error. flag = %d' % flag
raise ExtractorError(msg, expected=True)
- playurl = play_json['playurl']
-
- formats = ['350', '1000', '1300', '720p', '1080p']
- dispatch = playurl['dispatch']
+ def _real_extract(self, url):
+ media_id = self._match_id(url)
+ page = self._download_webpage(url, media_id)
- urls = []
- for format_id in formats:
- if format_id in dispatch:
- media_url = playurl['domain'][0] + dispatch[format_id][0]
- media_url += '&' + compat_urllib_parse_urlencode({
- 'm3v': 1,
+ play_json_h5 = self._download_json(
+ 'http://api.le.com/mms/out/video/playJsonH5',
+ media_id, 'Downloading html5 playJson data', query={
+ 'id': media_id,
+ 'platid': 3,
+ 'splatid': 304,
+ 'format': 1,
+ 'tkey': self.get_mms_key(int(time.time())),
+ 'domain': 'www.le.com',
+ 'tss': 'no',
+ },
+ headers=self.geo_verification_headers())
+ self._check_errors(play_json_h5)
+
+ play_json_flash = self._download_json(
+ 'http://api.le.com/mms/out/video/playJson',
+ media_id, 'Downloading flash playJson data', query={
+ 'id': media_id,
+ 'platid': 1,
+ 'splatid': 101,
+ 'format': 1,
+ 'tkey': self.calc_time_key(int(time.time())),
+ 'domain': 'www.le.com',
+ },
+ headers=self.geo_verification_headers())
+ self._check_errors(play_json_flash)
+
+ def get_h5_urls(media_url, format_id):
+ location = self._download_json(
+ media_url, media_id,
+ 'Download JSON metadata for format %s' % format_id, query={
'format': 1,
'expect': 3,
- 'rateid': format_id,
- })
+ 'tss': 'no',
+ })['location']
+
+ return {
+ 'http': update_url_query(location, {'tss': 'no'}),
+ 'hls': update_url_query(location, {'tss': 'ios'}),
+ }
- nodes_data = self._download_json(
- media_url, media_id,
- 'Download JSON metadata for format %s' % format_id)
+ def get_flash_urls(media_url, format_id):
+ media_url += '&' + compat_urllib_parse_urlencode({
+ 'm3v': 1,
+ 'format': 1,
+ 'expect': 3,
+ 'rateid': format_id,
+ })
- req = self._request_webpage(
- nodes_data['nodelist'][0]['location'], media_id,
- note='Downloading m3u8 information for format %s' % format_id)
+ nodes_data = self._download_json(
+ media_url, media_id,
+ 'Download JSON metadata for format %s' % format_id)
- m3u8_data = self.decrypt_m3u8(req.read())
+ req = self._request_webpage(
+ nodes_data['nodelist'][0]['location'], media_id,
+ note='Downloading m3u8 information for format %s' % format_id)
- url_info_dict = {
- 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
- 'ext': determine_ext(dispatch[format_id][1]),
- 'format_id': format_id,
- 'protocol': 'm3u8',
- }
+ m3u8_data = self.decrypt_m3u8(req.read())
- if format_id[-1:] == 'p':
- url_info_dict['height'] = int_or_none(format_id[:-1])
+ return {
+ 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
+ }
- urls.append(url_info_dict)
+ extracted_formats = []
+ formats = []
+ for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
+ playurl = play_json['playurl']
+ play_domain = playurl['domain'][0]
+
+ for format_id, format_data in playurl.get('dispatch', []).items():
+ if format_id in extracted_formats:
+ continue
+ extracted_formats.append(format_id)
+
+ media_url = play_domain + format_data[0]
+ for protocol, format_url in get_urls(media_url, format_id).items():
+ f = {
+ 'url': format_url,
+ 'ext': determine_ext(format_data[1]),
+ 'format_id': '%s-%s' % (protocol, format_id),
+ 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+ 'quality': int_or_none(format_id),
+ }
+
+ if format_id[-1:] == 'p':
+ f['height'] = int_or_none(format_id[:-1])
+
+ formats.append(f)
+ self._sort_formats(formats, ('height', 'quality', 'format_id'))
publish_time = parse_iso8601(self._html_search_regex(
r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
@@ -190,7 +226,7 @@ class LeIE(InfoExtractor):
return {
'id': media_id,
- 'formats': urls,
+ 'formats': formats,
'title': playurl['title'],
'thumbnail': playurl['pic'],
'description': description,
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index 2d5040032..a98c4c530 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -100,7 +100,7 @@ class LyndaIE(LyndaBaseIE):
_TESTS = [{
'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
- 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
+ # md5 is unstable
'info_dict': {
'id': '114408',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py
index d5945ad66..39d2742c8 100644
--- a/youtube_dl/extractor/m6.py
+++ b/youtube_dl/extractor/m6.py
@@ -1,8 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -23,34 +21,5 @@ class M6IE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
- 'Downloading video RSS')
-
- title = rss.find('./channel/item/title').text
- description = rss.find('./channel/item/description').text
- thumbnail = rss.find('./channel/item/visuel_clip_big').text
- duration = int(rss.find('./channel/item/duration').text)
- view_count = int(rss.find('./channel/item/nombre_vues').text)
-
- formats = []
- for format_id in ['lq', 'sd', 'hq', 'hd']:
- video_url = rss.find('./channel/item/url_video_%s' % format_id)
- if video_url is None:
- continue
- formats.append({
- 'url': video_url.text,
- 'format_id': format_id,
- })
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'view_count': view_count,
- 'formats': formats,
- }
+ video_id = self._match_id(url)
+ return self.url_result('6play:%s' % video_id, 'SixPlay', video_id)
diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py
new file mode 100644
index 000000000..cdb46e163
--- /dev/null
+++ b/youtube_dl/extractor/meta.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .pladform import PladformIE
+from ..utils import (
+ unescapeHTML,
+ int_or_none,
+ ExtractorError,
+)
+
+
+class METAIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://video.meta.ua/5502115.video',
+ 'md5': '71b6f3ee274bef16f1ab410f7f56b476',
+ 'info_dict': {
+ 'id': '5502115',
+ 'ext': 'mp4',
+ 'title': 'Sony Xperia Z camera test [HQ]',
+ 'description': 'Xperia Z shoots video in FullHD HDR.',
+ 'uploader_id': 'nomobile',
+ 'uploader': 'CHЁZA.TV',
+ 'upload_date': '20130211',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'http://video.meta.ua/iframe/5502115',
+ 'only_matching': True,
+ }, {
+ # pladform embed
+ 'url': 'http://video.meta.ua/7121015.video',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ st_html5 = self._search_regex(
+ r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None)
+
+ if st_html5:
+ # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js
+ json_str = ''
+ for i in range(0, len(st_html5), 3):
+ json_str += '&#x0%s;' % st_html5[i:i + 3]
+ uppod_data = self._parse_json(unescapeHTML(json_str), video_id)
+ error = uppod_data.get('customnotfound')
+ if error:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+ video_url = uppod_data['file']
+ info = {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': uppod_data.get('comment') or self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage),
+ 'duration': int_or_none(self._og_search_property(
+ 'video:duration', webpage, default=None)),
+ }
+ if 'youtube.com/' in video_url:
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ })
+ return info
+
+ pladform_url = PladformIE._extract_url(webpage)
+ if pladform_url:
+ return self.url_result(pladform_url)
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index b6f00cc25..e6e7659a1 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -11,13 +11,14 @@ from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
- sanitized_Request,
urlencode_postdata,
+ get_element_by_attribute,
+ mimetype2ext,
)
class MetacafeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+ _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = 'metacafe'
@@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor):
'uploader': 'ign',
'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
},
+ 'skip': 'Page is temporarily unavailable.',
},
# AnyClip video
{
@@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor):
'id': 'an-dVVXnuY7Jh77J',
'ext': 'mp4',
'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
- 'uploader': 'anyclip',
- 'description': 'md5:38c711dd98f5bb87acf973d573442e67',
+ 'uploader': 'AnyClip',
+ 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
},
},
# age-restricted video
@@ -110,28 +112,25 @@ class MetacafeIE(InfoExtractor):
def report_disclaimer(self):
self.to_screen('Retrieving disclaimer')
- def _real_initialize(self):
+ def _confirm_age(self):
# Retrieve disclaimer
self.report_disclaimer()
self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
# Confirm age
- disclaimer_form = {
- 'filters': '0',
- 'submit': "Continue - I'm over 18",
- }
- request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form))
- request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.report_age_confirmation()
- self._download_webpage(request, None, False, 'Unable to confirm age')
+ self._download_webpage(
+ self._FILTER_POST, None, False, 'Unable to confirm age',
+ data=urlencode_postdata({
+ 'filters': '0',
+ 'submit': "Continue - I'm over 18",
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
def _real_extract(self, url):
# Extract id and simplified title from URL
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
-
- video_id = mobj.group(1)
+ video_id, display_id = re.match(self._VALID_URL, url).groups()
# the video may come from an external site
m_external = re.match('^(\w{2})-(.*)$', video_id)
@@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor):
if prefix == 'cb':
return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
- # Retrieve video webpage to extract further information
- req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id)
+ # self._confirm_age()
# AnyClip videos require the flashversion cookie so that we get the link
# to the mp4 file
- mobj_an = re.match(r'^an-(.*?)$', video_id)
- if mobj_an:
- req.headers['Cookie'] = 'flashVersion=0;'
- webpage = self._download_webpage(req, video_id)
+ headers = {}
+ if video_id.startswith('an-'):
+ headers['Cookie'] = 'flashVersion=0;'
+
+ # Retrieve video webpage to extract further information
+ webpage = self._download_webpage(url, video_id, headers=headers)
+
+ error = get_element_by_attribute(
+ 'class', 'notfound-page-title', webpage)
+ if error:
+ raise ExtractorError(error, expected=True)
+
+ video_title = self._html_search_meta(
+ ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
@@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor):
'player_url': player_url,
'ext': play_path.partition(':')[0],
})
+ if video_url is None:
+ flashvars = self._parse_json(self._search_regex(
+ r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
+ default=None), video_id, fatal=False)
+ if flashvars:
+ video_url = []
+        for source in flashvars.get('sources') or []:
+ source_url = source.get('src')
+ if not source_url:
+ continue
+ ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+ if ext == 'm3u8':
+ video_url.extend(self._extract_m3u8_formats(
+ source_url, video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ video_url.append({
+ 'url': source_url,
+ 'ext': ext,
+ })
if video_url is None:
raise ExtractorError('Unsupported video type')
- video_title = self._html_search_regex(
- r'(?im)<title>(.*) - Video</title>', webpage, 'title')
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
+ description = self._html_search_meta(
+ ['og:description', 'twitter:description', 'description'],
+            webpage, 'description', fatal=False)
+ thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'thumbnail', fatal=False)
video_uploader = self._html_search_regex(
r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
webpage, 'uploader nickname', fatal=False)
duration = int_or_none(
- self._html_search_meta('video:duration', webpage))
-
+ self._html_search_meta('video:duration', webpage, default=None))
age_limit = (
18
if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
@@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor):
'url': video_url,
'ext': video_ext,
}]
-
self._sort_formats(formats)
+
return {
'id': video_id,
+ 'display_id': display_id,
'description': description,
'uploader': video_uploader,
'title': video_title,
diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py
index 9fbc74f5d..d970e94ec 100644
--- a/youtube_dl/extractor/mgtv.py
+++ b/youtube_dl/extractor/mgtv.py
@@ -26,7 +26,8 @@ class MGTVIE(InfoExtractor):
video_id = self._match_id(url)
api_data = self._download_json(
'http://v.api.mgtv.com/player/video', video_id,
- query={'video_id': video_id})['data']
+ query={'video_id': video_id},
+ headers=self.geo_verification_headers())['data']
info = api_data['info']
formats = []
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 5a00cd397..cd169f361 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -12,12 +12,69 @@ from ..utils import (
get_element_by_attribute,
int_or_none,
remove_start,
+ extract_attributes,
+ determine_ext,
)
-class MiTeleIE(InfoExtractor):
+class MiTeleBaseIE(InfoExtractor):
+ def _get_player_info(self, url, webpage):
+ player_data = extract_attributes(self._search_regex(
+ r'(?s)(<ms-video-player.+?</ms-video-player>)',
+ webpage, 'ms video player'))
+ video_id = player_data['data-media-id']
+ config_url = compat_urlparse.urljoin(url, player_data['data-config'])
+ config = self._download_json(
+ config_url, video_id, 'Downloading config JSON')
+ mmc_url = config['services']['mmc']
+
+ duration = None
+ formats = []
+ for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
+ mmc = self._download_json(
+ m_url, video_id, 'Downloading mmc JSON')
+ if not duration:
+ duration = int_or_none(mmc.get('duration'))
+ for location in mmc['locations']:
+ gat = self._proto_relative_url(location.get('gat'), 'http:')
+ bas = location.get('bas')
+ loc = location.get('loc')
+ ogn = location.get('ogn')
+ if None in (gat, bas, loc, ogn):
+ continue
+ token_data = {
+ 'bas': bas,
+ 'icd': loc,
+ 'ogn': ogn,
+ 'sta': '0',
+ }
+ media = self._download_json(
+ '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
+ video_id, 'Downloading %s JSON' % location['loc'])
+ file_ = media.get('file')
+ if not file_:
+ continue
+ ext = determine_ext(file_)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+ video_id, f4m_id='hds', fatal=False))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
+ 'duration': duration,
+ }
+
+
+class MiTeleIE(MiTeleBaseIE):
IE_DESC = 'mitele.es'
- _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
+ _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P<id>[^/]+)/'
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
@@ -25,7 +82,7 @@ class MiTeleIE(InfoExtractor):
'info_dict': {
'id': '0NF1jJnxS1Wu3pHrmvFyw2',
'display_id': 'programa-144',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Tor, la web invisible',
'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
'series': 'Diario de',
@@ -40,7 +97,7 @@ class MiTeleIE(InfoExtractor):
'info_dict': {
'id': 'eLZSwoEd1S3pVyUm8lc6F',
'display_id': 'programa-226',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cuarto Milenio - Temporada 6 - Programa 226',
'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',
'series': 'Cuarto Milenio',
@@ -59,40 +116,7 @@ class MiTeleIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- config_url = self._search_regex(
- r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
- config_url = compat_urlparse.urljoin(url, config_url)
-
- config = self._download_json(
- config_url, display_id, 'Downloading config JSON')
-
- mmc = self._download_json(
- config['services']['mmc'], display_id, 'Downloading mmc JSON')
-
- formats = []
- for location in mmc['locations']:
- gat = self._proto_relative_url(location.get('gat'), 'http:')
- bas = location.get('bas')
- loc = location.get('loc')
- ogn = location.get('ogn')
- if None in (gat, bas, loc, ogn):
- continue
- token_data = {
- 'bas': bas,
- 'icd': loc,
- 'ogn': ogn,
- 'sta': '0',
- }
- media = self._download_json(
- '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
- display_id, 'Downloading %s JSON' % location['loc'])
- file_ = media.get('file')
- if not file_:
- continue
- formats.extend(self._extract_f4m_formats(
- file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
- display_id, f4m_id=loc))
- self._sort_formats(formats)
+ info = self._get_player_info(url, webpage)
title = self._search_regex(
r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
@@ -112,21 +136,12 @@ class MiTeleIE(InfoExtractor):
title = remove_start(self._search_regex(
r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ')
- video_id = self._search_regex(
- r'data-media-id\s*=\s*"([^"]+)"', webpage,
- 'data media id', default=None) or display_id
- thumbnail = config.get('poster', {}).get('imageUrl')
- duration = int_or_none(mmc.get('duration'))
-
- return {
- 'id': video_id,
+ info.update({
'display_id': display_id,
'title': title,
'description': get_element_by_attribute('class', 'text', webpage),
'series': series,
'season': season,
'episode': episode,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- }
+ })
+ return info
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 483f6925f..560fe188b 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -102,11 +102,11 @@ class MixcloudIE(InfoExtractor):
description = self._og_search_description(webpage)
like_count = parse_count(self._search_regex(
r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
- webpage, 'like count', fatal=False))
+ webpage, 'like count', default=None))
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>'],
- webpage, 'play count', fatal=False))
+ webpage, 'play count', default=None))
return {
'id': track_id,
diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py
new file mode 100644
index 000000000..1ec8e0f50
--- /dev/null
+++ b/youtube_dl/extractor/msn.py
@@ -0,0 +1,122 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ int_or_none,
+ unescapeHTML,
+)
+
+
+class MSNIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
+ 'md5': '8442f66c116cbab1ff7098f986983458',
+ 'info_dict': {
+ 'id': 'BBqQYNE',
+ 'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message',
+ 'ext': 'mp4',
+ 'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
+ 'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
+ 'duration': 104,
+ 'uploader': 'CBS Entertainment',
+ 'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v',
+ },
+ }, {
+ 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH',
+ 'only_matching': True,
+ }, {
+ # geo restricted
+ 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id, display_id = mobj.group('id', 'display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video = self._parse_json(
+ self._search_regex(
+ r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1',
+ webpage, 'video data', default='{}', group='data'),
+ display_id, transform_source=unescapeHTML)
+
+ if not video:
+ error = unescapeHTML(self._search_regex(
+ r'data-error=(["\'])(?P<error>.+?)\1',
+ webpage, 'error', group='error'))
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+ title = video['title']
+
+ formats = []
+ for file_ in video.get('videoFiles', []):
+ format_url = file_.get('url')
+ if not format_url:
+ continue
+ ext = determine_ext(format_url)
+ # .ism is not yet supported (see
+ # https://github.com/rg3/youtube-dl/issues/8118)
+ if ext == 'ism':
+ continue
+ if 'm3u8' in format_url:
+ # m3u8_native should not be used here until
+ # https://github.com/rg3/youtube-dl/issues/9913 is fixed
+ m3u8_formats = self._extract_m3u8_formats(
+ format_url, display_id, 'mp4',
+ m3u8_id='hls', fatal=False)
+ # Despite metadata in m3u8 all video+audio formats are
+ # actually video-only (no audio)
+ for f in m3u8_formats:
+ if f.get('acodec') != 'none' and f.get('vcodec') != 'none':
+ f['acodec'] = 'none'
+ formats.extend(m3u8_formats)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'ext': 'mp4',
+ 'format_id': 'http',
+ 'width': int_or_none(file_.get('width')),
+ 'height': int_or_none(file_.get('height')),
+ })
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for file_ in video.get('files', []):
+ format_url = file_.get('url')
+ format_code = file_.get('formatCode')
+ if not format_url or not format_code:
+ continue
+ if compat_str(format_code) == '3100':
+ subtitles.setdefault(file_.get('culture', 'en'), []).append({
+ 'ext': determine_ext(format_url, 'ttml'),
+ 'url': format_url,
+ })
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': video.get('description'),
+ 'thumbnail': video.get('headlineImage', {}).get('url'),
+ 'duration': int_or_none(video.get('durationSecs')),
+ 'uploader': video.get('sourceFriendly'),
+ 'uploader_id': video.get('providerId'),
+ 'creator': video.get('creator'),
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py
index 722518663..e717abb9f 100644
--- a/youtube_dl/extractor/nationalgeographic.py
+++ b/youtube_dl/extractor/nationalgeographic.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from .theplatform import ThePlatformIE
from ..utils import (
smuggle_url,
url_basename,
@@ -61,7 +62,7 @@ class NationalGeographicIE(InfoExtractor):
}
-class NationalGeographicChannelIE(InfoExtractor):
+class NationalGeographicChannelIE(ThePlatformIE):
IE_NAME = 'natgeo:channel'
_VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)'
@@ -102,12 +103,22 @@ class NationalGeographicChannelIE(InfoExtractor):
release_url = self._search_regex(
r'video_auth_playlist_url\s*=\s*"([^"]+)"',
webpage, 'release url')
+ query = {
+ 'mbr': 'true',
+ 'switch': 'http',
+ }
+ is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False)
+ if is_auth == 'auth':
+ auth_resource_id = self._search_regex(
+ r"video_auth_resourceId\s*=\s*'([^']+)'",
+ webpage, 'auth resource id')
+ query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or ''
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(
- update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}),
+ update_url_query(release_url, query),
{'force_smil_url': True}),
'display_id': display_id,
}
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 6b7da1149..f694e210b 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -9,10 +9,6 @@ from ..utils import (
lowercase_escape,
smuggle_url,
unescapeHTML,
- update_url_query,
- int_or_none,
- HEADRequest,
- parse_iso8601,
)
@@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor):
class NBCNewsIE(ThePlatformIE):
- _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/
+ _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/
(?:video/.+?/(?P<id>\d+)|
- ([^/]+/)*(?P<display_id>[^/?]+))
+ ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+))
'''
_TESTS = [
@@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE):
'ext': 'mp4',
'title': 'How Twitter Reacted To The Snowden Interview',
'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
+ 'uploader': 'NBCU-NEWS',
+ 'timestamp': 1401363060,
+ 'upload_date': '20140529',
},
},
{
'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
'md5': 'fdbf39ab73a72df5896b6234ff98518a',
'info_dict': {
- 'id': 'Wjf9EDR3A_60',
+ 'id': '529953347624',
'ext': 'mp4',
'title': 'FULL EPISODE: Family Business',
'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
@@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE):
'ext': 'mp4',
'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
+ 'timestamp': 1423104900,
+ 'uploader': 'NBCU-NEWS',
+ 'upload_date': '20150205',
},
},
{
@@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE):
'info_dict': {
'id': '529953347624',
'ext': 'mp4',
- 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'',
- 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
+ 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up',
+ 'description': 'md5:c8be487b2d80ff0594c005add88d8351',
+ 'upload_date': '20150922',
+ 'timestamp': 1442917800,
+ 'uploader': 'NBCU-NEWS',
},
- 'expected_warnings': ['http-6000 is not available']
},
{
'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
@@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE):
'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
'upload_date': '20160420',
'timestamp': 1461152093,
+ 'uploader': 'NBCU-NEWS',
+ },
+ },
+ {
+ 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
+ 'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
+ 'info_dict': {
+ 'id': '314487875924',
+ 'ext': 'mp4',
+ 'title': 'The chaotic GOP immigration vote',
+ 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1406937606,
+ 'upload_date': '20140802',
+ 'uploader': 'NBCU-NEWS',
+ 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
},
},
{
@@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE):
}
else:
# "feature" and "nightly-news" pages use theplatform.com
- display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, display_id)
- info = None
- bootstrap_json = self._search_regex(
- [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
- r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
- webpage, 'bootstrap json', default=None)
- bootstrap = self._parse_json(
- bootstrap_json, display_id, transform_source=unescapeHTML)
- if 'results' in bootstrap:
- info = bootstrap['results'][0]['video']
- elif 'video' in bootstrap:
- info = bootstrap['video']
- else:
- info = bootstrap
- video_id = info['mpxId']
- title = info['title']
-
- subtitles = {}
- caption_links = info.get('captionLinks')
- if caption_links:
- for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')):
- sub_url = caption_links.get(sub_key)
- if sub_url:
- subtitles.setdefault('en', []).append({
- 'url': sub_url,
- 'ext': sub_ext,
- })
-
- formats = []
- for video_asset in info['videoAssets']:
- video_url = video_asset.get('publicUrl')
- if not video_url:
- continue
- container = video_asset.get('format')
- asset_type = video_asset.get('assetType') or ''
- if container == 'ISM' or asset_type == 'FireTV-Once':
- continue
- elif asset_type == 'OnceURL':
- tp_formats, tp_subtitles = self._extract_theplatform_smil(
- video_url, video_id)
- formats.extend(tp_formats)
- subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+ video_id = mobj.group('mpx_id')
+ if not video_id.isdigit():
+ webpage = self._download_webpage(url, video_id)
+ info = None
+ bootstrap_json = self._search_regex(
+ [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
+ r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
+ webpage, 'bootstrap json', default=None)
+ bootstrap = self._parse_json(
+ bootstrap_json, video_id, transform_source=unescapeHTML)
+ if 'results' in bootstrap:
+ info = bootstrap['results'][0]['video']
+ elif 'video' in bootstrap:
+ info = bootstrap['video']
else:
- tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000)
- format_id = 'http%s' % ('-%d' % tbr if tbr else '')
- video_url = update_url_query(
- video_url, {'format': 'redirect'})
- # resolve the url so that we can check availability and detect the correct extension
- head = self._request_webpage(
- HEADRequest(video_url), video_id,
- 'Checking %s url' % format_id,
- '%s is not available' % format_id,
- fatal=False)
- if head:
- video_url = head.geturl()
- formats.append({
- 'format_id': format_id,
- 'url': video_url,
- 'width': int_or_none(video_asset.get('width')),
- 'height': int_or_none(video_asset.get('height')),
- 'tbr': tbr,
- 'container': video_asset.get('format'),
- })
- self._sort_formats(formats)
+ info = bootstrap
+ video_id = info['mpxId']
return {
+ '_type': 'url_transparent',
'id': video_id,
- 'title': title,
- 'description': info.get('description'),
- 'thumbnail': info.get('thumbnail'),
- 'duration': int_or_none(info.get('duration')),
- 'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')),
- 'formats': formats,
- 'subtitles': subtitles,
+ # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
+ 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id,
+ 'ie_key': 'ThePlatformFeed',
}
-
-
-class MSNBCIE(InfoExtractor):
- # https URLs redirect to corresponding http ones
- _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
- _TEST = {
- 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
- 'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
- 'info_dict': {
- 'id': 'n_hayes_Aimm_140801_272214',
- 'ext': 'mp4',
- 'title': 'The chaotic GOP immigration vote',
- 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'timestamp': 1406937606,
- 'upload_date': '20140802',
- 'uploader': 'NBCU-NEWS',
- 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- embed_url = self._html_search_meta('embedURL', webpage)
- return self.url_result(embed_url)
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
index e96013791..4935002d0 100644
--- a/youtube_dl/extractor/nick.py
+++ b/youtube_dl/extractor/nick.py
@@ -8,7 +8,7 @@ from ..utils import update_url_query
class NickIE(MTVServicesInfoExtractor):
IE_NAME = 'nick.com'
- _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)'
+ _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
_FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
_TESTS = [{
'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
@@ -52,6 +52,9 @@ class NickIE(MTVServicesInfoExtractor):
}
},
],
+ }, {
+ 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/',
+ 'only_matching': True,
}]
def _get_feed_query(self, uri):
diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py
new file mode 100644
index 000000000..d889245ad
--- /dev/null
+++ b/youtube_dl/extractor/ninecninemedia.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ parse_duration,
+ ExtractorError
+)
+
+
+class NineCNineMediaIE(InfoExtractor):
+ _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)'
+
+ def _real_extract(self, url):
+ destination_code, video_id = re.match(self._VALID_URL, url).groups()
+ api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id)
+ content = self._download_json(api_base_url, video_id, query={
+ '$include': '[contentpackages]',
+ })
+ title = content['Name']
+ if len(content['ContentPackages']) > 1:
+ raise ExtractorError('multiple content packages')
+ content_package = content['ContentPackages'][0]
+ stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id']
+ stacks = self._download_json(stacks_base_url, video_id)['Items']
+ if len(stacks) > 1:
+ raise ExtractorError('multiple stacks')
+ stack = stacks[0]
+ stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id'])
+ formats = []
+ formats.extend(self._extract_m3u8_formats(
+ stack_base_url + 'm3u8', video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ stack_base_url + 'f4m', video_id,
+ f4m_id='hds', fatal=False))
+ mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False)
+ if mp4_url:
+ formats.append({
+ 'url': mp4_url,
+ 'format_id': 'mp4',
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': content.get('Desc') or content.get('ShortDesc'),
+ 'timestamp': parse_iso8601(content.get('BroadcastDateTime')),
+ 'duration': parse_duration(content.get('BroadcastTime')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py
new file mode 100644
index 000000000..402d3a9f7
--- /dev/null
+++ b/youtube_dl/extractor/onet.py
@@ -0,0 +1,172 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ float_or_none,
+ get_element_by_class,
+ int_or_none,
+ js_to_json,
+ parse_iso8601,
+ remove_start,
+ strip_or_none,
+ url_basename,
+)
+
+
+class OnetBaseIE(InfoExtractor):
+ def _search_mvp_id(self, webpage):
+ return self._search_regex(
+ r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+
+ def _extract_from_id(self, video_id, webpage):
+ response = self._download_json(
+ 'http://qi.ckm.onetapi.pl/', video_id,
+ query={
+ 'body[id]': video_id,
+ 'body[jsonrpc]': '2.0',
+ 'body[method]': 'get_asset_detail',
+ 'body[params][ID_Publikacji]': video_id,
+ 'body[params][Service]': 'www.onet.pl',
+ 'content-type': 'application/jsonp',
+ 'x-onet-app': 'player.front.onetapi.pl',
+ })
+
+ error = response.get('error')
+ if error:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
+
+ video = response['result'].get('0')
+
+ formats = []
+ for _, formats_dict in video['formats'].items():
+ if not isinstance(formats_dict, dict):
+ continue
+ for format_id, format_list in formats_dict.items():
+ if not isinstance(format_list, list):
+ continue
+ for f in format_list:
+ video_url = f.get('url')
+ if not video_url:
+ continue
+ ext = determine_ext(video_url)
+ if format_id == 'ism':
+ # TODO: Support Microsoft Smooth Streaming
+ continue
+ elif ext == 'mpd':
+ # TODO: Current DASH formats are broken - $Time$ pattern in
+ # <SegmentTemplate> not implemented yet
+ # formats.extend(self._extract_mpd_formats(
+ # video_url, video_id, mpd_id='dash', fatal=False))
+ continue
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'height': int_or_none(f.get('vertical_resolution')),
+ 'width': int_or_none(f.get('horizontal_resolution')),
+ 'abr': float_or_none(f.get('audio_bitrate')),
+ 'vbr': float_or_none(f.get('video_bitrate')),
+ })
+ self._sort_formats(formats)
+
+ meta = video.get('meta', {})
+
+ title = self._og_search_title(webpage, default=None) or meta['title']
+ description = self._og_search_description(webpage, default=None) or meta.get('description')
+ duration = meta.get('length') or meta.get('lenght')
+ timestamp = parse_iso8601(meta.get('addDate'), ' ')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
+
+
+class OnetIE(OnetBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
+ IE_NAME = 'onet.tv'
+
+ _TEST = {
+ 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+ 'md5': 'e3ffbf47590032ac3f27249204173d50',
+ 'info_dict': {
+ 'id': 'qbpyqc',
+ 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
+ 'ext': 'mp4',
+ 'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
+ 'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
+ 'upload_date': '20160705',
+ 'timestamp': 1467721580,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id, video_id = mobj.group('display_id', 'id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ mvp_id = self._search_mvp_id(webpage)
+
+ info_dict = self._extract_from_id(mvp_id, webpage)
+ info_dict.update({
+ 'id': video_id,
+ 'display_id': display_id,
+ })
+
+ return info_dict
+
+
+class OnetChannelIE(OnetBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)'
+ IE_NAME = 'onet.tv:channel'
+
+ _TEST = {
+ 'url': 'http://onet.tv/k/openerfestival',
+ 'info_dict': {
+ 'id': 'openerfestival',
+ 'title': 'Open\'er Festival Live',
+ 'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
+ },
+ 'playlist_mincount': 46,
+ }
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, channel_id)
+
+ current_clip_info = self._parse_json(self._search_regex(
+ r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id,
+ transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s)))
+ video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
+ video_name = url_basename(current_clip_info['url'])
+
+ if self._downloader.params.get('noplaylist'):
+ self.to_screen(
+ 'Downloading just video %s because of --no-playlist' % video_name)
+ return self._extract_from_id(video_id, webpage)
+
+ self.to_screen(
+ 'Downloading channel %s - add --no-playlist to just download video %s' % (
+ channel_id, video_name))
+ matches = re.findall(
+ r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
+ webpage)
+ entries = [
+ self.url_result(video_link, OnetIE.ie_key())
+ for video_link in matches]
+
+ channel_title = strip_or_none(get_element_by_class('o_channelName', webpage))
+ channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage))
+ return self.playlist_result(entries, channel_id, channel_title, channel_description)
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
index d7b13a0f1..6fb1a3fcc 100644
--- a/youtube_dl/extractor/onionstudios.py
+++ b/youtube_dl/extractor/onionstudios.py
@@ -7,6 +7,8 @@ from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
+ float_or_none,
+ mimetype2ext,
)
@@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
- 'md5': 'd4851405d31adfadf71cd7a487b765bb',
+ 'md5': 'e49f947c105b8a78a675a0ee1bddedfe',
'info_dict': {
'id': '2937',
'ext': 'mp4',
'title': 'Hannibal charges forward, stops for a cocktail',
- 'description': 'md5:e786add7f280b7f0fe237b64cc73df76',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'The A.V. Club',
- 'uploader_id': 'TheAVClub',
+ 'uploader_id': 'the-av-club',
},
}, {
'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
@@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+ video_data = self._download_json(
+ 'http://www.onionstudios.com/video/%s.json' % video_id, video_id)
+
+ title = video_data['title']
formats = []
- for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
- ext = determine_ext(src)
+ for source in video_data.get('sources', []):
+ source_url = source.get('url')
+ if not source_url:
+ continue
+ ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
else:
- height = int_or_none(self._search_regex(
- r'/(\d+)\.%s' % ext, src, 'height', default=None))
+ tbr = int_or_none(source.get('bitrate'))
formats.append({
- 'format_id': ext + ('-%sp' % height if height else ''),
- 'url': src,
- 'height': height,
+ 'format_id': ext + ('-%d' % tbr if tbr else ''),
+ 'url': source_url,
+ 'width': int_or_none(source.get('width')),
+ 'tbr': tbr,
'ext': ext,
- 'preference': 1,
})
self._sort_formats(formats)
- title = self._search_regex(
- r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
- webpage, 'title', group='title')
- description = self._search_regex(
- r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1',
- webpage, 'description', default=None, group='description')
- thumbnail = self._search_regex(
- r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
- webpage, 'thumbnail', default=False, group='thumbnail')
-
- uploader_id = self._search_regex(
- r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
- webpage, 'uploader id', fatal=False, group='uploader_id')
- uploader = self._search_regex(
- r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
- webpage, 'uploader', default=False, group='uploader')
-
return {
'id': video_id,
'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
+ 'thumbnail': video_data.get('poster_url'),
+ 'uploader': video_data.get('channel_name'),
+ 'uploader_id': video_data.get('channel_slug'),
+ 'duration': float_or_none(video_data.get('duration'), 1000),
+ 'tags': video_data.get('tags'),
'formats': formats,
}
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 81918ac6e..f6f423597 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -516,9 +516,14 @@ class PBSIE(InfoExtractor):
# https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
continue
+ f_url = re.sub(r'\d+k|baseline', bitrate, http_url)
+ # This may produce invalid links sometimes (e.g.
+ # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
+ if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate):
+ continue
f = m3u8_format.copy()
f.update({
- 'url': re.sub(r'\d+k|baseline', bitrate, http_url),
+ 'url': f_url,
'format_id': m3u8_format['format_id'].replace('hls', 'http'),
'protocol': 'http',
})
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index c23b314e7..75f5884a9 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor):
title = user.get('display_name') or user.get('username')
description = user.get('description')
+ broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or
+ data_store.get('BroadcastCache', {}).get('broadcastIds', []))
+
entries = [
self.url_result(
- 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
- for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])]
+ 'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id))
+ for broadcast_id in broadcast_ids]
return self.playlist_result(entries, user_id, title, description)
diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py
index bc559d1df..77e1211d6 100644
--- a/youtube_dl/extractor/pladform.py
+++ b/youtube_dl/extractor/pladform.py
@@ -49,7 +49,7 @@ class PladformIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage)
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
if mobj:
return mobj.group('url')
diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py
new file mode 100644
index 000000000..f559b899f
--- /dev/null
+++ b/youtube_dl/extractor/polskieradio.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse_unquote,
+)
+from ..utils import (
+ int_or_none,
+ strip_or_none,
+ unified_timestamp,
+)
+
+
+class PolskieRadioIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
+ 'info_dict': {
+ 'id': '1587943',
+ 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
+ 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
+ },
+ 'playlist': [{
+ 'md5': '2984ee6ce9046d91fc233bc1a864a09a',
+ 'info_dict': {
+ 'id': '1540576',
+ 'ext': 'mp3',
+ 'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
+ 'timestamp': 1456594200,
+ 'upload_date': '20160227',
+ 'duration': 2364,
+ 'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$'
+ },
+ }],
+ }, {
+ 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
+ 'info_dict': {
+ 'id': '1635803',
+ 'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
+ 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
+ },
+ 'playlist_mincount': 12,
+ }, {
+ 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
+ 'only_matching': True,
+ }, {
+ # with mp4 video
+ 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ content = self._search_regex(
+ r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>',
+ webpage, 'content')
+
+ timestamp = unified_timestamp(self._html_search_regex(
+ r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
+ webpage, 'timestamp', fatal=False))
+
+ thumbnail_url = self._og_search_thumbnail(webpage)
+
+ entries = []
+
+ media_urls = set()
+
+ for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
+ media = self._parse_json(data_media, playlist_id, fatal=False)
+ if not media or not media.get('file') or not media.get('desc'):
+ continue
+ media_url = self._proto_relative_url(media['file'], 'http:')
+ if media_url in media_urls:
+ continue
+ media_urls.add(media_url)
+ entries.append({
+ 'id': compat_str(media['id']),
+ 'url': media_url,
+ 'title': compat_urllib_parse_unquote(media['desc']),
+ 'duration': int_or_none(media.get('length')),
+ 'vcodec': 'none' if media.get('provider') == 'audio' else None,
+ 'timestamp': timestamp,
+ 'thumbnail': thumbnail_url
+ })
+
+ title = self._og_search_title(webpage).strip()
+ description = strip_or_none(self._og_search_description(webpage))
+
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 6d57e1d35..d2c92531b 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -25,7 +25,15 @@ from ..aes import (
class PornHubIE(InfoExtractor):
- _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+ IE_DESC = 'PornHub and Thumbzilla'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+ (?:www\.)?thumbzilla\.com/video/
+ )
+ (?P<id>[0-9a-z]+)
+ '''
_TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': '1e19b41231a02eba417839222ac9d58e',
@@ -63,8 +71,24 @@ class PornHubIE(InfoExtractor):
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
'only_matching': True,
}, {
+ # removed at the request of cam4.com
'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
'only_matching': True,
+ }, {
+ # removed at the request of the copyright owner
+ 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
+ 'only_matching': True,
+ }, {
+ # removed by uploader
+ 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
+ 'only_matching': True,
+ }, {
+ # private video
+ 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
+ 'only_matching': True,
}]
@classmethod
@@ -87,8 +111,8 @@ class PornHubIE(InfoExtractor):
webpage = self._download_webpage(req, video_id)
error_msg = self._html_search_regex(
- r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
- webpage, 'error message', default=None)
+ r'(?s)<div[^>]+class=(["\']).*?\b(?:removed|userMessageSection)\b.*?\1[^>]*>(?P<error>.+?)</div>',
+ webpage, 'error message', default=None, group='error')
if error_msg:
error_msg = re.sub(r'\s+', ' ', error_msg)
raise ExtractorError(
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 07d49d489..c6eee3b72 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -5,7 +5,7 @@ import re
from hashlib import sha1
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
+from ..compat import compat_str
from ..utils import (
ExtractorError,
determine_ext,
@@ -71,6 +71,7 @@ class ProSiebenSat1IE(InfoExtractor):
# rtmp download
'skip_download': True,
},
+ 'skip': 'This video is unavailable',
},
{
'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
@@ -86,6 +87,7 @@ class ProSiebenSat1IE(InfoExtractor):
# rtmp download
'skip_download': True,
},
+ 'skip': 'This video is unavailable',
},
{
'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
@@ -101,6 +103,7 @@ class ProSiebenSat1IE(InfoExtractor):
# rtmp download
'skip_download': True,
},
+ 'skip': 'This video is unavailable',
},
{
'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
@@ -116,6 +119,7 @@ class ProSiebenSat1IE(InfoExtractor):
# rtmp download
'skip_download': True,
},
+ 'skip': 'This video is unavailable',
},
{
'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
@@ -131,6 +135,7 @@ class ProSiebenSat1IE(InfoExtractor):
# rtmp download
'skip_download': True,
},
+ 'skip': 'This video is unavailable',
},
{
'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
@@ -227,70 +232,42 @@ class ProSiebenSat1IE(InfoExtractor):
]
def _extract_clip(self, url, webpage):
- clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
+ clip_id = self._html_search_regex(
+ self._CLIPID_REGEXES, webpage, 'clip id')
access_token = 'prosieben'
client_name = 'kolibri-2.0.19-splec4'
client_location = url
- videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({
- 'access_token': access_token,
- 'client_location': client_location,
- 'client_name': client_name,
- 'ids': clip_id,
- })
-
- video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0]
+ video = self._download_json(
+ 'http://vas.sim-technik.de/vas/live/v2/videos',
+ clip_id, 'Downloading videos JSON', query={
+ 'access_token': access_token,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ 'ids': clip_id,
+ })[0]
if video.get('is_protected') is True:
raise ExtractorError('This video is DRM protected.', expected=True)
duration = float_or_none(video.get('duration'))
- source_ids = [source['id'] for source in video['sources']]
- source_ids_str = ','.join(map(str, source_ids))
+ source_ids = [compat_str(source['id']) for source in video['sources']]
g = '01!8d8F_)r9]4s[qeuXfP%'
+ client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest()
- client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
- .encode('utf-8')).hexdigest()
-
- sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({
- 'access_token': access_token,
- 'client_id': client_id,
- 'client_location': client_location,
- 'client_name': client_name,
- }))
-
- sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
+ sources = self._download_json(
+ 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id,
+ clip_id, 'Downloading sources JSON', query={
+ 'access_token': access_token,
+ 'client_id': client_id,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ })
server_id = sources['server_id']
- client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
- client_location, source_ids_str, g, client_name])
- .encode('utf-8')).hexdigest()
-
- url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({
- 'access_token': access_token,
- 'client_id': client_id,
- 'client_location': client_location,
- 'client_name': client_name,
- 'server_id': server_id,
- 'source_ids': source_ids_str,
- }))
-
- urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
-
title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
- description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
- thumbnail = self._og_search_thumbnail(webpage)
-
- upload_date = unified_strdate(self._html_search_regex(
- self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
-
- formats = []
-
- urls_sources = urls['sources']
- if isinstance(urls_sources, dict):
- urls_sources = urls_sources.values()
def fix_bitrate(bitrate):
bitrate = int_or_none(bitrate)
@@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor):
return None
return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
- for source in urls_sources:
- protocol = source['protocol']
- source_url = source['url']
- if protocol == 'rtmp' or protocol == 'rtmpe':
- mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
- if not mobj:
- continue
- path = mobj.group('path')
- mp4colon_index = path.rfind('mp4:')
- app = path[:mp4colon_index]
- play_path = path[mp4colon_index:]
- formats.append({
- 'url': '%s/%s' % (mobj.group('url'), app),
- 'app': app,
- 'play_path': play_path,
- 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
- 'page_url': 'http://www.prosieben.de',
- 'vbr': fix_bitrate(source['bitrate']),
- 'ext': 'mp4',
- 'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
- })
- elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(source_url, clip_id))
- else:
- formats.append({
- 'url': source_url,
- 'vbr': fix_bitrate(source['bitrate']),
+ formats = []
+ for source_id in source_ids:
+ client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest()
+ urls = self._download_json(
+ 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id,
+ clip_id, 'Downloading urls JSON', fatal=False, query={
+ 'access_token': access_token,
+ 'client_id': client_id,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ 'server_id': server_id,
+ 'source_ids': source_id,
})
-
+ if not urls:
+ continue
+ if urls.get('status_code') != 0:
+ raise ExtractorError('This video is unavailable', expected=True)
+ urls_sources = urls['sources']
+ if isinstance(urls_sources, dict):
+ urls_sources = urls_sources.values()
+ for source in urls_sources:
+ source_url = source.get('url')
+ if not source_url:
+ continue
+ protocol = source.get('protocol')
+ mimetype = source.get('mimetype')
+ if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ source_url, clip_id, f4m_id='hds', fatal=False))
+ elif mimetype == 'application/x-mpegURL':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, clip_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ tbr = fix_bitrate(source['bitrate'])
+ if protocol in ('rtmp', 'rtmpe'):
+ mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
+ if not mobj:
+ continue
+ path = mobj.group('path')
+ mp4colon_index = path.rfind('mp4:')
+ app = path[:mp4colon_index]
+ play_path = path[mp4colon_index:]
+ formats.append({
+ 'url': '%s/%s' % (mobj.group('url'), app),
+ 'app': app,
+ 'play_path': play_path,
+ 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+ 'page_url': 'http://www.prosieben.de',
+ 'tbr': tbr,
+ 'ext': 'flv',
+ 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''),
+ })
+ else:
+ formats.append({
+ 'url': source_url,
+ 'tbr': tbr,
+ 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''),
+ })
self._sort_formats(formats)
+ description = self._html_search_regex(
+ self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+ upload_date = unified_strdate(self._html_search_regex(
+ self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
+
return {
'id': clip_id,
'title': title,
diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py
index 4f05bbddc..8ec402646 100644
--- a/youtube_dl/extractor/radiocanada.py
+++ b/youtube_dl/extractor/radiocanada.py
@@ -12,6 +12,7 @@ from ..utils import (
unified_strdate,
xpath_element,
ExtractorError,
+ determine_protocol,
)
@@ -22,13 +23,13 @@ class RadioCanadaIE(InfoExtractor):
'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
'info_dict': {
'id': '7184272',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Le parcours du tireur capté sur vidéo',
'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
'upload_date': '20141023',
},
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
},
}
@@ -36,11 +37,14 @@ class RadioCanadaIE(InfoExtractor):
def _real_extract(self, url):
app_code, video_id = re.match(self._VALID_URL, url).groups()
+ device_types = ['ipad', 'android']
+ if app_code != 'toutv':
+ device_types.append('flash')
+
formats = []
- # TODO: extract m3u8 and f4m formats
- # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements
+ # TODO: extract f4m formats
# f4m formats can be extracted using flashhd device_type but they produce unplayable file
- for device_type in ('flash',):
+ for device_type in device_types:
v_data = self._download_xml(
'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',
video_id, note='Downloading %s XML' % device_type, query={
@@ -52,7 +56,7 @@ class RadioCanadaIE(InfoExtractor):
# paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
'paysJ391wsHjbOJwvCs26toz': 'CA',
'bypasslock': 'NZt5K62gRqfc',
- })
+ }, fatal=False)
v_url = xpath_text(v_data, 'url')
if not v_url:
continue
@@ -64,7 +68,8 @@ class RadioCanadaIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ v_url, video_id, f4m_id='hds', fatal=False))
else:
ext = determine_ext(v_url)
bitrates = xpath_element(v_data, 'bitrates')
@@ -72,15 +77,28 @@ class RadioCanadaIE(InfoExtractor):
tbr = int_or_none(url_e.get('bitrate'))
if not tbr:
continue
+ f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url)
+ protocol = determine_protocol({'url': f_url})
formats.append({
- 'format_id': 'rtmp-%d' % tbr,
- 'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url),
- 'ext': 'flv',
- 'protocol': 'rtmp',
+ 'format_id': '%s-%d' % (protocol, tbr),
+ 'url': f_url,
+ 'ext': 'flv' if protocol == 'rtmp' else ext,
+ 'protocol': protocol,
'width': int_or_none(url_e.get('width')),
'height': int_or_none(url_e.get('height')),
'tbr': tbr,
})
+ if protocol == 'rtsp':
+ base_url = self._search_regex(
+ r'rtsp://([^?]+)', f_url, 'base url', default=None)
+ if base_url:
+ base_url = 'http://' + base_url
+ formats.extend(self._extract_m3u8_formats(
+ base_url + '/playlist.m3u8', video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ base_url + '/manifest.f4m', video_id,
+ f4m_id='hds', fatal=False))
self._sort_formats(formats)
metadata = self._download_xml(
@@ -115,13 +133,13 @@ class RadioCanadaAudioVideoIE(InfoExtractor):
'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
'info_dict': {
'id': '7527184',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Barack Obama au Vietnam',
'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
'upload_date': '20160523',
},
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
},
}
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index e36ce1aa1..dc640b1bc 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -1,47 +1,141 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
- ExtractorError,
determine_ext,
+ ExtractorError,
+ find_xpath_attr,
+ fix_xml_ampersands,
+ int_or_none,
parse_duration,
unified_strdate,
- int_or_none,
+ update_url_query,
xpath_text,
)
-class RaiTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
+class RaiBaseIE(InfoExtractor):
+ def _extract_relinker_formats(self, relinker_url, video_id):
+ formats = []
+
+ for platform in ('mon', 'flash', 'native'):
+ relinker = self._download_xml(
+ relinker_url, video_id,
+ note='Downloading XML metadata for platform %s' % platform,
+ transform_source=fix_xml_ampersands,
+ query={'output': 45, 'pl': platform},
+ headers=self.geo_verification_headers())
+
+ media_url = find_xpath_attr(relinker, './url', 'type', 'content').text
+ if media_url == 'http://download.rai.it/video_no_available.mp4':
+ self.raise_geo_restricted()
+
+ ext = determine_ext(media_url)
+ if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
+ continue
+
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ manifest_url = update_url_query(
+ media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
+ {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
+ formats.extend(self._extract_f4m_formats(
+ manifest_url, video_id, f4m_id='hds', fatal=False))
+ else:
+ bitrate = int_or_none(xpath_text(relinker, 'bitrate'))
+ formats.append({
+ 'url': media_url,
+ 'tbr': bitrate if bitrate > 0 else None,
+ 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
+ })
+
+ return formats
+
+ def _extract_from_content_id(self, content_id, base_url):
+ media = self._download_json(
+ 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
+ content_id, 'Downloading video JSON')
+
+ thumbnails = []
+ for image_type in ('image', 'image_medium', 'image_300'):
+ thumbnail_url = media.get(image_type)
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': compat_urlparse.urljoin(base_url, thumbnail_url),
+ })
+
+ formats = []
+ media_type = media['type']
+ if 'Audio' in media_type:
+ formats.append({
+ 'format_id': media.get('formatoAudio'),
+ 'url': media['audioUrl'],
+ 'ext': media.get('formatoAudio'),
+ })
+ elif 'Video' in media_type:
+ formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id))
+ self._sort_formats(formats)
+ else:
+ raise ExtractorError('not a media file')
+
+ subtitles = {}
+ captions = media.get('subtitlesUrl')
+ if captions:
+ STL_EXT = '.stl'
+ SRT_EXT = '.srt'
+ if captions.endswith(STL_EXT):
+ captions = captions[:-len(STL_EXT)] + SRT_EXT
+ subtitles['it'] = [{
+ 'ext': 'srt',
+ 'url': captions,
+ }]
+
+ return {
+ 'id': content_id,
+ 'title': media['name'],
+ 'description': media.get('desc'),
+ 'thumbnails': thumbnails,
+ 'uploader': media.get('author'),
+ 'upload_date': unified_strdate(media.get('date')),
+ 'duration': parse_duration(media.get('length')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class RaiTVIE(RaiBaseIE):
+ _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
_TESTS = [
{
'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
- 'md5': '96382709b61dd64a6b88e0f791e6df4c',
+ 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
'info_dict': {
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Report del 07/04/2014',
'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
'upload_date': '20140407',
'duration': 6160,
+ 'thumbnail': 're:^https?://.*\.jpg$',
}
},
{
+ # no m3u8 stream
'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
- 'md5': 'd9751b78eac9710d62c2447b224dea39',
+ # HDS download, MD5 is unstable
'info_dict': {
'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
'ext': 'flv',
'title': 'TG PRIMO TEMPO',
'upload_date': '20140612',
'duration': 1758,
+ 'thumbnail': 're:^https?://.*\.jpg$',
},
+ 'skip': 'Geo-restricted to Italy',
},
{
'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
@@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor):
},
{
'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
- 'md5': '496ab63e420574447f70d02578333437',
+ 'md5': 'e57493e1cb8bc7c564663f363b171847',
'info_dict': {
'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Il Candidato - Primo episodio: "Le Primarie"',
'description': 'md5:364b604f7db50594678f483353164fb8',
'upload_date': '20140923',
'duration': 386,
+ 'thumbnail': 're:^https?://.*\.jpg$',
}
},
]
def _real_extract(self, url):
video_id = self._match_id(url)
- media = self._download_json(
- 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id,
- video_id, 'Downloading video JSON')
-
- thumbnails = []
- for image_type in ('image', 'image_medium', 'image_300'):
- thumbnail_url = media.get(image_type)
- if thumbnail_url:
- thumbnails.append({
- 'url': thumbnail_url,
- })
-
- subtitles = []
- formats = []
- media_type = media['type']
- if 'Audio' in media_type:
- formats.append({
- 'format_id': media.get('formatoAudio'),
- 'url': media['audioUrl'],
- 'ext': media.get('formatoAudio'),
- })
- elif 'Video' in media_type:
- def fix_xml(xml):
- return xml.replace(' tag elementi', '').replace('>/', '</')
-
- relinker = self._download_xml(
- media['mediaUri'] + '&output=43',
- video_id, transform_source=fix_xml)
-
- has_subtitle = False
-
- for element in relinker.findall('element'):
- media_url = xpath_text(element, 'url')
- ext = determine_ext(media_url)
- content_type = xpath_text(element, 'content-type')
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
- video_id, f4m_id='hds', fatal=False))
- elif ext == 'stl':
- has_subtitle = True
- elif content_type.startswith('video/'):
- bitrate = int_or_none(xpath_text(element, 'bitrate'))
- formats.append({
- 'url': media_url,
- 'tbr': bitrate if bitrate > 0 else None,
- 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
- })
- elif content_type.startswith('image/'):
- thumbnails.append({
- 'url': media_url,
- })
-
- self._sort_formats(formats)
- if has_subtitle:
- webpage = self._download_webpage(url, video_id)
- subtitles = self._get_subtitles(video_id, webpage)
- else:
- raise ExtractorError('not a media file')
+ return self._extract_from_content_id(video_id, url)
- return {
- 'id': video_id,
- 'title': media['name'],
- 'description': media.get('desc'),
- 'thumbnails': thumbnails,
- 'uploader': media.get('author'),
- 'upload_date': unified_strdate(media.get('date')),
- 'duration': parse_duration(media.get('length')),
- 'formats': formats,
- 'subtitles': subtitles,
- }
- def _get_subtitles(self, video_id, webpage):
- subtitles = {}
- m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage)
- if m:
- captions = m.group('captions')
- STL_EXT = '.stl'
- SRT_EXT = '.srt'
- if captions.endswith(STL_EXT):
- captions = captions[:-len(STL_EXT)] + SRT_EXT
- subtitles['it'] = [{
- 'ext': 'srt',
- 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions),
- }]
- return subtitles
-
-
-class RaiIE(InfoExtractor):
+class RaiIE(RaiBaseIE):
_VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
_TESTS = [
{
'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
- 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7',
+ 'md5': '2dd727e61114e1ee9c47f0da6914e178',
'info_dict': {
'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Il pacco',
'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
'upload_date': '20141221',
},
- }
+ },
+ {
+ # Direct relinker URL
+ 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+ # HDS live stream, MD5 is unstable
+ 'info_dict': {
+ 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+ 'ext': 'flv',
+ 'title': 'EuroNews',
+ },
+ 'skip': 'Geo-restricted to Italy',
+ },
+ {
+ # Embedded content item ID
+ 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
+ 'md5': '84c1135ce960e8822ae63cec34441d63',
+ 'info_dict': {
+ 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8',
+ 'ext': 'mp4',
+ 'title': 'TG1 ore 20:00 del 02/07/2016',
+ 'upload_date': '20160702',
+ },
+ },
+ {
+ 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
+ # HDS live stream, MD5 is unstable
+ 'info_dict': {
+ 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
+ 'ext': 'flv',
+ 'title': 'La diretta di Rainews24',
+ },
+ },
]
@classmethod
@@ -201,7 +238,30 @@ class RaiIE(InfoExtractor):
iframe_url = self._search_regex(
[r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
r'drawMediaRaiTV\(["\'](.+?)["\']'],
- webpage, 'iframe')
- if not iframe_url.startswith('http'):
- iframe_url = compat_urlparse.urljoin(url, iframe_url)
- return self.url_result(iframe_url)
+ webpage, 'iframe', default=None)
+ if iframe_url:
+ if not iframe_url.startswith('http'):
+ iframe_url = compat_urlparse.urljoin(url, iframe_url)
+ return self.url_result(iframe_url)
+
+ content_item_id = self._search_regex(
+ r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)',
+ webpage, 'content item ID', group='content_id', default=None)
+ if content_item_id:
+ return self._extract_from_content_id(content_item_id, url)
+
+ relinker_url = compat_urlparse.urljoin(url, self._search_regex(
+ r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)',
+ webpage, 'relinker URL', group='url'))
+ formats = self._extract_relinker_formats(relinker_url, video_id)
+ self._sort_formats(formats)
+
+ title = self._search_regex(
+ r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
+ webpage, 'title', group='title', default=None) or self._og_search_title(webpage)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py
index 796adfdf9..bf200ea4d 100644
--- a/youtube_dl/extractor/rds.py
+++ b/youtube_dl/extractor/rds.py
@@ -1,23 +1,23 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
+ js_to_json,
)
+from ..compat import compat_str
class RDSIE(InfoExtractor):
IE_DESC = 'RDS.ca'
- _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
+ _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'
_TESTS = [{
'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
'info_dict': {
- 'id': '3.1132799',
+ 'id': '604333',
'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
'ext': 'mp4',
'title': 'Fowler Jr. prend la direction de Jacksonville',
@@ -33,22 +33,17 @@ class RDSIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id')
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- # TODO: extract f4m from 9c9media.com
- video_url = self._search_regex(
- r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
- webpage, 'video url')
-
- title = self._og_search_title(webpage) or self._html_search_meta(
+ item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json)
+ video_id = compat_str(item['id'])
+ title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta(
'title', webpage, 'title', fatal=True)
description = self._og_search_description(webpage) or self._html_search_meta(
'description', webpage, 'description')
- thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
+ thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex(
[r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
webpage, 'thumbnail', fatal=False)
@@ -61,13 +56,15 @@ class RDSIE(InfoExtractor):
age_limit = self._family_friendly_search(webpage)
return {
+ '_type': 'url_transparent',
'id': video_id,
'display_id': display_id,
- 'url': video_url,
+ 'url': '9c9media:rds_web:%s' % video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'timestamp': timestamp,
'duration': duration,
'age_limit': age_limit,
+ 'ie_key': 'NineCNineMedia',
}
diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py
new file mode 100644
index 000000000..f5b2f560c
--- /dev/null
+++ b/youtube_dl/extractor/roosterteeth.py
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ strip_or_none,
+ unescapeHTML,
+ urlencode_postdata,
+)
+
+
+class RoosterTeethIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)'
+ _LOGIN_URL = 'https://roosterteeth.com/login'
+ _NETRC_MACHINE = 'roosterteeth'
+ _TESTS = [{
+ 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
+ 'md5': 'e2bd7764732d785ef797700a2489f212',
+ 'info_dict': {
+ 'id': '26576',
+ 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement',
+ 'ext': 'mp4',
+ 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement',
+ 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'series': 'Million Dollars, But...',
+ 'episode': 'Million Dollars, But... The Game Announcement',
+ 'comment_count': int,
+ },
+ }, {
+ 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better',
+ 'only_matching': True,
+ }, {
+ # only available for FIRST members
+ 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one',
+ 'only_matching': True,
+ }]
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None,
+ note='Downloading login page',
+ errnote='Unable to download login page')
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'username': username,
+ 'password': password,
+ })
+
+ login_request = self._download_webpage(
+ self._LOGIN_URL, None,
+ note='Logging in as %s' % username,
+ data=urlencode_postdata(login_form),
+ headers={
+ 'Referer': self._LOGIN_URL,
+ })
+
+ if not any(re.search(p, login_request) for p in (
+ r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"',
+ r'>Sign Out<')):
+ error = self._html_search_regex(
+ r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>',
+ login_request, 'alert', default=None, group='error')
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+ def _real_initialize(self):
+ self._login()
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ episode = strip_or_none(unescapeHTML(self._search_regex(
+ (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+ r'<title>(?P<title>[^<]+)</title>'), webpage, 'title',
+ default=None, group='title')))
+
+ title = strip_or_none(self._og_search_title(
+ webpage, default=None)) or episode
+
+ m3u8_url = self._search_regex(
+ r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1',
+ webpage, 'm3u8 url', default=None, group='url')
+
+ if not m3u8_url:
+ if re.search(r'<div[^>]+class=["\']non-sponsor', webpage):
+ self.raise_login_required(
+ '%s is only available for FIRST members' % display_id)
+
+ if re.search(r'<div[^>]+class=["\']golive-gate', webpage):
+ self.raise_login_required('%s is not available yet' % display_id)
+
+ raise ExtractorError('Unable to extract m3u8 URL')
+
+ formats = self._extract_m3u8_formats(
+ m3u8_url, display_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ description = strip_or_none(self._og_search_description(webpage))
+ thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage))
+
+ series = self._search_regex(
+ (r'<h2>More ([^<]+)</h2>', r'<a[^>]+>See All ([^<]+) Videos<'),
+ webpage, 'series', fatal=False)
+
+ comment_count = int_or_none(self._search_regex(
+ r'>Comments \((\d+)\)<', webpage,
+ 'comment count', fatal=False))
+
+ video_id = self._search_regex(
+ (r'containerId\s*=\s*["\']episode-(\d+)\1',
+ r'<div[^<]+id=["\']episode-(\d+)'), webpage,
+ 'video id', default=display_id)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'series': series,
+ 'episode': episode,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py
index 4896d09d6..f6454c6b0 100644
--- a/youtube_dl/extractor/rtvnh.py
+++ b/youtube_dl/extractor/rtvnh.py
@@ -9,7 +9,7 @@ class RTVNHIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.rtvnh.nl/video/131946',
- 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1',
+ 'md5': 'cdbec9f44550763c8afc96050fa747dc',
'info_dict': {
'id': '131946',
'ext': 'mp4',
@@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor):
raise ExtractorError(
'%s returned error code %d' % (self.IE_NAME, status), expected=True)
- formats = self._extract_smil_formats(
- 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False)
-
- for item in meta['source']['fb']:
- if item.get('type') == 'hls':
- formats.extend(self._extract_m3u8_formats(
- item['file'], video_id, ext='mp4', entry_protocol='m3u8_native'))
- elif item.get('type') == '':
- formats.append({'url': item['file']})
+ formats = []
+ rtmp_formats = self._extract_smil_formats(
+ 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id)
+ formats.extend(rtmp_formats)
+
+ for rtmp_format in rtmp_formats:
+ rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+ rtsp_format = rtmp_format.copy()
+ del rtsp_format['play_path']
+ del rtsp_format['ext']
+ rtsp_format.update({
+ 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+ 'url': rtmp_url.replace('rtmp://', 'rtsp://'),
+ 'protocol': 'rtsp',
+ })
+ formats.append(rtsp_format)
+ http_base_url = rtmp_url.replace('rtmp://', 'http://')
+ formats.extend(self._extract_m3u8_formats(
+ http_base_url + '/playlist.m3u8', video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ http_base_url + '/manifest.f4m',
+ video_id, f4m_id='hds', fatal=False))
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py
index 759898a49..96e43af84 100644
--- a/youtube_dl/extractor/sandia.py
+++ b/youtube_dl/extractor/sandia.py
@@ -1,18 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
-import itertools
import json
-import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
from ..utils import (
int_or_none,
- js_to_json,
mimetype2ext,
- sanitized_Request,
- unified_strdate,
)
@@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Xyce Software Training - Section 1',
'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}',
- 'upload_date': '20120904',
+ 'upload_date': '20120409',
+ 'timestamp': 1333983600,
'duration': 7794,
}
}
@@ -35,81 +30,36 @@ class SandiaIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- req = sanitized_Request(url)
- req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4')
- webpage = self._download_webpage(req, video_id)
+ presentation_data = self._download_json(
+ 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions',
+ video_id, data=json.dumps({
+ 'getPlayerOptionsRequest': {
+ 'ResourceId': video_id,
+ 'QueryString': '',
+ }
+ }), headers={
+ 'Content-Type': 'application/json; charset=utf-8',
+ })['d']['Presentation']
- js_path = self._search_regex(
- r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"',
- webpage, 'JS code URL')
- js_url = compat_urlparse.urljoin(url, js_path)
-
- js_code = self._download_webpage(
- js_url, video_id, note='Downloading player')
-
- def extract_str(key, **args):
- return self._search_regex(
- r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key),
- js_code, key, **args)
-
- def extract_data(key, **args):
- data_json = extract_str(key, **args)
- if data_json is None:
- return data_json
- return self._parse_json(
- data_json, video_id, transform_source=js_to_json)
+ title = presentation_data['Title']
formats = []
- for i in itertools.count():
- fd = extract_data('VideoUrls[%d]' % i, default=None)
- if fd is None:
- break
- formats.append({
- 'format_id': '%s' % i,
- 'format_note': fd['MimeType'].partition('/')[2],
- 'ext': mimetype2ext(fd['MimeType']),
- 'url': fd['Location'],
- 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
- })
+ for stream in presentation_data.get('Streams', []):
+ for fd in stream.get('VideoUrls', []):
+ formats.append({
+ 'format_id': fd['MediaType'],
+ 'format_note': fd['MimeType'].partition('/')[2],
+ 'ext': mimetype2ext(fd['MimeType']),
+ 'url': fd['Location'],
+ 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
+ })
self._sort_formats(formats)
- slide_baseurl = compat_urlparse.urljoin(
- url, extract_data('SlideBaseUrl'))
- slide_template = slide_baseurl + re.sub(
- r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate'))
- slides = []
- last_slide_time = 0
- for i in itertools.count(1):
- sd = extract_str('Slides[%d]' % i, default=None)
- if sd is None:
- break
- timestamp = int_or_none(self._search_regex(
- r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),',
- sd, 'slide %s timestamp' % i, fatal=False))
- slides.append({
- 'url': slide_template % i,
- 'duration': timestamp - last_slide_time,
- })
- last_slide_time = timestamp
- formats.append({
- 'format_id': 'slides',
- 'protocol': 'slideshow',
- 'url': json.dumps(slides),
- 'preference': -10000, # Downloader not yet written
- })
- self._sort_formats(formats)
-
- title = extract_data('Title')
- description = extract_data('Description', fatal=False)
- duration = int_or_none(extract_data(
- 'Duration', fatal=False), scale=1000)
- upload_date = unified_strdate(extract_data('AirDate', fatal=False))
-
return {
'id': video_id,
'title': title,
- 'description': description,
+ 'description': presentation_data.get('Description'),
'formats': formats,
- 'upload_date': upload_date,
- 'duration': duration,
+ 'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000),
+ 'duration': int_or_none(presentation_data.get('Duration'), 1000),
}
diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py
new file mode 100644
index 000000000..d3aba58a2
--- /dev/null
+++ b/youtube_dl/extractor/sixplay.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ qualities,
+ int_or_none,
+ mimetype2ext,
+ determine_ext,
+)
+
+
+class SixPlayIE(InfoExtractor):
+ _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320',
+ 'md5': '42310bffe4ba3982db112b9cd3467328',
+ 'info_dict': {
+ 'id': '11495320',
+ 'ext': 'mp4',
+ 'title': 'Jamel et ses amis au Marrakech du rire 2015',
+ 'description': 'md5:ba2149d5c321d5201b78070ee839d872',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ clip_data = self._download_json(
+ 'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id,
+ video_id)
+ video_data = clip_data['videoInfo']
+
+ quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
+ formats = []
+ for source in clip_data['sources']:
+ source_type, source_url = source.get('type'), source.get('src')
+ if not source_url or source_type == 'hls/primetime':
+ continue
+ ext = mimetype2ext(source_type) or determine_ext(source_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ source_url.replace('.m3u8', '.f4m'),
+ video_id, f4m_id='hds', fatal=False))
+ elif ext == 'mp4':
+ quality = source.get('quality')
+ formats.append({
+ 'url': source_url,
+ 'format_id': quality,
+ 'quality': quality_key(quality),
+ 'ext': ext,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_data['title'].strip(),
+ 'description': video_data.get('description'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'series': video_data.get('titlePgm'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py
index 05e1b02ad..fffc9aa22 100644
--- a/youtube_dl/extractor/skynewsarabia.py
+++ b/youtube_dl/extractor/skynewsarabia.py
@@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE):
class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE):
- IE_NAME = 'skynewsarabia:video'
+ IE_NAME = 'skynewsarabia:article'
_VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9',
diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py
new file mode 100644
index 000000000..9dc78c7d2
--- /dev/null
+++ b/youtube_dl/extractor/skysports.py
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SkySportsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
+ 'md5': 'c44a1db29f27daf9a0003e010af82100',
+ 'info_dict': {
+ 'id': '10328419',
+ 'ext': 'flv',
+ 'title': 'Bale: Its our time to shine',
+ 'description': 'md5:9fd1de3614d525f5addda32ac3c482c9',
+ },
+ 'add_ie': ['Ooyala'],
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'url': 'ooyala:%s' % self._search_regex(
+ r'data-video-id="([^"]+)"', webpage, 'ooyala id'),
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'ie_key': 'Ooyala',
+ }
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py
index 0b717a1e4..4967c1b77 100644
--- a/youtube_dl/extractor/slideshare.py
+++ b/youtube_dl/extractor/slideshare.py
@@ -9,6 +9,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ get_element_by_id,
)
@@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor):
bucket = info['jsplayer']['video_bucket']
ext = info['jsplayer']['video_extension']
video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
- description = self._html_search_regex(
+ description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
'description', fatal=False)
@@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor):
'ext': ext,
'url': video_url,
'thumbnail': info['slideshow']['pin_image_url'],
- 'description': description,
+ 'description': description.strip() if description else None,
}
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 49e5d09ae..72fe66142 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -8,10 +8,7 @@ from ..compat import (
compat_str,
compat_urllib_parse_urlencode,
)
-from ..utils import (
- ExtractorError,
- sanitized_Request,
-)
+from ..utils import ExtractorError
class SohuIE(InfoExtractor):
@@ -96,15 +93,10 @@ class SohuIE(InfoExtractor):
else:
base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
- req = sanitized_Request(base_data_url + vid_id)
-
- cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
- if cn_verification_proxy:
- req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
return self._download_json(
- req, video_id,
- 'Downloading JSON data for %s' % vid_id)
+ base_data_url + vid_id, video_id,
+ 'Downloading JSON data for %s' % vid_id,
+ headers=self.geo_verification_headers())
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 39a7aaf9d..3c552807e 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -4,8 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
from .spiegeltv import SpiegeltvIE
+from ..compat import compat_urlparse
+from ..utils import (
+ extract_attributes,
+ unified_strdate,
+ get_element_by_attribute,
+)
class SpiegelIE(InfoExtractor):
@@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor):
'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
'description': 'md5:8029d8310232196eb235d27575a8b9f4',
'duration': 49,
+ 'upload_date': '20130311',
},
}, {
'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
@@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor):
'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
'duration': 983,
+ 'upload_date': '20131115',
},
}, {
'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
@@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor):
'ext': 'mp4',
'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
+ 'upload_date': '20140904',
}
}, {
'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
@@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor):
if SpiegeltvIE.suitable(handle.geturl()):
return self.url_result(handle.geturl(), 'Spiegeltv')
- title = re.sub(r'\s+', ' ', self._html_search_regex(
- r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>',
- webpage, 'title'))
- description = self._html_search_meta('description', webpage, 'description')
+ video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default=''))
+
+ title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage)
+ description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description')
base_url = self._search_regex(
[r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'],
@@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'description': description,
+ 'description': description.strip() if description else None,
'duration': duration,
+ 'upload_date': unified_strdate(video_data.get('data-video-date')),
'formats': formats,
}
@@ -104,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor):
'ext': 'mp4',
'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
'description': 're:^Patrick Kämnitz gehört.{100,}',
+ 'upload_date': '20140825',
},
}, {
'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py
index 74d01183f..409d50304 100644
--- a/youtube_dl/extractor/srmediathek.py
+++ b/youtube_dl/extractor/srmediathek.py
@@ -9,8 +9,9 @@ from ..utils import (
class SRMediathekIE(ARDMediathekIE):
+ IE_NAME = 'sr:mediathek'
IE_DESC = 'Saarländischer Rundfunk'
- _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',
@@ -34,7 +35,9 @@ class SRMediathekIE(ARDMediathekIE):
# m3u8 download
'skip_download': True,
},
- 'expected_warnings': ['Unable to download f4m manifest']
+ }, {
+ 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py
index d5c852f52..0f8782d03 100644
--- a/youtube_dl/extractor/stitcher.py
+++ b/youtube_dl/extractor/stitcher.py
@@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor):
episode = self._parse_json(
js_to_json(self._search_regex(
- r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')),
+ r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')),
display_id)['config']['episode']
title = unescapeHTML(episode['title'])
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
index 6526a6345..1c04dfb7b 100644
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -7,13 +7,13 @@ from .common import InfoExtractor
from ..utils import (
determine_ext,
dict_get,
+ int_or_none,
+ try_get,
)
class SVTBaseIE(InfoExtractor):
- def _extract_video(self, info, video_id):
- video_info = self._get_video_info(info)
-
+ def _extract_video(self, video_info, video_id):
formats = []
for vr in video_info['videoReferences']:
player_type = vr.get('playerType')
@@ -37,6 +37,8 @@ class SVTBaseIE(InfoExtractor):
'format_id': player_type,
'url': vurl,
})
+ if not formats and video_info.get('rights', {}).get('geoBlockedSweden'):
+ self.raise_geo_restricted('This video is only available in Sweden')
self._sort_formats(formats)
subtitles = {}
@@ -52,15 +54,32 @@ class SVTBaseIE(InfoExtractor):
subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
- duration = video_info.get('materialLength')
- age_limit = 18 if video_info.get('inappropriateForChildren') else 0
+ title = video_info.get('title')
+
+ series = video_info.get('programTitle')
+ season_number = int_or_none(video_info.get('season'))
+ episode = video_info.get('episodeTitle')
+ episode_number = int_or_none(video_info.get('episodeNumber'))
+
+ duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
+ age_limit = None
+ adult = dict_get(
+ video_info, ('inappropriateForChildren', 'blockedForChildren'),
+ skip_false_values=False)
+ if adult is not None:
+ age_limit = 18 if adult else 0
return {
'id': video_id,
+ 'title': title,
'formats': formats,
'subtitles': subtitles,
'duration': duration,
'age_limit': age_limit,
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
}
@@ -85,9 +104,6 @@ class SVTIE(SVTBaseIE):
if mobj:
return mobj.group('url')
- def _get_video_info(self, info):
- return info['video']
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
widget_id = mobj.group('widget_id')
@@ -97,15 +113,15 @@ class SVTIE(SVTBaseIE):
'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
article_id)
- info_dict = self._extract_video(info, article_id)
+ info_dict = self._extract_video(info['video'], article_id)
info_dict['title'] = info['context']['title']
return info_dict
class SVTPlayIE(SVTBaseIE):
IE_DESC = 'SVT Play and Öppet arkiv'
- _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)'
+ _TESTS = [{
'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
'info_dict': {
@@ -121,25 +137,50 @@ class SVTPlayIE(SVTBaseIE):
}]
},
},
- }
-
- def _get_video_info(self, info):
- return info['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video']
+ }, {
+ # geo restricted to Sweden
+ 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- data = self._parse_json(self._search_regex(
- r'root\["__svtplay"\]\s*=\s*([^;]+);', webpage, 'embedded data'), video_id)
+ data = self._parse_json(
+ self._search_regex(
+ r'root\["__svtplay"\]\s*=\s*([^;]+);',
+ webpage, 'embedded data', default='{}'),
+ video_id, fatal=False)
thumbnail = self._og_search_thumbnail(webpage)
- info_dict = self._extract_video(data, video_id)
- info_dict.update({
- 'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
- 'thumbnail': thumbnail,
- })
-
- return info_dict
+ if data:
+ video_info = try_get(
+ data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
+ dict)
+ if video_info:
+ info_dict = self._extract_video(video_info, video_id)
+ info_dict.update({
+ 'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
+ 'thumbnail': thumbnail,
+ })
+ return info_dict
+
+ video_id = self._search_regex(
+ r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
+ webpage, 'video id', default=None)
+
+ if video_id:
+ data = self._download_json(
+ 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id)
+ info_dict = self._extract_video(data, video_id)
+ if not info_dict.get('title'):
+ info_dict['title'] = re.sub(
+ r'\s*\|\s*.+?$', '',
+ info_dict.get('episode') or self._og_search_title(webpage))
+ return info_dict
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index 4b4b740b4..2ecfd0405 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -1,50 +1,41 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
+from .mitele import MiTeleBaseIE
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_urllib_parse_urlencode,
- compat_urlparse,
-)
-from ..utils import (
- get_element_by_attribute,
- parse_duration,
- strip_jsonp,
-)
-
-class TelecincoIE(InfoExtractor):
+class TelecincoIE(MiTeleBaseIE):
IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
_VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
_TESTS = [{
'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
- 'md5': '5cbef3ad5ef17bf0d21570332d140729',
+ 'md5': '8d7b2d5f699ee2709d992a63d5cd1712',
'info_dict': {
- 'id': 'MDSVID20141015_0058',
+ 'id': 'JEA5ijCnF6p5W08A1rNKn7',
'ext': 'mp4',
- 'title': 'Con Martín Berasategui, hacer un bacalao al ...',
+ 'title': 'Bacalao con kokotxas al pil-pil',
+ 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
'duration': 662,
},
}, {
'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
- 'md5': '0a5b9f3cc8b074f50a0578f823a12694',
+ 'md5': '284393e5387b3b947b77c613ef04749a',
'info_dict': {
- 'id': 'MDSVID20150916_0128',
+ 'id': 'jn24Od1zGLG4XUZcnUnZB6',
'ext': 'mp4',
- 'title': '¿Quién es este ex futbolista con el que hablan ...',
+ 'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?',
+ 'description': 'md5:a62ecb5f1934fc787107d7b9a2262805',
'duration': 79,
},
}, {
'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
- 'md5': 'ad1bfaaba922dd4a295724b05b68f86a',
+ 'md5': '749afab6ea5a136a8806855166ae46a2',
'info_dict': {
- 'id': 'MDSVID20150513_0220',
+ 'id': 'aywerkD2Sv1vGNqq9b85Q2',
'ext': 'mp4',
'title': '#DOYLACARA. Con la trata no hay trato',
+ 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',
'duration': 50,
},
}, {
@@ -56,40 +47,16 @@ class TelecincoIE(InfoExtractor):
}]
def _real_extract(self, url):
- episode = self._match_id(url)
- webpage = self._download_webpage(url, episode)
- embed_data_json = self._search_regex(
- r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
- ).replace('\'', '"')
- embed_data = json.loads(embed_data_json)
-
- domain = embed_data['mediaUrl']
- if not domain.startswith('http'):
- # only happens in telecinco.es videos
- domain = 'http://' + domain
- info_url = compat_urlparse.urljoin(
- domain,
- compat_urllib_parse_unquote(embed_data['flashvars']['host'])
- )
- info_el = self._download_xml(info_url, episode).find('./video/info')
-
- video_link = info_el.find('videoUrl/link').text
- token_query = compat_urllib_parse_urlencode({'id': video_link})
- token_info = self._download_json(
- embed_data['flashvars']['ov_tk'] + '?' + token_query,
- episode,
- transform_source=strip_jsonp
- )
- formats = self._extract_m3u8_formats(
- token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native')
- self._sort_formats(formats)
-
- return {
- 'id': embed_data['videoId'],
- 'display_id': episode,
- 'title': info_el.find('title').text,
- 'formats': formats,
- 'description': get_element_by_attribute('class', 'text', webpage),
- 'thumbnail': info_el.find('thumb').text,
- 'duration': parse_duration(info_el.find('duration').text),
- }
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ title = self._html_search_meta(
+ ['og:title', 'twitter:title'], webpage, 'title')
+ info = self._get_player_info(url, webpage)
+ info.update({
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._html_search_meta(
+ ['og:description', 'twitter:description'],
+ webpage, 'description', fatal=False),
+ })
+ return info
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 6c848dc6f..e595c4a69 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -48,6 +48,6 @@ class TF1IE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
wat_id = self._html_search_regex(
- r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1',
+ r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
webpage, 'wat id', group='id')
return self.url_result('wat:%s' % wat_id, 'Wat')
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 07d222ae3..bb3efc4ea 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -6,6 +6,7 @@ import time
import hmac
import binascii
import hashlib
+import netrc
from .once import OnceIE
@@ -24,6 +25,9 @@ from ..utils import (
xpath_with_ns,
mimetype2ext,
find_xpath_attr,
+ unescapeHTML,
+ urlencode_postdata,
+ unified_timestamp,
)
default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -62,10 +66,11 @@ class ThePlatformBaseIE(OnceIE):
return formats, subtitles
- def get_metadata(self, path, video_id):
+ def _download_theplatform_metadata(self, path, video_id):
info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
- info = self._download_json(info_url, video_id)
+ return self._download_json(info_url, video_id)
+ def _parse_theplatform_metadata(self, info):
subtitles = {}
captions = info.get('captions')
if isinstance(captions, list):
@@ -86,6 +91,10 @@ class ThePlatformBaseIE(OnceIE):
'uploader': info.get('billingCode'),
}
+ def _extract_theplatform_metadata(self, path, video_id):
+ info = self._download_theplatform_metadata(path, video_id)
+ return self._parse_theplatform_metadata(info)
+
class ThePlatformIE(ThePlatformBaseIE):
_VALID_URL = r'''(?x)
@@ -158,6 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE):
'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
'only_matching': True,
}]
+ _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'
@classmethod
def _extract_urls(cls, webpage):
@@ -192,6 +202,96 @@ class ThePlatformIE(ThePlatformBaseIE):
sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
return '%s&sig=%s' % (url, sig)
+ def _extract_mvpd_auth(self, url, video_id, requestor_id, resource):
+ def xml_text(xml_str, tag):
+ return self._search_regex(
+ '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag)
+
+ mvpd_headers = {
+ 'ap_42': 'anonymous',
+ 'ap_11': 'Linux i686',
+ 'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0',
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0',
+ }
+
+ guid = xml_text(resource, 'guid')
+ requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {}
+ authn_token = requestor_info.get('authn_token')
+ if authn_token:
+ token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', ''))
+ if token_expires and token_expires <= time.time():
+ authn_token = None
+ if not authn_token:
+ # TODO add support for other TV Providers
+ mso_id = 'DTV'
+ login_info = netrc.netrc().authenticators(mso_id)
+ if not login_info:
+ return None
+
+ def post_form(form_page, note, data={}):
+ post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url')
+ return self._download_webpage(
+ post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
+
+ provider_redirect_page = self._download_webpage(
+ self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
+ 'Downloading Provider Redirect Page', query={
+ 'noflash': 'true',
+ 'mso_id': mso_id,
+ 'requestor_id': requestor_id,
+ 'no_iframe': 'false',
+ 'domain_name': 'adobe.com',
+ 'redirect_url': url,
+ })
+ provider_login_page = post_form(
+ provider_redirect_page, 'Downloading Provider Login Page')
+ mvpd_confirm_page = post_form(provider_login_page, 'Logging in', {
+ 'username': login_info[0],
+ 'password': login_info[2],
+ })
+ post_form(mvpd_confirm_page, 'Confirming Login')
+
+ session = self._download_webpage(
+ self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id,
+ 'Retrieving Session', data=urlencode_postdata({
+ '_method': 'GET',
+ 'requestor_id': requestor_id,
+ }), headers=mvpd_headers)
+ authn_token = unescapeHTML(xml_text(session, 'authnToken'))
+ requestor_info['authn_token'] = authn_token
+ self._downloader.cache.store('mvpd', requestor_id, requestor_info)
+
+ authz_token = requestor_info.get(guid)
+ if not authz_token:
+ authorize = self._download_webpage(
+ self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id,
+ 'Retrieving Authorization Token', data=urlencode_postdata({
+ 'resource_id': resource,
+ 'requestor_id': requestor_id,
+ 'authentication_token': authn_token,
+ 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'),
+ 'userMeta': '1',
+ }), headers=mvpd_headers)
+ authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
+ requestor_info[guid] = authz_token
+ self._downloader.cache.store('mvpd', requestor_id, requestor_info)
+
+ mvpd_headers.update({
+ 'ap_19': xml_text(authn_token, 'simpleSamlNameID'),
+ 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'),
+ })
+
+ return self._download_webpage(
+ self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize',
+ video_id, 'Retrieving Media Token', data=urlencode_postdata({
+ 'authz_token': authz_token,
+ 'requestor_id': requestor_id,
+ 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'),
+ 'hashed_guid': 'false',
+ }), headers=mvpd_headers)
+
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -265,7 +365,7 @@ class ThePlatformIE(ThePlatformBaseIE):
formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
self._sort_formats(formats)
- ret = self.get_metadata(path, video_id)
+ ret = self._extract_theplatform_metadata(path, video_id)
combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
ret.update({
'id': video_id,
@@ -339,7 +439,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
categories = [item['media$name'] for item in entry.get('media$categories', [])]
- ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+ ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
ret.update({
'id': video_id,
diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py
index c77a07989..a0bc12c81 100644
--- a/youtube_dl/extractor/threeqsdn.py
+++ b/youtube_dl/extractor/threeqsdn.py
@@ -92,12 +92,11 @@ class ThreeQSDNIE(InfoExtractor):
if not item_url or item_url in urls:
return
urls.add(item_url)
- type_ = item.get('type')
- ext = determine_ext(item_url, default_ext=None)
- if type_ == 'application/dash+xml' or ext == 'mpd':
+ ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None)
+ if ext == 'mpd':
formats.extend(self._extract_mpd_formats(
item_url, video_id, mpd_id='mpd', fatal=False))
- elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8':
+ elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
item_url, video_id, 'mp4',
entry_protocol='m3u8' if live else 'm3u8_native',
@@ -111,7 +110,7 @@ class ThreeQSDNIE(InfoExtractor):
formats.append({
'url': item_url,
'format_id': item.get('quality'),
- 'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext,
+ 'ext': 'mp4' if item_url.startswith('rtsp') else ext,
'vcodec': 'none' if stream_type == 'audio' else None,
})
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py
index 4797d1310..54c2d0aa6 100644
--- a/youtube_dl/extractor/toutv.py
+++ b/youtube_dl/extractor/toutv.py
@@ -1,74 +1,41 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- unified_strdate,
-)
+from ..utils import int_or_none
class TouTvIE(InfoExtractor):
IE_NAME = 'tou.tv'
- _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+ _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+/S[0-9]+E[0-9]+)'
_TEST = {
- 'url': 'http://www.tou.tv/30-vies/S04E41',
+ 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17',
'info_dict': {
- 'id': '30-vies_S04E41',
+ 'id': '122017',
'ext': 'mp4',
- 'title': '30 vies Saison 4 / Épisode 41',
- 'description': 'md5:da363002db82ccbe4dafeb9cab039b09',
- 'age_limit': 8,
- 'uploader': 'Groupe des Nouveaux Médias',
- 'duration': 1296,
- 'upload_date': '20131118',
- 'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+ 'title': 'Saison 2015 Épisode 17',
+ 'description': 'La photo de famille 2',
+ 'upload_date': '20100717',
},
'params': {
- 'skip_download': True, # Requires rtmpdump
+ # m3u8 download
+ 'skip_download': True,
},
- 'skip': 'Only available in Canada'
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
-
- mediaId = self._search_regex(
- r'"idMedia":\s*"([^"]+)"', webpage, 'media ID')
-
- streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId
- streams_doc = self._download_xml(
- streams_url, video_id, note='Downloading stream list')
-
- video_url = next(n.text
- for n in streams_doc.findall('.//choice/url')
- if '//ad.doubleclick' not in n.text)
- if video_url.endswith('/Unavailable.flv'):
- raise ExtractorError(
- 'Access to this video is blocked from outside of Canada',
- expected=True)
-
- duration_str = self._html_search_meta(
- 'video:duration', webpage, 'duration')
- duration = int(duration_str) if duration_str else None
- upload_date_str = self._html_search_meta(
- 'video:release_date', webpage, 'upload date')
- upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+ path = self._match_id(url)
+ metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path)
+ video_id = metadata['IdMedia']
+ details = metadata['Details']
+ title = details['OriginalTitle']
return {
+ '_type': 'url_transparent',
+ 'url': 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id),
'id': video_id,
- 'title': self._og_search_title(webpage),
- 'url': video_url,
- 'description': self._og_search_description(webpage),
- 'uploader': self._dc_search_uploader(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'age_limit': self._media_rating_search(webpage),
- 'duration': duration,
- 'upload_date': upload_date,
- 'ext': 'mp4',
+ 'title': title,
+ 'thumbnail': details.get('ImageUrl'),
+ 'duration': int_or_none(details.get('LengthInSeconds')),
}
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index a4997cb89..5070082da 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -4,6 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ clean_html,
+ get_element_by_attribute,
+ ExtractorError,
+)
class TVPIE(InfoExtractor):
@@ -21,7 +27,7 @@ class TVPIE(InfoExtractor):
},
}, {
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
- 'md5': 'c3b15ed1af288131115ff17a17c19dda',
+ 'md5': 'b0005b542e5b4de643a9690326ab1257',
'info_dict': {
'id': '17916176',
'ext': 'mp4',
@@ -53,6 +59,11 @@ class TVPIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
+ error_message = get_element_by_attribute('class', 'msg error', webpage)
+ if error_message:
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, clean_html(error_message)), expected=True)
+
title = self._search_regex(
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
webpage, 'title', group='title')
@@ -66,24 +77,50 @@ class TVPIE(InfoExtractor):
r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
video_url = self._search_regex(
- r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
- if not video_url:
+ r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
+ 'formats', group='url', default=None)
+ if not video_url or 'material_niedostepny.mp4' in video_url:
video_url = self._download_json(
'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
video_id)['video_url']
- ext = video_url.rsplit('.', 1)[-1]
- if ext != 'ism/manifest':
- if '/' in ext:
- ext = 'mp4'
+ formats = []
+ video_url_base = self._search_regex(
+ r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
+ video_url, 'video base url', default=None)
+ if video_url_base:
+ # TODO: Current DASH formats are broken - $Time$ pattern in
+ # <SegmentTemplate> not implemented yet
+ # formats.extend(self._extract_mpd_formats(
+ # video_url_base + '.ism/video.mpd',
+ # video_id, mpd_id='dash', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ video_url_base + '.ism/video.f4m',
+ video_id, f4m_id='hds', fatal=False))
+ m3u8_formats = self._extract_m3u8_formats(
+ video_url_base + '.ism/video.m3u8', video_id,
+ 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ self._sort_formats(m3u8_formats)
+ m3u8_formats = list(filter(
+ lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ m3u8_formats))
+ formats.extend(m3u8_formats)
+ for i, m3u8_format in enumerate(m3u8_formats, 2):
+ http_url = '%s-%d.mp4' % (video_url_base, i)
+ if self._is_valid_url(http_url, video_id):
+ f = m3u8_format.copy()
+ f.update({
+ 'url': http_url,
+ 'format_id': f['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
+ })
+ formats.append(f)
+ else:
formats = [{
'format_id': 'direct',
'url': video_url,
- 'ext': ext,
+ 'ext': determine_ext(video_url, 'mp4'),
}]
- else:
- m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
- formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py
index f3198fb85..7a9386cde 100644
--- a/youtube_dl/extractor/tweakers.py
+++ b/youtube_dl/extractor/tweakers.py
@@ -1,25 +1,62 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ determine_ext,
+ mimetype2ext,
+)
class TweakersIE(InfoExtractor):
_VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
_TEST = {
'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
- 'md5': '3147e4ddad366f97476a93863e4557c8',
+ 'md5': 'fe73e417c093a788e0160c4025f88b15',
'info_dict': {
'id': '9926',
'ext': 'mp4',
'title': 'New Nintendo 3DS XL - Op alle fronten beter',
- 'description': 'md5:f97324cc71e86e11c853f0763820e3ba',
+ 'description': 'md5:3789b21fed9c0219e9bcaacd43fab280',
'thumbnail': 're:^https?://.*\.jpe?g$',
'duration': 386,
+ 'uploader_id': 's7JeEm',
}
}
def _real_extract(self, url):
- playlist_id = self._match_id(url)
- entries = self._extract_xspf_playlist(
- 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id)
- return self.playlist_result(entries, playlist_id)
+ video_id = self._match_id(url)
+ video_data = self._download_json(
+ 'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id,
+ video_id)['items'][0]
+
+ title = video_data['title']
+
+ formats = []
+ for location in video_data.get('locations', {}).get('progressive', []):
+ format_id = location.get('label')
+ width = int_or_none(location.get('width'))
+ height = int_or_none(location.get('height'))
+ for source in location.get('sources', []):
+ source_url = source.get('src')
+ if not source_url:
+ continue
+ ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+ formats.append({
+ 'format_id': format_id,
+ 'url': source_url,
+ 'width': width,
+ 'height': height,
+ 'ext': ext,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'thumbnail': video_data.get('poster'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'uploader_id': video_data.get('account'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 20919774d..67b1277cc 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -29,7 +29,7 @@ class TwitchBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'
_API_BASE = 'https://api.twitch.tv'
- _USHER_BASE = 'http://usher.twitch.tv'
+ _USHER_BASE = 'https://usher.ttvnw.net'
_LOGIN_URL = 'http://www.twitch.tv/login'
_NETRC_MACHINE = 'twitch'
diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py
new file mode 100644
index 000000000..ce3bf6b02
--- /dev/null
+++ b/youtube_dl/extractor/urplay.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class URPlayIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde',
+ 'md5': '15ca67b63fd8fb320ac2bcd854bad7b6',
+ 'info_dict': {
+ 'id': '190031',
+ 'ext': 'mp4',
+ 'title': 'Tripp, Trapp, Träd : Sovkudde',
+ 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ urplayer_data = self._parse_json(self._search_regex(
+ r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
+ host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
+
+ formats = []
+ for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
+ file_rtmp = urplayer_data.get('file_rtmp' + quality_attr)
+ if file_rtmp:
+ formats.append({
+ 'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp),
+ 'format_id': quality + '-rtmp',
+ 'ext': 'flv',
+ 'preference': preference,
+ })
+ file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
+ if file_http:
+ file_http_base_url = 'http://%s/%s' % (host, file_http)
+ formats.extend(self._extract_f4m_formats(
+ file_http_base_url + 'manifest.f4m', video_id,
+ preference, '%s-hds' % quality, fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ file_http_base_url + 'playlist.m3u8', video_id, 'mp4',
+ 'm3u8_native', preference, '%s-hls' % quality, fatal=False))
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for subtitle in urplayer_data.get('subtitles', []):
+ subtitle_url = subtitle.get('file')
+ kind = subtitle.get('kind')
+ if not subtitle_url or kind and kind != 'captions':
+ continue
+ subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
+ 'url': subtitle_url,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': urplayer_data['title'],
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': urplayer_data.get('image'),
+ 'series': urplayer_data.get('series_title'),
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py
new file mode 100644
index 000000000..e7ac5a842
--- /dev/null
+++ b/youtube_dl/extractor/vidbit.py
@@ -0,0 +1,84 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ remove_end,
+ unified_strdate,
+)
+
+
+class VidbitIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2',
+ 'md5': '1a34b7f14defe3b8fafca9796892924d',
+ 'info_dict': {
+ 'id': 'jkL2yDOEq2',
+ 'ext': 'mp4',
+ 'title': 'Intro to VidBit',
+ 'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'upload_date': '20160618',
+ 'view_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id)
+
+ video_url, title = [None] * 2
+
+ config = self._parse_json(self._search_regex(
+ r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'),
+ video_id, transform_source=js_to_json)
+ if config:
+ if config.get('file'):
+ video_url = compat_urlparse.urljoin(url, config['file'])
+ title = config.get('title')
+
+ if not video_url:
+ video_url = compat_urlparse.urljoin(url, self._search_regex(
+ r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+ webpage, 'video URL', group='url'))
+
+ if not title:
+ title = remove_end(
+ self._html_search_regex(
+ (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'),
+ webpage, 'title', default=None) or self._og_search_title(webpage),
+ ' - VidBit')
+
+ description = self._html_search_meta(
+ ('description', 'og:description', 'twitter:description'),
+ webpage, 'description')
+
+ upload_date = unified_strdate(self._html_search_meta(
+ 'datePublished', webpage, 'upload date'))
+
+ view_count = int_or_none(self._search_regex(
+ r'<strong>(\d+)</strong> views',
+ webpage, 'view count', fatal=False))
+ comment_count = int_or_none(self._search_regex(
+ r'id=["\']cmt_num["\'][^>]*>\((\d+)\)',
+ webpage, 'comment count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'upload_date': upload_date,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ }
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index c52986af6..7e854f326 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -16,6 +16,7 @@ from ..utils import (
ExtractorError,
InAdvancePagedList,
int_or_none,
+ NO_DEFAULT,
RegexNotFoundError,
sanitized_Request,
smuggle_url,
@@ -56,6 +57,26 @@ class VimeoBaseInfoExtractor(InfoExtractor):
self._set_vimeo_cookie('vuid', vuid)
self._download_webpage(login_request, None, False, 'Wrong login info')
+ def _verify_video_password(self, url, video_id, webpage):
+ password = self._downloader.params.get('videopassword')
+ if password is None:
+ raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ data = urlencode_postdata({
+ 'password': password,
+ 'token': token,
+ })
+ if url.startswith('http://'):
+ # vimeo only supports https now, but the user can give an http url
+ url = url.replace('http://', 'https://')
+ password_request = sanitized_Request(url + '/password', data)
+ password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ password_request.add_header('Referer', url)
+ self._set_vimeo_cookie('vuid', vuid)
+ return self._download_webpage(
+ password_request, video_id,
+ 'Verifying the password', 'Wrong password')
+
def _extract_xsrft_and_vuid(self, webpage):
xsrft = self._search_regex(
r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
@@ -146,7 +167,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
\.
)?
vimeo(?P<pro>pro)?\.com/
- (?!channels/[^/?#]+/?(?:$|[?#])|[^/]+/review/|(?:album|ondemand)/)
+ (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)?
(?:
(?:
@@ -227,8 +248,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
{
'url': 'http://vimeo.com/channels/keypeele/75629013',
'md5': '2f86a05afe9d7abc0b9126d229bbe15d',
- 'note': 'Video is freely available via original URL '
- 'and protected with password when accessed via http://vimeo.com/75629013',
'info_dict': {
'id': '75629013',
'ext': 'mp4',
@@ -272,7 +291,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
{
# contains original format
'url': 'https://vimeo.com/33951933',
- 'md5': '53c688fa95a55bf4b7293d37a89c5c53',
+ 'md5': '2d9f5475e0537f013d0073e812ab89e6',
'info_dict': {
'id': '33951933',
'ext': 'mp4',
@@ -285,6 +304,29 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
},
{
+ # only available via https://vimeo.com/channels/tributes/6213729 and
+ # not via https://vimeo.com/6213729
+ 'url': 'https://vimeo.com/channels/tributes/6213729',
+ 'info_dict': {
+ 'id': '6213729',
+ 'ext': 'mp4',
+ 'title': 'Vimeo Tribute: The Shining',
+ 'uploader': 'Casey Donahue',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue',
+ 'uploader_id': 'caseydonahue',
+ 'upload_date': '20090821',
+ 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Unable to download JSON metadata'],
+ },
+ {
+ 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
+ 'only_matching': True,
+ },
+ {
'url': 'https://vimeo.com/109815029',
'note': 'Video not completely processed, "failed" seed status',
'only_matching': True,
@@ -294,6 +336,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'only_matching': True,
},
{
+ 'url': 'https://vimeo.com/album/2632481/video/79010983',
+ 'only_matching': True,
+ },
+ {
# source file returns 403: Forbidden
'url': 'https://vimeo.com/7809605',
'only_matching': True,
@@ -318,26 +364,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
if mobj:
return mobj.group(1)
-
- def _verify_video_password(self, url, video_id, webpage):
- password = self._downloader.params.get('videopassword')
- if password is None:
- raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
- token, vuid = self._extract_xsrft_and_vuid(webpage)
- data = urlencode_postdata({
- 'password': password,
- 'token': token,
- })
- if url.startswith('http://'):
- # vimeo only supports https now, but the user can give an http url
- url = url.replace('http://', 'https://')
- password_request = sanitized_Request(url + '/password', data)
- password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- password_request.add_header('Referer', url)
- self._set_vimeo_cookie('vuid', vuid)
- return self._download_webpage(
- password_request, video_id,
- 'Verifying the password', 'Wrong password')
+ # Look more for non-standard embedded Vimeo player
+ mobj = re.search(
+ r'<video[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage)
+ if mobj:
+ return mobj.group('url')
def _verify_player_video_password(self, url, video_id):
password = self._downloader.params.get('videopassword')
@@ -369,7 +400,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
orig_url = url
if mobj.group('pro') or mobj.group('player'):
url = 'https://player.vimeo.com/video/' + video_id
- else:
+ elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
@@ -630,8 +661,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
webpage = self._login_list_password(page_url, list_id, webpage)
yield self._extract_list_title(webpage)
- for video_id in re.findall(r'id="clip_(\d+?)"', webpage):
- yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
+ # Try extracting href first since not all videos are available via
+ # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729)
+ clips = re.findall(
+ r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage)
+ if clips:
+ for video_id, video_url in clips:
+ yield self.url_result(
+ compat_urlparse.urljoin(base_url, video_url),
+ VimeoIE.ie_key(), video_id=video_id)
+ # More relaxed fallback
+ else:
+ for video_id in re.findall(r'id=["\']clip_(\d+)', webpage):
+ yield self.url_result(
+ 'https://vimeo.com/%s' % video_id,
+ VimeoIE.ie_key(), video_id=video_id)
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
break
@@ -668,7 +712,7 @@ class VimeoUserIE(VimeoChannelIE):
class VimeoAlbumIE(VimeoChannelIE):
IE_NAME = 'vimeo:album'
- _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'
+ _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
_TESTS = [{
'url': 'https://vimeo.com/album/2632481',
@@ -688,6 +732,13 @@ class VimeoAlbumIE(VimeoChannelIE):
'params': {
'videopassword': 'youtube-dl',
}
+ }, {
+ 'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail',
+ 'only_matching': True,
+ }, {
+ # TODO: respect page number
+ 'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail',
+ 'only_matching': True,
}]
def _page_url(self, base_url, pagenum):
@@ -746,12 +797,39 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
'uploader_id': 'user22258446',
}
+ }, {
+ 'note': 'Password protected',
+ 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
+ 'info_dict': {
+ 'id': '138823582',
+ 'ext': 'mp4',
+ 'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1',
+ 'uploader': 'TMB',
+ 'uploader_id': 'user37284429',
+ },
+ 'params': {
+ 'videopassword': 'holygrail',
+ },
}]
+ def _real_initialize(self):
+ self._login()
+
+ def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
+ webpage = self._download_webpage(webpage_url, video_id)
+ config_url = self._html_search_regex(
+ r'data-config-url="([^"]+)"', webpage, 'config URL',
+ default=NO_DEFAULT if video_password_verified else None)
+ if config_url is None:
+ self._verify_video_password(webpage_url, video_id, webpage)
+ config_url = self._get_config_url(
+ webpage_url, video_id, video_password_verified=True)
+ return config_url
+
def _real_extract(self, url):
video_id = self._match_id(url)
- config = self._download_json(
- 'https://player.vimeo.com/video/%s/config' % video_id, video_id)
+ config_url = self._get_config_url(url, video_id)
+ config = self._download_json(config_url, video_id)
info_dict = self._parse_config(config, video_id)
self._vimeo_sort_formats(info_dict['formats'])
info_dict['id'] = video_id
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index a6a6cc479..0183f052a 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -24,6 +24,7 @@ class VineIE(InfoExtractor):
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
'uploader_id': '76',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -39,6 +40,7 @@ class VineIE(InfoExtractor):
'upload_date': '20140815',
'uploader': 'Mars Ruiz',
'uploader_id': '1102363502380728320',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -54,6 +56,7 @@ class VineIE(InfoExtractor):
'upload_date': '20130430',
'uploader': 'Z3k3',
'uploader_id': '936470460173008896',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -71,6 +74,7 @@ class VineIE(InfoExtractor):
'upload_date': '20150705',
'uploader': 'Pimry_zaa',
'uploader_id': '1135760698325307392',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -86,10 +90,12 @@ class VineIE(InfoExtractor):
data = self._parse_json(
self._search_regex(
- r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id,
+ r'window\.POST_DATA\s*=\s*({.+?});\s*</script>',
webpage, 'vine data'),
video_id)
+ data = data[list(data.keys())[0]]
+
formats = [{
'format_id': '%(format)s-%(rate)s' % f,
'vcodec': f.get('format'),
@@ -109,6 +115,7 @@ class VineIE(InfoExtractor):
'upload_date': unified_strdate(data.get('created')),
'uploader': username,
'uploader_id': data.get('userIdStr'),
+ 'view_count': int_or_none(data.get('loops', {}).get('count')),
'like_count': int_or_none(data.get('likes', {}).get('count')),
'comment_count': int_or_none(data.get('comments', {}).get('count')),
'repost_count': int_or_none(data.get('reposts', {}).get('count')),
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 79c819bc3..758d9c86b 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
import json
+import sys
from .common import InfoExtractor
from ..compat import compat_str
@@ -10,7 +11,6 @@ from ..utils import (
ExtractorError,
int_or_none,
orderedSet,
- sanitized_Request,
str_to_int,
unescapeHTML,
unified_strdate,
@@ -27,12 +27,12 @@ class VKIE(InfoExtractor):
https?://
(?:
(?:
- (?:m\.)?vk\.com/video_|
+ (?:(?:m|new)\.)?vk\.com/video_|
(?:www\.)?daxab.com/
)
ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
(?:
- (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
+ (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
(?:www\.)?daxab.com/embed/
)
(?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
@@ -182,6 +182,10 @@ class VKIE(InfoExtractor):
# pladform embed
'url': 'https://vk.com/video-76116461_171554880',
'only_matching': True,
+ },
+ {
+ 'url': 'http://new.vk.com/video205387401_165548505',
+ 'only_matching': True,
}
]
@@ -190,7 +194,7 @@ class VKIE(InfoExtractor):
if username is None:
return
- login_page = self._download_webpage(
+ login_page, url_handle = self._download_webpage_handle(
'https://vk.com', None, 'Downloading login page')
login_form = self._hidden_inputs(login_page)
@@ -200,11 +204,26 @@ class VKIE(InfoExtractor):
'pass': password.encode('cp1251'),
})
- request = sanitized_Request(
- 'https://login.vk.com/?act=login',
- urlencode_postdata(login_form))
+ # https://new.vk.com/ serves two identical remixlhk cookies in the Set-Cookie
+ # header and expects the first one to be set rather than the second (see
+ # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
+ # Per RFC 6265 the later cookie should win in the cookie store,
+ # which is what actually happens.
+ # We work around this VK issue by resetting the remixlhk cookie to
+ # the first value manually.
+ cookies = url_handle.headers.get('Set-Cookie')
+ if sys.version_info[0] >= 3:
+ cookies = cookies.encode('iso-8859-1')
+ cookies = cookies.decode('utf-8')
+ remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
+ if remixlhk:
+ value, domain = remixlhk.groups()
+ self._set_cookie(domain, 'remixlhk', value)
+
login_page = self._download_webpage(
- request, None, note='Logging in as %s' % username)
+ 'https://login.vk.com/?act=login', None,
+ note='Logging in as %s' % username,
+ data=urlencode_postdata(login_form))
if re.search(r'onLoginFailed', login_page):
raise ExtractorError(
@@ -339,7 +358,7 @@ class VKIE(InfoExtractor):
class VKUserVideosIE(InfoExtractor):
IE_NAME = 'vk:uservideos'
IE_DESC = "VK - User's Videos"
- _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
+ _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
_TEMPLATE_URL = 'https://vk.com/videos'
_TESTS = [{
'url': 'http://vk.com/videos205387401',
@@ -354,6 +373,12 @@ class VKUserVideosIE(InfoExtractor):
}, {
'url': 'http://vk.com/videos-97664626?section=all',
'only_matching': True,
+ }, {
+ 'url': 'http://m.vk.com/videos205387401',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://new.vk.com/videos205387401',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
index 8e35f24e8..bec7ab327 100644
--- a/youtube_dl/extractor/vrt.py
+++ b/youtube_dl/extractor/vrt.py
@@ -25,7 +25,8 @@ class VRTIE(InfoExtractor):
'timestamp': 1414271750.949,
'upload_date': '20141025',
'duration': 929,
- }
+ },
+ 'skip': 'HTTP Error 404: Not Found',
},
# sporza.be
{
@@ -39,7 +40,8 @@ class VRTIE(InfoExtractor):
'timestamp': 1413835980.560,
'upload_date': '20141020',
'duration': 3238,
- }
+ },
+ 'skip': 'HTTP Error 404: Not Found',
},
# cobra.be
{
@@ -53,16 +55,39 @@ class VRTIE(InfoExtractor):
'timestamp': 1413967500.494,
'upload_date': '20141022',
'duration': 661,
- }
+ },
+ 'skip': 'HTTP Error 404: Not Found',
},
{
# YouTube video
'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957',
- 'only_matching': True,
+ 'md5': 'b8b93da1df1cea6c8556255a796b7d61',
+ 'info_dict': {
+ 'id': 'Wji-BZ0oCwg',
+ 'ext': 'mp4',
+ 'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer',
+ 'description': 'md5:8e468944dce15567a786a67f74262583',
+ 'uploader': 'Star Wars',
+ 'uploader_id': 'starwars',
+ 'upload_date': '20160407',
+ },
+ 'add_ie': ['Youtube'],
},
{
'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055',
- 'only_matching': True,
+ 'md5': '',
+ 'info_dict': {
+ 'id': '2377055',
+ 'ext': 'mp4',
+ 'title': 'Cafe Derby',
+ 'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.',
+ 'upload_date': '20150626',
+ 'timestamp': 1435305240.769,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
}
]
@@ -98,6 +123,32 @@ class VRTIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
src, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ src.replace('playlist.m3u8', 'manifest.f4m'),
+ video_id, f4m_id='hds', fatal=False))
+ if 'data-video-geoblocking="true"' not in webpage:
+ rtmp_formats = self._extract_smil_formats(
+ src.replace('playlist.m3u8', 'jwplayer.smil'),
+ video_id, fatal=False)
+ formats.extend(rtmp_formats)
+ for rtmp_format in rtmp_formats:
+ rtmp_format_c = rtmp_format.copy()
+ rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+ del rtmp_format_c['play_path']
+ del rtmp_format_c['ext']
+ http_format = rtmp_format_c.copy()
+ http_format.update({
+ 'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''),
+ 'format_id': rtmp_format['format_id'].replace('rtmp', 'http'),
+ 'protocol': 'http',
+ })
+ rtsp_format = rtmp_format_c.copy()
+ rtsp_format.update({
+ 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
+ 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+ 'protocol': 'rtsp',
+ })
+ formats.extend([http_format, rtsp_format])
else:
formats.extend(self._extract_f4m_formats(
'%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False))
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 5a41f8ffa..bcb140305 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -6,17 +6,23 @@ from ..compat import compat_urllib_parse_unquote
class XNXXIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
- _TEST = {
- 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
- 'md5': '0831677e2b4761795f68d417e0b7b445',
+ _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
+ _TESTS = [{
+ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
+ 'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0',
'info_dict': {
- 'id': '1135332',
+ 'id': '55awb78',
'ext': 'flv',
- 'title': 'lida » Naked Funny Actress (5)',
+ 'title': 'Skyrim Test Video',
'age_limit': 18,
- }
- }
+ },
+ }, {
+ 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.xnxx.com/video-55awb78/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 4075b8a4f..83bc1fef2 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -4,17 +4,23 @@ import itertools
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
orderedSet,
+ parse_duration,
sanitized_Request,
str_to_int,
)
class XTubeIE(InfoExtractor):
- _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ xtube:|
+ https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-)
+ )
+ (?P<id>[^/?&#]+)
+ '''
_TESTS = [{
# old URL schema
@@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor):
'description': 'contains:an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
+ 'view_count': int,
+ 'comment_count': int,
'age_limit': 18,
}
}, {
@@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')
webpage = self._download_webpage(req, display_id)
- flashvars = self._parse_json(
- self._search_regex(
- r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'),
- video_id)['flashvars']
-
- title = flashvars.get('title') or self._search_regex(
- r'<h1>([^<]+)</h1>', webpage, 'title')
- video_url = compat_urllib_parse_unquote(flashvars['video_url'])
- duration = int_or_none(flashvars.get('video_duration'))
-
- uploader = self._search_regex(
- r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
- webpage, 'uploader', fatal=False)
+ sources = self._parse_json(self._search_regex(
+ r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id)
+
+ formats = []
+ for format_id, format_url in sources.items():
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': int_or_none(format_id),
+ })
+ self._sort_formats(formats)
+
+ title = self._search_regex(
+ (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
+ webpage, 'title', group='title')
description = self._search_regex(
r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
+ uploader = self._search_regex(
+ (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
+ r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
+ webpage, 'uploader', fatal=False)
+ duration = parse_duration(self._search_regex(
+ r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>',
+ webpage, 'duration', fatal=False))
view_count = str_to_int(self._search_regex(
r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>',
webpage, 'view count', fatal=False))
@@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor):
return {
'id': video_id,
'display_id': display_id,
- 'url': video_url,
'title': title,
'description': description,
'uploader': uploader,
@@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor):
'view_count': view_count,
'comment_count': comment_count,
'age_limit': 18,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
index 0be8932ad..a66daee46 100644
--- a/youtube_dl/extractor/xuite.py
+++ b/youtube_dl/extractor/xuite.py
@@ -68,6 +68,20 @@ class XuiteIE(InfoExtractor):
},
'skip': 'Video removed',
}, {
+ # Video with encoded media id
+ # from http://forgetfulbc.blogspot.com/2016/06/date.html
+ 'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
+ 'info_dict': {
+ 'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==',
+ 'ext': 'mp4',
+ 'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
+ 'description': 'md5:f0abdcb69df300f522a5442ef3146f2a',
+ 'timestamp': 1466160960,
+ 'upload_date': '20160617',
+ 'uploader': 'B.C. & Lowy',
+ 'uploader_id': '232279340',
+ },
+ }, {
'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',
'only_matching': True,
}]
@@ -80,10 +94,9 @@ class XuiteIE(InfoExtractor):
def base64_encode_utf8(data):
return base64.b64encode(data.encode('utf-8')).decode('utf-8')
- def _extract_flv_config(self, media_id):
- base64_media_id = self.base64_encode_utf8(media_id)
+ def _extract_flv_config(self, encoded_media_id):
flv_config = self._download_xml(
- 'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,
+ 'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id,
'flv config')
prop_dict = {}
for prop in flv_config.findall('./property'):
@@ -108,9 +121,14 @@ class XuiteIE(InfoExtractor):
'%s returned error: %s' % (self.IE_NAME, error_msg),
expected=True)
- video_id = self._html_search_regex(
- r'data-mediaid="(\d+)"', webpage, 'media id')
- flv_config = self._extract_flv_config(video_id)
+ encoded_media_id = self._search_regex(
+ r'attributes\.name\s*=\s*"([^"]+)"', webpage,
+ 'encoded media id', default=None)
+ if encoded_media_id is None:
+ video_id = self._html_search_regex(
+ r'data-mediaid="(\d+)"', webpage, 'media id')
+ encoded_media_id = self.base64_encode_utf8(video_id)
+ flv_config = self._extract_flv_config(encoded_media_id)
FORMATS = {
'audio': 'mp3',
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 927a964a4..b0679dfb7 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -19,6 +19,7 @@ from ..utils import (
mimetype2ext,
)
+from .brightcove import BrightcoveNewIE
from .nbc import NBCSportsVPlayerIE
@@ -227,7 +228,12 @@ class YahooIE(InfoExtractor):
# Look for NBCSports iframes
nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
if nbc_sports_url:
- return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+ return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key())
+
+ # Look for Brightcove New Studio embeds
+ bc_url = BrightcoveNewIE._extract_url(webpage)
+ if bc_url:
+ return self.url_result(bc_url, BrightcoveNewIE.ie_key())
# Query result is often embedded in webpage as JSON. Sometimes explicit requests
# to video API results in a failure with geo restriction reason therefore using
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 147608ebe..e37f237c7 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -16,7 +16,6 @@ from ..compat import (
from ..utils import (
ExtractorError,
get_element_by_attribute,
- sanitized_Request,
)
@@ -218,14 +217,10 @@ class YoukuIE(InfoExtractor):
headers = {
'Referer': req_url,
}
+ headers.update(self.geo_verification_headers())
self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
- req = sanitized_Request(req_url, headers=headers)
- cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
- if cn_verification_proxy:
- req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
- raw_data = self._download_json(req, video_id, note=note)
+ raw_data = self._download_json(req_url, video_id, note=note, headers=headers)
return raw_data['data']
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 00dd602ff..8aa7dfc41 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -501,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'youtube_include_dash_manifest': True,
'format': '141',
},
+ 'skip': 'format 141 not served anymore',
},
# DASH manifest with encrypted signature
{
@@ -517,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
'params': {
'youtube_include_dash_manifest': True,
- 'format': '141',
+ 'format': '141/bestaudio[ext=m4a]',
},
},
# JS player signature function name containing $
@@ -537,7 +538,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
'params': {
'youtube_include_dash_manifest': True,
- 'format': '141',
+ 'format': '141/bestaudio[ext=m4a]',
},
},
# Controversy video
@@ -618,7 +619,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
'license': 'Standard YouTube License',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
- 'uploader': 'Olympics',
+ 'uploader': 'Olympic',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
},
'params': {
@@ -671,7 +672,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
'uploader': 'dorappi2000',
'license': 'Standard YouTube License',
- 'formats': 'mincount:33',
+ 'formats': 'mincount:32',
},
},
# DASH manifest with segment_list
@@ -691,7 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'youtube_include_dash_manifest': True,
'format': '135', # bestvideo
- }
+ },
+ 'skip': 'This live event has ended.',
},
{
# Multifeed videos (multiple cameras), URL is for Main Camera
@@ -762,6 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
},
'playlist_count': 2,
+ 'skip': 'Not multifeed anymore',
},
{
'url': 'http://vid.plus/FlRa-iH7PGw',
@@ -814,6 +817,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'This video does not exist.',
},
{
# Video licensed under Creative Commons
@@ -1331,7 +1335,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:[a-zA-Z-]+="[^"]*"\s+)*?
(?:title|href)="([^"]+)"\s+
(?:[a-zA-Z-]+="[^"]*"\s+)*?
- class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
+ class="[^"]*"[^>]*>
[^<]+\.{3}\s*
</a>
''', r'\1', video_description)
@@ -1726,6 +1730,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
+class YoutubeSharedVideoIE(InfoExtractor):
+ _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?ci=(?P<id>[0-9A-Za-z_-]{11})'
+ IE_NAME = 'youtube:shared'
+
+ _TEST = {
+ 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
+ 'info_dict': {
+ 'id': 'uPDB5I9wfp8',
+ 'ext': 'webm',
+ 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
+ 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
+ 'upload_date': '20160219',
+ 'uploader': 'Pocoyo - Português (BR)',
+ 'uploader_id': 'PocoyoBrazil',
+ },
+ 'add_ie': ['Youtube'],
+ 'params': {
+ # There are already too many Youtube downloads
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ real_video_id = self._html_search_meta(
+ 'videoId', webpage, 'YouTube video id', fatal=True)
+
+ return self.url_result(real_video_id, YoutubeIE.ie_key())
+
+
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com playlists'
_VALID_URL = r"""(?x)(?:
@@ -1941,10 +1978,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
else super(YoutubeChannelIE, cls).suitable(url))
+ def _build_template_url(self, url, channel_id):
+ return self._TEMPLATE_URL % channel_id
+
def _real_extract(self, url):
channel_id = self._match_id(url)
- url = self._TEMPLATE_URL % channel_id
+ url = self._build_template_url(url, channel_id)
# Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
# Workaround by extracting as a playlist if managed to obtain channel playlist URL
@@ -1958,9 +1998,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
channel_playlist_id = self._html_search_meta(
'channelId', channel_page, 'channel id', default=None)
if not channel_playlist_id:
- channel_playlist_id = self._search_regex(
- r'data-(?:channel-external-|yt)id="([^"]+)"',
- channel_page, 'channel id', default=None)
+ channel_url = self._html_search_meta(
+ ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
+ channel_page, 'channel url', default=None)
+ if channel_url:
+ channel_playlist_id = self._search_regex(
+ r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
+ channel_url, 'channel id', default=None)
if channel_playlist_id and channel_playlist_id.startswith('UC'):
playlist_id = 'UU' + channel_playlist_id[2:]
return self.url_result(
@@ -1983,20 +2027,39 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
for video_id, video_title in self.extract_videos_from_page(channel_page)]
return self.playlist_result(entries, channel_id)
+ try:
+ next(self._entries(channel_page, channel_id))
+ except StopIteration:
+ alert_message = self._html_search_regex(
+ r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
+ channel_page, 'alert', default=None, group='alert')
+ if alert_message:
+ raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
+
return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
class YoutubeUserIE(YoutubeChannelIE):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
- _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
+ _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+ _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
IE_NAME = 'youtube:user'
_TESTS = [{
'url': 'https://www.youtube.com/user/TheLinuxFoundation',
'playlist_mincount': 320,
'info_dict': {
- 'title': 'TheLinuxFoundation',
+ 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
+ 'title': 'Uploads from The Linux Foundation',
+ }
+ }, {
+ # Only available via https://www.youtube.com/c/12minuteathlete/videos
+ # but not https://www.youtube.com/user/12minuteathlete/videos
+ 'url': 'https://www.youtube.com/c/12minuteathlete/videos',
+ 'playlist_mincount': 249,
+ 'info_dict': {
+ 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
+ 'title': 'Uploads from 12 Minute Athlete',
}
}, {
'url': 'ytuser:phihag',
@@ -2004,6 +2067,13 @@ class YoutubeUserIE(YoutubeChannelIE):
}, {
'url': 'https://www.youtube.com/c/gametrailers',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/gametrailers',
+ 'only_matching': True,
+ }, {
+ # This channel is not available.
+ 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+ 'only_matching': True,
}]
@classmethod
@@ -2016,6 +2086,10 @@ class YoutubeUserIE(YoutubeChannelIE):
else:
return super(YoutubeUserIE, cls).suitable(url)
+ def _build_template_url(self, url, channel_id):
+ mobj = re.match(self._VALID_URL, url)
+ return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com live streams'