about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
author: Random User <rndusr@posteo.de> 2017-03-25 21:36:59 +0100
committer: Random User <rndusr@posteo.de> 2017-03-25 21:36:59 +0100
commit: 4f06c1c9fcbfbc74b81b5fa89a616914b5ce5aad (patch)
tree: a51b702e001d350b908780a119f76d8ea706d511 /youtube_dl/extractor
parent: c73e330e7adc9c0c15ac51aeea8fbb7dad95351a (diff)
parent: 942b44a0525f677924c660bcb00902d705d91fc2 (diff)
Merge branch 'master' of github.com-rndusr:rg3/youtube-dl into fix/str-item-assignment
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/addanime.py 3
-rw-r--r-- youtube_dl/extractor/adobepass.py 7
-rw-r--r-- youtube_dl/extractor/afreecatv.py 72
-rw-r--r-- youtube_dl/extractor/arkena.py 3
-rw-r--r-- youtube_dl/extractor/atresplayer.py 17
-rw-r--r-- youtube_dl/extractor/atvat.py 73
-rw-r--r-- youtube_dl/extractor/bellmedia.py 9
-rw-r--r-- youtube_dl/extractor/bostonglobe.py 72
-rw-r--r-- youtube_dl/extractor/brightcove.py 17
-rw-r--r-- youtube_dl/extractor/ceskatelevize.py 3
-rw-r--r-- youtube_dl/extractor/channel9.py 404
-rw-r--r-- youtube_dl/extractor/cloudy.py 107
-rw-r--r-- youtube_dl/extractor/common.py 140
-rw-r--r-- youtube_dl/extractor/condenast.py 43
-rw-r--r-- youtube_dl/extractor/crunchyroll.py 22
-rw-r--r-- youtube_dl/extractor/discoverygo.py 76
-rw-r--r-- youtube_dl/extractor/discoverynetworks.py (renamed from youtube_dl/extractor/tlc.py) 23
-rw-r--r-- youtube_dl/extractor/douyutv.py 31
-rw-r--r-- youtube_dl/extractor/dplay.py 116
-rw-r--r-- youtube_dl/extractor/drtv.py 5
-rw-r--r-- youtube_dl/extractor/extractors.py 19
-rw-r--r-- youtube_dl/extractor/eyedotv.py 2
-rw-r--r-- youtube_dl/extractor/facebook.py 12
-rw-r--r-- youtube_dl/extractor/fox.py 7
-rw-r--r-- youtube_dl/extractor/franceculture.py 32
-rw-r--r-- youtube_dl/extractor/freshlive.py 5
-rw-r--r-- youtube_dl/extractor/generic.py 155
-rw-r--r-- youtube_dl/extractor/go.py 8
-rw-r--r-- youtube_dl/extractor/hbo.py 45
-rw-r--r-- youtube_dl/extractor/livestream.py 15
-rw-r--r-- youtube_dl/extractor/medialaan.py 259
-rw-r--r-- youtube_dl/extractor/miomio.py 14
-rw-r--r-- youtube_dl/extractor/mitele.py 4
-rw-r--r-- youtube_dl/extractor/ninecninemedia.py 6
-rw-r--r-- youtube_dl/extractor/npo.py 431
-rw-r--r-- youtube_dl/extractor/openload.py 61
-rw-r--r-- youtube_dl/extractor/pluralsight.py 7
-rw-r--r-- youtube_dl/extractor/pornhub.py 32
-rw-r--r-- youtube_dl/extractor/prosiebensat1.py 20
-rw-r--r-- youtube_dl/extractor/redbulltv.py 122
-rw-r--r-- youtube_dl/extractor/rutube.py 11
-rw-r--r-- youtube_dl/extractor/ruutu.py 3
-rw-r--r-- youtube_dl/extractor/senateisvp.py 2
-rw-r--r-- youtube_dl/extractor/soundcloud.py 2
-rw-r--r-- youtube_dl/extractor/streamable.py 2
-rw-r--r-- youtube_dl/extractor/telecinco.py 4
-rw-r--r-- youtube_dl/extractor/telequebec.py 24
-rw-r--r-- youtube_dl/extractor/toongoggles.py 81
-rw-r--r-- youtube_dl/extractor/tunepk.py 90
-rw-r--r-- youtube_dl/extractor/twentyfourvideo.py 14
-rw-r--r-- youtube_dl/extractor/twitch.py 78
-rw-r--r-- youtube_dl/extractor/vier.py 33
-rw-r--r-- youtube_dl/extractor/viu.py 5
-rw-r--r-- youtube_dl/extractor/vk.py 3
-rw-r--r-- youtube_dl/extractor/vrak.py 80
-rw-r--r-- youtube_dl/extractor/wdr.py 19
-rw-r--r-- youtube_dl/extractor/youtube.py 21
57 files changed, 2073 insertions, 898 deletions
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index 55a9322a7..9f8a71262 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -25,7 +25,8 @@ class AddAnimeIE(InfoExtractor):
'ext': 'mp4',
'description': 'One Piece 606',
'title': 'One Piece 606',
- }
+ },
+ 'skip': 'Video is gone',
}, {
'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687',
'only_matching': True,
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
index 4d655bd5e..1b2d364ca 100644
--- a/youtube_dl/extractor/adobepass.py
+++ b/youtube_dl/extractor/adobepass.py
@@ -36,6 +36,11 @@ MSO_INFO = {
'username_field': 'Ecom_User_ID',
'password_field': 'Ecom_Password',
},
+ 'Charter_Direct': {
+ 'name': 'Charter Spectrum',
+ 'username_field': 'IDToken1',
+ 'password_field': 'IDToken2',
+ },
'thr030': {
'name': '3 Rivers Communications'
},
@@ -1453,6 +1458,8 @@ class AdobePassIE(InfoExtractor):
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
count += 1
continue
+ if '<error' in authorize:
+ raise ExtractorError(xml_text(authorize, 'details'), expected=True)
authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
requestor_info[guid] = authz_token
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
index e0a0f7c57..b774d6db8 100644
--- a/youtube_dl/extractor/afreecatv.py
+++ b/youtube_dl/extractor/afreecatv.py
@@ -4,15 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urlparse,
-)
+from ..compat import compat_xpath
from ..utils import (
ExtractorError,
int_or_none,
- update_url_query,
- xpath_element,
xpath_text,
)
@@ -43,7 +38,8 @@ class AfreecaTVIE(InfoExtractor):
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160503',
- }
+ },
+ 'skip': 'Video is gone',
}, {
'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
'info_dict': {
@@ -71,6 +67,19 @@ class AfreecaTVIE(InfoExtractor):
'upload_date': '20160502',
},
}],
+ 'skip': 'Video is gone',
+ }, {
+ 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
+ 'info_dict': {
+ 'id': '18650793',
+ 'ext': 'flv',
+ 'uploader': '윈아디',
+ 'uploader_id': 'badkids',
+ 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
'only_matching': True,
@@ -90,40 +99,33 @@ class AfreecaTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- parsed_url = compat_urllib_parse_urlparse(url)
- info_url = compat_urlparse.urlunparse(parsed_url._replace(
- netloc='afbbs.afreecatv.com:8080',
- path='/api/video/get_video_info.php'))
video_xml = self._download_xml(
- update_url_query(info_url, {'nTitleNo': video_id}), video_id)
+ 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
+ video_id, query={'nTitleNo': video_id})
- if xpath_element(video_xml, './track/video/file') is None:
+ video_element = video_xml.findall(compat_xpath('./track/video'))[1]
+ if video_element is None or video_element.text is None:
raise ExtractorError('Specified AfreecaTV video does not exist',
expected=True)
- title = xpath_text(video_xml, './track/title', 'title')
+ video_url_raw = video_element.text
+
+ app, playpath = video_url_raw.split('mp4:')
+
+ title = xpath_text(video_xml, './track/title', 'title', fatal=True)
uploader = xpath_text(video_xml, './track/nickname', 'uploader')
uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
duration = int_or_none(xpath_text(video_xml, './track/duration',
'duration'))
thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
- entries = []
- for i, video_file in enumerate(video_xml.findall('./track/video/file')):
- video_key = self.parse_video_key(video_file.get('key', ''))
- if not video_key:
- continue
- entries.append({
- 'id': '%s_%s' % (video_id, video_key.get('part', i + 1)),
- 'title': title,
- 'upload_date': video_key.get('upload_date'),
- 'duration': int_or_none(video_file.get('duration')),
- 'url': video_file.text,
- })
-
- info = {
+ return {
'id': video_id,
+ 'url': app,
+ 'ext': 'flv',
+ 'play_path': 'mp4:' + playpath,
+ 'rtmp_live': True, # downloading won't end without this
'title': title,
'uploader': uploader,
'uploader_id': uploader_id,
@@ -131,20 +133,6 @@ class AfreecaTVIE(InfoExtractor):
'thumbnail': thumbnail,
}
- if len(entries) > 1:
- info['_type'] = 'multi_video'
- info['entries'] = entries
- elif len(entries) == 1:
- info['url'] = entries[0]['url']
- info['upload_date'] = entries[0].get('upload_date')
- else:
- raise ExtractorError(
- 'No files found for the specified AfreecaTV video, either'
- ' the URL is incorrect or the video has been made private.',
- expected=True)
-
- return info
-
class AfreecaTVGlobalIE(AfreecaTVIE):
IE_NAME = 'afreecatv:global'
diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py
index 50ffb442d..4495ddbb0 100644
--- a/youtube_dl/extractor/arkena.py
+++ b/youtube_dl/extractor/arkena.py
@@ -93,8 +93,7 @@ class ArkenaIE(InfoExtractor):
exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None))
if kind == 'm3u8' or 'm3u8' in exts:
formats.extend(self._extract_m3u8_formats(
- f_url, video_id, 'mp4',
- entry_protocol='m3u8' if is_live else 'm3u8_native',
+ f_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=kind, fatal=False, live=is_live))
elif kind == 'flash' or 'f4m' in exts:
formats.extend(self._extract_f4m_formats(
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index e3c669830..99af6dc5a 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -90,7 +90,8 @@ class AtresPlayerIE(InfoExtractor):
request, None, 'Logging in as %s' % username)
error = self._html_search_regex(
- r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None)
+ r'(?s)<ul[^>]+class="[^"]*\blist_error\b[^"]*">(.+?)</ul>',
+ response, 'error', default=None)
if error:
raise ExtractorError(
'Unable to login: %s' % error, expected=True)
@@ -155,13 +156,17 @@ class AtresPlayerIE(InfoExtractor):
if format_id == 'token' or not video_url.startswith('http'):
continue
if 'geodeswowsmpra3player' in video_url:
- f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0]
- f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path)
+ # f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0]
+ # f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path)
# this videos are protected by DRM, the f4m downloader doesn't support them
continue
- else:
- f4m_url = video_url[:-9] + '/manifest.f4m'
- formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
+ video_url_hd = video_url.replace('free_es', 'es')
+ formats.extend(self._extract_f4m_formats(
+ video_url_hd[:-9] + '/manifest.f4m', video_id, f4m_id='hds',
+ fatal=False))
+ formats.extend(self._extract_mpd_formats(
+ video_url_hd[:-9] + '/manifest.mpd', video_id, mpd_id='dash',
+ fatal=False))
self._sort_formats(formats)
path_data = player.get('pathData')
diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py
new file mode 100644
index 000000000..1584d53fc
--- /dev/null
+++ b/youtube_dl/extractor/atvat.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ unescapeHTML,
+)
+
+
+class ATVAtIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)'
+ _TESTS = [{
+ 'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/',
+ 'md5': 'c3b6b975fb3150fc628572939df205f2',
+ 'info_dict': {
+ 'id': '1698447',
+ 'ext': 'mp4',
+ 'title': 'DI, 21.03.17 | 20:05 Uhr 1/1',
+ }
+ }, {
+ 'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_data = self._parse_json(unescapeHTML(self._search_regex(
+ r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"',
+ webpage, 'player data')), display_id)['config']['initial_video']
+
+ video_id = video_data['id']
+ video_title = video_data['title']
+
+ parts = []
+ for part in video_data.get('parts', []):
+ part_id = part['id']
+ part_title = part['title']
+
+ formats = []
+ for source in part.get('sources', []):
+ source_url = source.get('src')
+ if not source_url:
+ continue
+ ext = determine_ext(source_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, part_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'format_id': source.get('delivery'),
+ 'url': source_url,
+ })
+ self._sort_formats(formats)
+
+ parts.append({
+ 'id': part_id,
+ 'title': part_title,
+ 'thumbnail': part.get('preview_image_url'),
+ 'duration': int_or_none(part.get('duration')),
+ 'is_live': part.get('is_livestream'),
+ 'formats': formats,
+ })
+
+ return {
+ '_type': 'multi_video',
+ 'id': video_id,
+ 'title': video_title,
+ 'entries': parts,
+ }
diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py
index 1f5b6ed92..8820a3914 100644
--- a/youtube_dl/extractor/bellmedia.py
+++ b/youtube_dl/extractor/bellmedia.py
@@ -21,10 +21,11 @@ class BellMediaIE(InfoExtractor):
animalplanet|
bravo|
mtv|
- space
+ space|
+ etalk
)\.ca|
much\.com
- )/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
+ )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
_TESTS = [{
'url': 'http://www.ctv.ca/video/player?vid=706966',
'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0',
@@ -58,6 +59,9 @@ class BellMediaIE(InfoExtractor):
}, {
'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',
'only_matching': True,
+ }, {
+ 'url': 'http://www.etalk.ca/video?videoid=663455',
+ 'only_matching': True,
}]
_DOMAINS = {
'thecomedynetwork': 'comedy',
@@ -65,6 +69,7 @@ class BellMediaIE(InfoExtractor):
'sciencechannel': 'discsci',
'investigationdiscovery': 'invdisc',
'animalplanet': 'aniplan',
+ 'etalk': 'ctv',
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py
new file mode 100644
index 000000000..57882fbee
--- /dev/null
+++ b/youtube_dl/extractor/bostonglobe.py
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ extract_attributes,
+)
+
+
+class BostonGlobeIE(InfoExtractor):
+ _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?'
+ _TESTS = [
+ {
+ 'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html',
+ 'md5': '0a62181079c85c2d2b618c9a738aedaf',
+ 'info_dict': {
+ 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood',
+ 'id': '5320421710001',
+ 'ext': 'mp4',
+ 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.',
+ 'timestamp': 1486877593,
+ 'upload_date': '20170212',
+ 'uploader_id': '245991542',
+ },
+ },
+ {
+ # Embedded youtube video; we hand it off to the Generic extractor.
+ 'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html',
+ 'md5': '582b40327089d5c0c949b3c54b13c24b',
+ 'info_dict': {
+ 'title': "Who Is Matt Damon's Favorite Batman?",
+ 'id': 'ZW1QCnlA6Qc',
+ 'ext': 'mp4',
+ 'upload_date': '20170217',
+ 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb',
+ 'uploader': 'The Late Late Show with James Corden',
+ 'uploader_id': 'TheLateLateShow',
+ },
+ 'expected_warnings': ['404'],
+ },
+ ]
+
+ def _real_extract(self, url):
+ page_id = self._match_id(url)
+ webpage = self._download_webpage(url, page_id)
+
+ page_title = self._og_search_title(webpage, default=None)
+
+ # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject">
+ entries = []
+ for video in re.findall(r'(?i)(<video[^>]+>)', webpage):
+ attrs = extract_attributes(video)
+
+ video_id = attrs.get('data-brightcove-video-id')
+ account_id = attrs.get('data-account')
+ player_id = attrs.get('data-player')
+ embed = attrs.get('data-embed')
+
+ if video_id and account_id and player_id and embed:
+ entries.append(
+ 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
+ % (account_id, player_id, embed, video_id))
+
+ if len(entries) == 0:
+ return self.url_result(url, 'Generic')
+ elif len(entries) == 1:
+ return self.url_result(entries[0], 'BrightcoveNew')
+ else:
+ return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew')
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 27685eed0..46ef8e605 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -193,7 +193,13 @@ class BrightcoveLegacyIE(InfoExtractor):
if videoPlayer is not None:
if isinstance(videoPlayer, list):
videoPlayer = videoPlayer[0]
- if not (videoPlayer.isdigit() or videoPlayer.startswith('ref:')):
+ videoPlayer = videoPlayer.strip()
+ # UUID is also possible for videoPlayer (e.g.
+ # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
+ # or http://www8.hp.com/cn/zh/home.html)
+ if not (re.match(
+ r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
+ videoPlayer) or videoPlayer.startswith('ref:')):
return None
params['@videoPlayer'] = videoPlayer
linkBase = find_param('linkBaseURL')
@@ -515,6 +521,9 @@ class BrightcoveNewIE(InfoExtractor):
return entries
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
+
account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(
@@ -544,8 +553,10 @@ class BrightcoveNewIE(InfoExtractor):
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
- raise ExtractorError(
- json_data.get('message') or json_data['error_code'], expected=True)
+ message = json_data.get('message') or json_data['error_code']
+ if json_data.get('error_subcode') == 'CLIENT_GEO':
+ self.raise_geo_restricted(msg=message)
+ raise ExtractorError(message, expected=True)
raise
title = json_data['name'].strip()
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index b1dfacf80..dd2529a6d 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -160,8 +160,7 @@ class CeskaTelevizeIE(InfoExtractor):
for format_id, stream_url in item.get('streamUrls', {}).items():
if 'playerType=flash' in stream_url:
stream_formats = self._extract_m3u8_formats(
- stream_url, playlist_id, 'mp4',
- entry_protocol='m3u8' if is_live else 'm3u8_native',
+ stream_url, playlist_id, 'mp4', 'm3u8_native',
m3u8_id='hls-%s' % format_id, fatal=False)
else:
stream_formats = self._extract_mpd_formats(
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 865dbcaba..e92894246 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -4,62 +4,62 @@ import re
from .common import InfoExtractor
from ..utils import (
+ clean_html,
ExtractorError,
- parse_filesize,
+ int_or_none,
+ parse_iso8601,
qualities,
+ unescapeHTML,
)
class Channel9IE(InfoExtractor):
- '''
- Common extractor for channel9.msdn.com.
-
- The type of provided URL (video or playlist) is determined according to
- meta Search.PageType from web page HTML rather than URL itself, as it is
- not always possible to do.
- '''
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
- _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_TESTS = [{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
- 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
+ 'md5': '32083d4eaf1946db6d454313f44510ca',
'info_dict': {
- 'id': 'Events/TechEd/Australia/2013/KOS002',
- 'ext': 'mp4',
+ 'id': '6c413323-383a-49dc-88f9-a22800cab024',
+ 'ext': 'wmv',
'title': 'Developer Kick-Off Session: Stuff We Love',
- 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
+ 'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
'duration': 4576,
- 'thumbnail': r're:http://.*\.jpg',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'timestamp': 1377717420,
+ 'upload_date': '20130828',
'session_code': 'KOS002',
- 'session_day': 'Day 1',
'session_room': 'Arena 1A',
- 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
- 'Mads Kristensen'],
+ 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
},
}, {
'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
- 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
+ 'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
'info_dict': {
- 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
- 'ext': 'mp4',
+ 'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
+ 'ext': 'wmv',
'title': 'Self-service BI with Power BI - nuclear testing',
- 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+ 'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
'duration': 1540,
- 'thumbnail': r're:http://.*\.jpg',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'timestamp': 1386381991,
+ 'upload_date': '20131207',
'authors': ['Mike Wilmot'],
},
}, {
# low quality mp4 is best
'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
'info_dict': {
- 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
'ext': 'mp4',
'title': 'Ranges for the Standard Library',
- 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+ 'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
'duration': 5646,
- 'thumbnail': r're:http://.*\.jpg',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'upload_date': '20150930',
+ 'timestamp': 1443640735,
},
'params': {
'skip_download': True,
@@ -70,7 +70,7 @@ class Channel9IE(InfoExtractor):
'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
'title': 'Channel 9',
},
- 'playlist_count': 2,
+ 'playlist_mincount': 100,
}, {
'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
'only_matching': True,
@@ -81,189 +81,6 @@ class Channel9IE(InfoExtractor):
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
- def _formats_from_html(self, html):
- FORMAT_REGEX = r'''
- (?x)
- <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
- <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
- (?:<div\s+class="popup\s+rounded">\s*
- <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
- </div>)? # File size part may be missing
- '''
- quality = qualities((
- 'MP3', 'MP4',
- 'Low Quality WMV', 'Low Quality MP4',
- 'Mid Quality WMV', 'Mid Quality MP4',
- 'High Quality WMV', 'High Quality MP4'))
- formats = [{
- 'url': x.group('url'),
- 'format_id': x.group('quality'),
- 'format_note': x.group('note'),
- 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
- 'filesize_approx': parse_filesize(x.group('filesize')),
- 'quality': quality(x.group('quality')),
- 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
- } for x in list(re.finditer(FORMAT_REGEX, html))]
-
- self._sort_formats(formats)
-
- return formats
-
- def _extract_title(self, html):
- title = self._html_search_meta('title', html, 'title')
- if title is None:
- title = self._og_search_title(html)
- TITLE_SUFFIX = ' (Channel 9)'
- if title is not None and title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
- return title
-
- def _extract_description(self, html):
- DESCRIPTION_REGEX = r'''(?sx)
- <div\s+class="entry-content">\s*
- <div\s+id="entry-body">\s*
- (?P<description>.+?)\s*
- </div>\s*
- </div>
- '''
- m = re.search(DESCRIPTION_REGEX, html)
- if m is not None:
- return m.group('description')
- return self._html_search_meta('description', html, 'description')
-
- def _extract_duration(self, html):
- m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
- return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
-
- def _extract_slides(self, html):
- m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
- return m.group('slidesurl') if m is not None else None
-
- def _extract_zip(self, html):
- m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
- return m.group('zipurl') if m is not None else None
-
- def _extract_avg_rating(self, html):
- m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
- return float(m.group('avgrating')) if m is not None else 0
-
- def _extract_rating_count(self, html):
- m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
- return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
-
- def _extract_view_count(self, html):
- m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
- return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
-
- def _extract_comment_count(self, html):
- m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
- return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
-
- def _fix_count(self, count):
- return int(str(count).replace(',', '')) if count is not None else None
-
- def _extract_authors(self, html):
- m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
- if m is None:
- return None
- return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
-
- def _extract_session_code(self, html):
- m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
- return m.group('code') if m is not None else None
-
- def _extract_session_day(self, html):
- m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
- return m.group('day').strip() if m is not None else None
-
- def _extract_session_room(self, html):
- m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
- return m.group('room') if m is not None else None
-
- def _extract_session_speakers(self, html):
- return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
-
- def _extract_content(self, html, content_path):
- # Look for downloadable content
- formats = self._formats_from_html(html)
- slides = self._extract_slides(html)
- zip_ = self._extract_zip(html)
-
- # Nothing to download
- if len(formats) == 0 and slides is None and zip_ is None:
- self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
- return
-
- # Extract meta
- title = self._extract_title(html)
- description = self._extract_description(html)
- thumbnail = self._og_search_thumbnail(html)
- duration = self._extract_duration(html)
- avg_rating = self._extract_avg_rating(html)
- rating_count = self._extract_rating_count(html)
- view_count = self._extract_view_count(html)
- comment_count = self._extract_comment_count(html)
-
- common = {
- '_type': 'video',
- 'id': content_path,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'avg_rating': avg_rating,
- 'rating_count': rating_count,
- 'view_count': view_count,
- 'comment_count': comment_count,
- }
-
- result = []
-
- if slides is not None:
- d = common.copy()
- d.update({'title': title + '-Slides', 'url': slides})
- result.append(d)
-
- if zip_ is not None:
- d = common.copy()
- d.update({'title': title + '-Zip', 'url': zip_})
- result.append(d)
-
- if len(formats) > 0:
- d = common.copy()
- d.update({'title': title, 'formats': formats})
- result.append(d)
-
- return result
-
- def _extract_entry_item(self, html, content_path):
- contents = self._extract_content(html, content_path)
- if contents is None:
- return contents
-
- if len(contents) > 1:
- raise ExtractorError('Got more than one entry')
- result = contents[0]
- result['authors'] = self._extract_authors(html)
-
- return result
-
- def _extract_session(self, html, content_path):
- contents = self._extract_content(html, content_path)
- if contents is None:
- return contents
-
- session_meta = {
- 'session_code': self._extract_session_code(html),
- 'session_day': self._extract_session_day(html),
- 'session_room': self._extract_session_room(html),
- 'session_speakers': self._extract_session_speakers(html),
- }
-
- for content in contents:
- content.update(session_meta)
-
- return self.playlist_result(contents)
-
def _extract_list(self, video_id, rss_url=None):
if not rss_url:
rss_url = self._RSS_URL % video_id
@@ -274,9 +91,7 @@ class Channel9IE(InfoExtractor):
return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- content_path = mobj.group('contentpath')
- rss = mobj.group('rss')
+ content_path, rss = re.match(self._VALID_URL, url).groups()
if rss:
return self._extract_list(content_path, url)
@@ -284,17 +99,158 @@ class Channel9IE(InfoExtractor):
webpage = self._download_webpage(
url, content_path, 'Downloading web page')
- page_type = self._search_regex(
- r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
- webpage, 'page type', default=None, group='pagetype')
- if page_type:
- if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
- return self._extract_entry_item(webpage, content_path)
- elif page_type == 'Session': # Event session page, may contain downloadable content
- return self._extract_session(webpage, content_path)
- elif page_type == 'Event':
- return self._extract_list(content_path)
+ episode_data = self._search_regex(
+ r"data-episode='([^']+)'", webpage, 'episode data', default=None)
+ if episode_data:
+ episode_data = self._parse_json(unescapeHTML(
+ episode_data), content_path)
+ content_id = episode_data['contentId']
+ is_session = '/Sessions(' in episode_data['api']
+ content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
+ if is_session:
+ content_url += '?$expand=Speakers'
+ else:
+ content_url += '?$expand=Authors'
+ content_data = self._download_json(content_url, content_id)
+ title = content_data['Title']
+
+ QUALITIES = (
+ 'mp3',
+ 'wmv', 'mp4',
+ 'wmv-low', 'mp4-low',
+ 'wmv-mid', 'mp4-mid',
+ 'wmv-high', 'mp4-high',
+ )
+
+ quality_key = qualities(QUALITIES)
+
+ def quality(quality_id, format_url):
+ return (len(QUALITIES) if '_Source.' in format_url
+ else quality_key(quality_id))
+
+ formats = []
+ urls = set()
+
+ SITE_QUALITIES = {
+ 'MP3': 'mp3',
+ 'MP4': 'mp4',
+ 'Low Quality WMV': 'wmv-low',
+ 'Low Quality MP4': 'mp4-low',
+ 'Mid Quality WMV': 'wmv-mid',
+ 'Mid Quality MP4': 'mp4-mid',
+ 'High Quality WMV': 'wmv-high',
+ 'High Quality MP4': 'mp4-high',
+ }
+
+ formats_select = self._search_regex(
+ r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
+ 'formats select', default=None)
+ if formats_select:
+ for mobj in re.finditer(
+ r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
+ formats_select):
+ format_url = mobj.group('url')
+ if format_url in urls:
+ continue
+ urls.add(format_url)
+ format_id = mobj.group('format')
+ quality_id = SITE_QUALITIES.get(format_id, format_id)
+ formats.append({
+ 'url': format_url,
+ 'format_id': quality_id,
+ 'quality': quality(quality_id, format_url),
+ 'vcodec': 'none' if quality_id == 'mp3' else None,
+ })
+
+ API_QUALITIES = {
+ 'VideoMP4Low': 'mp4-low',
+ 'VideoWMV': 'wmv-mid',
+ 'VideoMP4Medium': 'mp4-mid',
+ 'VideoMP4High': 'mp4-high',
+ 'VideoWMVHQ': 'wmv-hq',
+ }
+
+ for format_id, q in API_QUALITIES.items():
+ q_url = content_data.get(format_id)
+ if not q_url or q_url in urls:
+ continue
+ urls.add(q_url)
+ formats.append({
+ 'url': q_url,
+ 'format_id': q,
+ 'quality': quality(q, q_url),
+ })
+
+ self._sort_formats(formats)
+
+ slides = content_data.get('Slides')
+ zip_file = content_data.get('ZipFile')
+
+ if not formats and not slides and not zip_file:
+ raise ExtractorError(
+ 'None of recording, slides or zip are available for %s' % content_path)
+
+ subtitles = {}
+ for caption in content_data.get('Captions', []):
+ caption_url = caption.get('Url')
+ if not caption_url:
+ continue
+ subtitles.setdefault(caption.get('Language', 'en'), []).append({
+ 'url': caption_url,
+ 'ext': 'vtt',
+ })
+
+ common = {
+ 'id': content_id,
+ 'title': title,
+ 'description': clean_html(content_data.get('Description') or content_data.get('Body')),
+ 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
+ 'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
+ 'timestamp': parse_iso8601(content_data.get('PublishedDate')),
+ 'avg_rating': int_or_none(content_data.get('Rating')),
+ 'rating_count': int_or_none(content_data.get('RatingCount')),
+ 'view_count': int_or_none(content_data.get('Views')),
+ 'comment_count': int_or_none(content_data.get('CommentCount')),
+ 'subtitles': subtitles,
+ }
+ if is_session:
+ speakers = []
+ for s in content_data.get('Speakers', []):
+ speaker_name = s.get('FullName')
+ if not speaker_name:
+ continue
+ speakers.append(speaker_name)
+
+ common.update({
+ 'session_code': content_data.get('Code'),
+ 'session_room': content_data.get('Room'),
+ 'session_speakers': speakers,
+ })
else:
- raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
- else: # Assuming list
+ authors = []
+ for a in content_data.get('Authors', []):
+ author_name = a.get('DisplayName')
+ if not author_name:
+ continue
+ authors.append(author_name)
+ common['authors'] = authors
+
+ contents = []
+
+ if slides:
+ d = common.copy()
+ d.update({'title': title + '-Slides', 'url': slides})
+ contents.append(d)
+
+ if zip_file:
+ d = common.copy()
+ d.update({'title': title + '-Zip', 'url': zip_file})
+ contents.append(d)
+
+ if formats:
+ d = common.copy()
+ d.update({'title': title, 'formats': formats})
+ contents.append(d)
+ return self.playlist_result(contents)
+ else:
return self._extract_list(content_path)
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index ae5ba0015..9bc8dbea4 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -1,97 +1,56 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_HTTPError,
-)
from ..utils import (
- ExtractorError,
- HEADRequest,
- remove_end,
+ str_to_int,
+ unified_strdate,
)
class CloudyIE(InfoExtractor):
_IE_DESC = 'cloudy.ec'
- _VALID_URL = r'''(?x)
- https?://(?:www\.)?cloudy\.ec/
- (?:v/|embed\.php\?id=)
- (?P<id>[A-Za-z0-9]+)
- '''
- _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s'
- _API_URL = 'http://www.cloudy.ec/api/player.api.php'
- _MAX_TRIES = 2
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
+ _TESTS = [{
'url': 'https://www.cloudy.ec/v/af511e2527aac',
- 'md5': '5cb253ace826a42f35b4740539bedf07',
+ 'md5': '29832b05028ead1b58be86bf319397ca',
'info_dict': {
'id': 'af511e2527aac',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Funny Cats and Animals Compilation june 2013',
+ 'upload_date': '20130913',
+ 'view_count': int,
}
- }
-
- def _extract_video(self, video_id, file_key, error_url=None, try_num=0):
-
- if try_num > self._MAX_TRIES - 1:
- raise ExtractorError('Unable to extract video URL', expected=True)
-
- form = {
- 'file': video_id,
- 'key': file_key,
- }
-
- if error_url:
- form.update({
- 'numOfErrors': try_num,
- 'errorCode': '404',
- 'errorUrl': error_url,
- })
+ }, {
+ 'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac',
+ 'only_matching': True,
+ }]
- player_data = self._download_webpage(
- self._API_URL, video_id, 'Downloading player data', query=form)
- data = compat_parse_qs(player_data)
-
- try_num += 1
-
- if 'error' in data:
- raise ExtractorError(
- '%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])),
- expected=True)
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
- title = data.get('title', [None])[0]
- if title:
- title = remove_end(title, '&asdasdas').strip()
+ webpage = self._download_webpage(
+ 'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id)
- video_url = data.get('url', [None])[0]
+ info = self._parse_html5_media_entries(url, webpage, video_id)[0]
- if video_url:
- try:
- self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL')
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]:
- self.report_warning('Invalid video URL, requesting another', video_id)
- return self._extract_video(video_id, file_key, video_url, try_num)
+ webpage = self._download_webpage(
+ 'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False)
- return {
- 'id': video_id,
- 'url': video_url,
- 'title': title,
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ if webpage:
+ info.update({
+ 'title': self._search_regex(
+ r'<h\d[^>]*>([^<]+)<', webpage, 'title'),
+ 'upload_date': unified_strdate(self._search_regex(
+ r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage,
+ 'upload date', fatal=False)),
+ 'view_count': str_to_int(self._search_regex(
+ r'([\d,.]+) views<', webpage, 'view count', fatal=False)),
+ })
- url = self._EMBED_URL % video_id
- webpage = self._download_webpage(url, video_id)
+ if not info.get('title'):
+ info['title'] = video_id
- file_key = self._search_regex(
- [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'],
- webpage, 'file_key')
+ info['id'] = video_id
- return self._extract_video(video_id, file_key)
+ return info
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index c2ca73ee1..6c3c095f7 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -36,34 +36,35 @@ from ..utils import (
clean_html,
compiled_regex_type,
determine_ext,
+ determine_protocol,
error_to_compat_str,
ExtractorError,
+ extract_attributes,
fix_xml_ampersands,
float_or_none,
GeoRestrictedError,
GeoUtils,
int_or_none,
js_to_json,
+ mimetype2ext,
+ orderedSet,
+ parse_codecs,
+ parse_duration,
parse_iso8601,
+ parse_m3u8_attributes,
RegexNotFoundError,
- sanitize_filename,
sanitized_Request,
+ sanitize_filename,
unescapeHTML,
unified_strdate,
unified_timestamp,
+ update_Request,
+ update_url_query,
+ urljoin,
url_basename,
xpath_element,
xpath_text,
xpath_with_ns,
- determine_protocol,
- parse_duration,
- mimetype2ext,
- update_Request,
- update_url_query,
- parse_m3u8_attributes,
- extract_attributes,
- parse_codecs,
- urljoin,
)
@@ -714,6 +715,13 @@ class InfoExtractor(object):
video_info['title'] = video_title
return video_info
+ def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
+ urlrs = orderedSet(
+ self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
+ for m in matches)
+ return self.playlist_result(
+ urlrs, playlist_id=video_id, playlist_title=video_title)
+
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
"""Returns a playlist"""
@@ -2204,56 +2212,9 @@ class InfoExtractor(object):
this_video_id = video_id or video_data['mediaid']
- formats = []
- for source in video_data['sources']:
- source_url = self._proto_relative_url(source['file'])
- if base_url:
- source_url = compat_urlparse.urljoin(base_url, source_url)
- source_type = source.get('type') or ''
- ext = mimetype2ext(source_type) or determine_ext(source_url)
- if source_type == 'hls' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
- elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- source_url, this_video_id, mpd_id=mpd_id, fatal=False))
- # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
- elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
- formats.append({
- 'url': source_url,
- 'vcodec': 'none',
- 'ext': ext,
- })
- else:
- height = int_or_none(source.get('height'))
- if height is None:
- # Often no height is provided but there is a label in
- # format like 1080p.
- height = int_or_none(self._search_regex(
- r'^(\d{3,})[pP]$', source.get('label') or '',
- 'height', default=None))
- a_format = {
- 'url': source_url,
- 'width': int_or_none(source.get('width')),
- 'height': height,
- 'ext': ext,
- }
- if source_url.startswith('rtmp'):
- a_format['ext'] = 'flv'
-
- # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
- # of jwplayer.flash.swf
- rtmp_url_parts = re.split(
- r'((?:mp4|mp3|flv):)', source_url, 1)
- if len(rtmp_url_parts) == 3:
- rtmp_url, prefix, play_path = rtmp_url_parts
- a_format.update({
- 'url': rtmp_url,
- 'play_path': prefix + play_path,
- })
- if rtmp_params:
- a_format.update(rtmp_params)
- formats.append(a_format)
+ formats = self._parse_jwplayer_formats(
+ video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
+ mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
self._sort_formats(formats)
subtitles = {}
@@ -2284,6 +2245,65 @@ class InfoExtractor(object):
else:
return self.playlist_result(entries)
+ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
+ m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ formats = []
+ for source in jwplayer_sources_data:
+ source_url = self._proto_relative_url(source['file'])
+ if base_url:
+ source_url = compat_urlparse.urljoin(base_url, source_url)
+ source_type = source.get('type') or ''
+ ext = mimetype2ext(source_type) or determine_ext(source_url)
+ if source_type == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=m3u8_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ source_url, video_id, mpd_id=mpd_id, fatal=False))
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(
+ source_url, video_id, fatal=False))
+ # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
+ elif source_type.startswith('audio') or ext in (
+ 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
+ formats.append({
+ 'url': source_url,
+ 'vcodec': 'none',
+ 'ext': ext,
+ })
+ else:
+ height = int_or_none(source.get('height'))
+ if height is None:
+ # Often no height is provided but there is a label in
+ # format like "1080p", "720p SD", or 1080.
+ height = int_or_none(self._search_regex(
+ r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
+ 'height', default=None))
+ a_format = {
+ 'url': source_url,
+ 'width': int_or_none(source.get('width')),
+ 'height': height,
+ 'tbr': int_or_none(source.get('bitrate')),
+ 'ext': ext,
+ }
+ if source_url.startswith('rtmp'):
+ a_format['ext'] = 'flv'
+ # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+ # of jwplayer.flash.swf
+ rtmp_url_parts = re.split(
+ r'((?:mp4|mp3|flv):)', source_url, 1)
+ if len(rtmp_url_parts) == 3:
+ rtmp_url, prefix, play_path = rtmp_url_parts
+ a_format.update({
+ 'url': rtmp_url,
+ 'play_path': prefix + play_path,
+ })
+ if rtmp_params:
+ a_format.update(rtmp_params)
+ formats.append(a_format)
+ return formats
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index 8d8f60598..d3463b874 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -9,13 +9,14 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
- orderedSet,
- remove_end,
- extract_attributes,
- mimetype2ext,
determine_ext,
+ extract_attributes,
int_or_none,
+ js_to_json,
+ mimetype2ext,
+ orderedSet,
parse_iso8601,
+ remove_end,
)
@@ -67,6 +68,16 @@ class CondeNastIE(InfoExtractor):
'timestamp': 1363219200,
}
}, {
+ 'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series',
+ 'info_dict': {
+ 'id': '58d1865bfd2e6126e2000015',
+ 'ext': 'mp4',
+ 'title': 'The Only True Surprise? Trump’s an Idiot',
+ 'uploader': 'gq',
+ 'upload_date': '20170321',
+ 'timestamp': 1490126427,
+ },
+ }, {
# JS embed
'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
'md5': 'f1a6f9cafb7083bab74a710f65d08999',
@@ -114,26 +125,33 @@ class CondeNastIE(InfoExtractor):
})
video_id = query['videoId']
video_info = None
- info_page = self._download_webpage(
+ info_page = self._download_json(
'http://player.cnevids.com/player/video.js',
- video_id, 'Downloading video info', query=query, fatal=False)
+ video_id, 'Downloading video info', fatal=False, query=query)
if info_page:
- video_info = self._parse_json(self._search_regex(
- r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video']
- else:
+ video_info = info_page.get('video')
+ if not video_info:
info_page = self._download_webpage(
'http://player.cnevids.com/player/loader.js',
video_id, 'Downloading loader info', query=query)
- video_info = self._parse_json(self._search_regex(
- r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id)
+ video_info = self._parse_json(
+ self._search_regex(
+ r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
+ video_id, transform_source=js_to_json)['video']
+
title = video_info['title']
formats = []
- for fdata in video_info.get('sources', [{}])[0]:
+ for fdata in video_info['sources']:
src = fdata.get('src')
if not src:
continue
ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ continue
quality = fdata.get('quality')
formats.append({
'format_id': ext + ('-%s' % quality if quality else ''),
@@ -169,7 +187,6 @@ class CondeNastIE(InfoExtractor):
path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
url_type = 'embed'
- self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
webpage = self._download_webpage(url, item_id)
if url_type == 'series':
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 9c6cf00ca..d15fd3744 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -177,6 +177,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'uploader': 'Kadokawa Pictures Inc.',
'upload_date': '20170118',
'series': "KONOSUBA -God's blessing on this wonderful world!",
+ 'season': "KONOSUBA -God's blessing on this wonderful world! 2",
'season_number': 2,
'episode': 'Give Me Deliverance from this Judicial Injustice!',
'episode_number': 1,
@@ -222,6 +223,23 @@ class CrunchyrollIE(CrunchyrollBaseIE):
# just test metadata extraction
'skip_download': True,
},
+ }, {
+ # A video with a vastly different season name compared to the series name
+ 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532',
+ 'info_dict': {
+ 'id': '590532',
+ 'ext': 'mp4',
+ 'title': 'Haiyoru! Nyaruani (ONA) Episode 1 – Test',
+ 'description': 'Mahiro and Nyaruko talk about official certification.',
+ 'uploader': 'TV TOKYO',
+ 'upload_date': '20120305',
+ 'series': 'Nyarko-san: Another Crawling Chaos',
+ 'season': 'Haiyoru! Nyaruani (ONA)',
+ },
+ 'params': {
+ # Just test metadata extraction
+ 'skip_download': True,
+ },
}]
_FORMAT_IDS = {
@@ -491,7 +509,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
# webpage provide more accurate data than series_title from XML
series = self._html_search_regex(
r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)',
- webpage, 'series', default=xpath_text(metadata, 'series_title'))
+ webpage, 'series', fatal=False)
+ season = xpath_text(metadata, 'series_title')
episode = xpath_text(metadata, 'episode_title')
episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
@@ -508,6 +527,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'uploader': video_uploader,
'upload_date': video_upload_date,
'series': series,
+ 'season': season,
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py
index 2042493a8..7cd5d4291 100644
--- a/youtube_dl/extractor/discoverygo.py
+++ b/youtube_dl/extractor/discoverygo.py
@@ -1,17 +1,21 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
extract_attributes,
+ ExtractorError,
int_or_none,
parse_age_limit,
- ExtractorError,
+ remove_end,
+ unescapeHTML,
)
-class DiscoveryGoIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://(?:www\.)?(?:
+class DiscoveryGoBaseIE(InfoExtractor):
+ _VALID_URL_TEMPLATE = r'''(?x)https?://(?:www\.)?(?:
discovery|
investigationdiscovery|
discoverylife|
@@ -21,18 +25,23 @@ class DiscoveryGoIE(InfoExtractor):
sciencechannel|
tlc|
velocitychannel
- )go\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'''
+ )go\.com/%s(?P<id>[^/?#&]+)'''
+
+
+class DiscoveryGoIE(DiscoveryGoBaseIE):
+ _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+'
+ _GEO_COUNTRIES = ['US']
_TEST = {
- 'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/',
+ 'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/',
'info_dict': {
- 'id': '57a33c536b66d1cd0345eeb1',
+ 'id': '58c167d86b66d12f2addeb01',
'ext': 'mp4',
- 'title': 'Kiss First, Ask Questions Later!',
- 'description': 'md5:fe923ba34050eae468bffae10831cb22',
- 'duration': 2579,
- 'series': 'Love at First Kiss',
- 'season_number': 1,
- 'episode_number': 1,
+ 'title': 'Reaper Madness',
+ 'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78',
+ 'duration': 2519,
+ 'series': 'Bering Sea Gold',
+ 'season_number': 8,
+ 'episode_number': 6,
'age_limit': 14,
},
}
@@ -113,3 +122,46 @@ class DiscoveryGoIE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
}
+
+
+class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE):
+ _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % ''
+ _TEST = {
+ 'url': 'https://www.discoverygo.com/bering-sea-gold/',
+ 'info_dict': {
+ 'id': 'bering-sea-gold',
+ 'title': 'Bering Sea Gold',
+ 'description': 'md5:cc5c6489835949043c0cc3ad66c2fa0e',
+ },
+ 'playlist_mincount': 6,
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return False if DiscoveryGoIE.suitable(url) else super(
+ DiscoveryGoPlaylistIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ entries = []
+ for mobj in re.finditer(r'data-json=(["\'])(?P<json>{.+?})\1', webpage):
+ data = self._parse_json(
+ mobj.group('json'), display_id,
+ transform_source=unescapeHTML, fatal=False)
+ if not isinstance(data, dict) or data.get('type') != 'episode':
+ continue
+ episode_url = data.get('socialUrl')
+ if not episode_url:
+ continue
+ entries.append(self.url_result(
+ episode_url, ie=DiscoveryGoIE.ie_key(),
+ video_id=data.get('id')))
+
+ return self.playlist_result(
+ entries, display_id,
+ remove_end(self._og_search_title(
+ webpage, fatal=False), ' | Discovery GO'),
+ self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/discoverynetworks.py
index fd145ba42..b6653784c 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/discoverynetworks.py
@@ -9,13 +9,13 @@ from ..compat import (
compat_parse_qs,
compat_urlparse,
)
+from ..utils import smuggle_url
-class TlcDeIE(InfoExtractor):
- IE_NAME = 'tlc.de'
- _VALID_URL = r'https?://(?:www\.)?tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?'
+class DiscoveryNetworksDeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P<id>\d+)|(?:[^/]+/)*videos/(?P<title>[^/?#]+))'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
'info_dict': {
'id': '3235167922001',
@@ -29,7 +29,13 @@ class TlcDeIE(InfoExtractor):
'upload_date': '20140404',
'uploader_id': '1659832546',
},
- }
+ }, {
+ 'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.discovery.de/#5332316765001',
+ 'only_matching': True,
+ }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s'
def _real_extract(self, url):
@@ -39,5 +45,8 @@ class TlcDeIE(InfoExtractor):
title = mobj.group('title')
webpage = self._download_webpage(url, title)
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0]
- return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
+ brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
+ brightcove_legacy_url).query)['@videoPlayer'][0]
+ return self.url_result(smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}),
+ 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 9a83fb31a..82d8a042f 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
+import time
+import hashlib
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -16,7 +19,7 @@ class DouyuTVIE(InfoExtractor):
'info_dict': {
'id': '17732',
'display_id': 'iseven',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
@@ -31,7 +34,7 @@ class DouyuTVIE(InfoExtractor):
'info_dict': {
'id': '85982',
'display_id': '85982',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
'thumbnail': r're:^https?://.*\.jpg$',
@@ -47,7 +50,7 @@ class DouyuTVIE(InfoExtractor):
'info_dict': {
'id': '17732',
'display_id': '17732',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
@@ -66,10 +69,6 @@ class DouyuTVIE(InfoExtractor):
'only_matching': True,
}]
- # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf
- # is encrypted originally, but ffdec can dump memory to get the decrypted one.
- _API_KEY = 'A12Svb&%1UUmf@hC'
-
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -80,6 +79,7 @@ class DouyuTVIE(InfoExtractor):
room_id = self._html_search_regex(
r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
+ # Grab metadata from mobile API
room = self._download_json(
'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id,
note='Downloading room info')['data']
@@ -88,8 +88,19 @@ class DouyuTVIE(InfoExtractor):
if room.get('show_status') == '2':
raise ExtractorError('Live stream is offline', expected=True)
- formats = self._extract_m3u8_formats(
- room['hls_url'], video_id, ext='mp4')
+ # Grab the URL from PC client API
+ # The m3u8 url from mobile API requires re-authentication every 5 minutes
+ tt = int(time.time())
+ signContent = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt)
+ sign = hashlib.md5(signContent.encode('ascii')).hexdigest()
+ video_url = self._download_json(
+ 'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id,
+ video_id, note='Downloading video URL info',
+ query={'rate': 0}, headers={
+ 'auth': sign,
+ 'time': str(tt),
+ 'aid': 'pcclient'
+ })['data']['live_url']
title = self._live_title(unescapeHTML(room['room_name']))
description = room.get('show_details')
@@ -99,7 +110,7 @@ class DouyuTVIE(InfoExtractor):
return {
'id': room_id,
'display_id': video_id,
- 'formats': formats,
+ 'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 32028bc3b..87c5dd63e 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -6,37 +6,24 @@ import re
import time
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_urlparse,
+ compat_HTTPError,
+)
from ..utils import (
USER_AGENTS,
+ ExtractorError,
int_or_none,
+ unified_strdate,
+ remove_end,
update_url_query,
)
class DPlayIE(InfoExtractor):
- _VALID_URL = r'https?://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?P<domain>www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
_TESTS = [{
- # geo restricted, via direct unsigned hls URL
- 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/',
- 'info_dict': {
- 'id': '1255600',
- 'display_id': 'stagione-1-episodio-25',
- 'ext': 'mp4',
- 'title': 'Episodio 25',
- 'description': 'md5:cae5f40ad988811b197d2d27a53227eb',
- 'duration': 2761,
- 'timestamp': 1454701800,
- 'upload_date': '20160205',
- 'creator': 'RTIT',
- 'series': 'Take me out',
- 'season_number': 1,
- 'episode_number': 25,
- 'age_limit': 0,
- },
- 'expected_warnings': ['Unable to download f4m manifest'],
- }, {
# non geo restricted, via secure api, unsigned download hls URL
'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
'info_dict': {
@@ -168,3 +155,90 @@ class DPlayIE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
}
+
+
+class DPlayItIE(InfoExtractor):
+ _VALID_URL = r'https?://it\.dplay\.com/[^/]+/[^/]+/(?P<id>[^/?#]+)'
+ _GEO_COUNTRIES = ['IT']
+ _TEST = {
+ 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/',
+ 'md5': '2b808ffb00fc47b884a172ca5d13053c',
+ 'info_dict': {
+ 'id': '6918',
+ 'display_id': 'luigi-di-maio-la-psicosi-di-stanislawskij',
+ 'ext': 'mp4',
+ 'title': 'Biografie imbarazzanti: Luigi Di Maio: la psicosi di Stanislawskij',
+ 'description': 'md5:3c7a4303aef85868f867a26f5cc14813',
+ 'thumbnail': r're:^https?://.*\.jpe?g',
+ 'upload_date': '20160524',
+ 'series': 'Biografie imbarazzanti',
+ 'season_number': 1,
+ 'episode': 'Luigi Di Maio: la psicosi di Stanislawskij',
+ 'episode_number': 1,
+ },
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ info_url = self._search_regex(
+ r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
+ webpage, 'video id')
+
+ title = remove_end(self._og_search_title(webpage), ' | Dplay')
+
+ try:
+ info = self._download_json(
+ info_url, display_id, headers={
+ 'Authorization': 'Bearer %s' % self._get_cookies(url).get(
+ 'dplayit_token').value,
+ 'Referer': url,
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
+ info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
+ error = info['errors'][0]
+ if error.get('code') == 'access.denied.geoblocked':
+ self.raise_geo_restricted(
+ msg=error.get('detail'), countries=self._GEO_COUNTRIES)
+ raise ExtractorError(info['errors'][0]['detail'], expected=True)
+ raise
+
+ hls_url = info['data']['attributes']['streaming']['hls']['url']
+
+ formats = self._extract_m3u8_formats(
+ hls_url, display_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+
+ series = self._html_search_regex(
+ r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>',
+ webpage, 'series', fatal=False)
+ episode = self._search_regex(
+ r'<p[^>]+class=["\'].*?\bdesc_ep\b.*?["\'][^>]*>\s*<br/>\s*<b>([^<]+)',
+ webpage, 'episode', fatal=False)
+
+ mobj = re.search(
+ r'(?s)<span[^>]+class=["\']dates["\'][^>]*>.+?\bS\.(?P<season_number>\d+)\s+E\.(?P<episode_number>\d+)\s*-\s*(?P<upload_date>\d{2}/\d{2}/\d{4})',
+ webpage)
+ if mobj:
+ season_number = int(mobj.group('season_number'))
+ episode_number = int(mobj.group('episode_number'))
+ upload_date = unified_strdate(mobj.group('upload_date'))
+ else:
+ season_number = episode_number = upload_date = None
+
+ return {
+ 'id': info_url.rpartition('/')[-1],
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index e966d7483..e4917014a 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -15,6 +15,8 @@ from ..utils import (
class DRTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)'
+ _GEO_BYPASS = False
+ _GEO_COUNTRIES = ['DK']
IE_NAME = 'drtv'
_TESTS = [{
'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
@@ -137,7 +139,7 @@ class DRTVIE(InfoExtractor):
if not formats and restricted_to_denmark:
self.raise_geo_restricted(
'Unfortunately, DR is not allowed to show this program outside Denmark.',
- expected=True)
+ countries=self._GEO_COUNTRIES)
self._sort_formats(formats)
@@ -156,6 +158,7 @@ class DRTVIE(InfoExtractor):
class DRTVLiveIE(InfoExtractor):
IE_NAME = 'drtv:live'
_VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)'
+ _GEO_COUNTRIES = ['DK']
_TEST = {
'url': 'https://www.dr.dk/tv/live/dr1',
'info_dict': {
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index b1613a9d3..6a7028a4d 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -71,6 +71,7 @@ from .arte import (
)
from .atresplayer import AtresPlayerIE
from .atttechchannel import ATTTechChannelIE
+from .atvat import ATVAtIE
from .audimedia import AudiMediaIE
from .audioboom import AudioBoomIE
from .audiomack import AudiomackIE, AudiomackAlbumIE
@@ -117,6 +118,7 @@ from .bleacherreport import (
from .blinkx import BlinkxIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
+from .bostonglobe import BostonGlobeIE
from .bpb import BpbIE
from .br import BRIE
from .bravotv import BravoTVIE
@@ -246,7 +248,10 @@ from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
from .douyutv import DouyuTVIE
-from .dplay import DPlayIE
+from .dplay import (
+ DPlayIE,
+ DPlayItIE,
+)
from .dramafever import (
DramaFeverIE,
DramaFeverSeriesIE,
@@ -262,7 +267,11 @@ from .dvtv import DVTVIE
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
-from .discoverygo import DiscoveryGoIE
+from .discoverygo import (
+ DiscoveryGoIE,
+ DiscoveryGoPlaylistIE,
+)
+from .discoverynetworks import DiscoveryNetworksDeIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
from .dropbox import DropboxIE
@@ -793,6 +802,7 @@ from .rai import (
)
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
+from .redbulltv import RedBullTVIE
from .redtube import RedTubeIE
from .regiotv import RegioTVIE
from .rentv import (
@@ -966,7 +976,6 @@ from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .threeqsdn import ThreeQSDNIE
from .tinypic import TinyPicIE
-from .tlc import TlcDeIE
from .tmz import (
TMZIE,
TMZArticleIE,
@@ -979,6 +988,7 @@ from .tnaflix import (
)
from .toggle import ToggleIE
from .tonline import TOnlineIE
+from .toongoggles import ToonGogglesIE
from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
@@ -999,6 +1009,7 @@ from .tunein import (
TuneInTopicIE,
TuneInShortenerIE,
)
+from .tunepk import TunePkIE
from .turbo import TurboIE
from .tutv import TutvIE
from .tv2 import (
@@ -1165,6 +1176,8 @@ from .voicerepublic import VoiceRepublicIE
from .voxmedia import VoxMediaIE
from .vporn import VpornIE
from .vrt import VRTIE
+from .vrak import VrakIE
+from .medialaan import MedialaanIE
from .vube import VubeIE
from .vuclip import VuClipIE
from .vvvvid import VVVVIDIE
diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py
index 2f3035147..f62ddebae 100644
--- a/youtube_dl/extractor/eyedotv.py
+++ b/youtube_dl/extractor/eyedotv.py
@@ -54,7 +54,7 @@ class EyedoTVIE(InfoExtractor):
'id': video_id,
'title': title,
'formats': self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'),
+ m3u8_url, video_id, 'mp4', 'm3u8_native'),
'description': xpath_text(video_data, _add_ns('Description')),
'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))),
'uploader': xpath_text(video_data, _add_ns('Createur')),
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 70b8c95c5..b69c1ede0 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -196,6 +196,10 @@ class FacebookIE(InfoExtractor):
}, {
'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
'only_matching': True,
+ }, {
+ # no title
+ 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
+ 'only_matching': True,
}]
@staticmethod
@@ -303,7 +307,7 @@ class FacebookIE(InfoExtractor):
if not video_data:
server_js_data = self._parse_json(
self._search_regex(
- r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall)',
+ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)',
webpage, 'js data', default='{}'),
video_id, transform_source=js_to_json, fatal=False)
if server_js_data:
@@ -353,15 +357,15 @@ class FacebookIE(InfoExtractor):
self._sort_formats(formats)
video_title = self._html_search_regex(
- r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
- default=None)
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
+ 'title', default=None)
if not video_title:
video_title = self._html_search_regex(
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
webpage, 'alternative title', default=None)
if not video_title:
video_title = self._html_search_meta(
- 'description', webpage, 'title')
+ 'description', webpage, 'title', default=None)
if video_title:
video_title = limit_length(video_title, 80)
else:
diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py
index 9f2e5d065..159fdf9c4 100644
--- a/youtube_dl/extractor/fox.py
+++ b/youtube_dl/extractor/fox.py
@@ -47,9 +47,12 @@ class FOXIE(AdobePassIE):
resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating)
query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource)
- return {
+ info = self._search_json_ld(webpage, video_id, fatal=False)
+ info.update({
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
'id': video_id,
- }
+ })
+
+ return info
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
index b98da692c..b8fa17588 100644
--- a/youtube_dl/extractor/franceculture.py
+++ b/youtube_dl/extractor/franceculture.py
@@ -4,7 +4,8 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
- unified_strdate,
+ extract_attributes,
+ int_or_none,
)
@@ -19,6 +20,7 @@ class FranceCultureIE(InfoExtractor):
'title': 'Rendez-vous au pays des geeks',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140301',
+ 'timestamp': 1393642916,
'vcodec': 'none',
}
}
@@ -28,30 +30,34 @@ class FranceCultureIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- video_url = self._search_regex(
- r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<button[^>]+data-asset-source="([^"]+)"',
- webpage, 'video path')
+ video_data = extract_attributes(self._search_regex(
+ r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)',
+ webpage, 'video data'))
- title = self._og_search_title(webpage)
+ video_url = video_data['data-asset-source']
+ title = video_data.get('data-asset-title') or self._og_search_title(webpage)
- upload_date = unified_strdate(self._search_regex(
- '(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<',
- webpage, 'upload date', fatal=False))
+ description = self._html_search_regex(
+ r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
+ webpage, 'description', default=None)
thumbnail = self._search_regex(
- r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-dejavu-src="([^"]+)"',
+ r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
webpage, 'thumbnail', fatal=False)
uploader = self._html_search_regex(
- r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
+ r'(?s)<span class="author">(.*?)</span>',
webpage, 'uploader', default=None)
- vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None
+ ext = determine_ext(video_url.lower())
return {
'id': display_id,
'display_id': display_id,
'url': video_url,
'title': title,
+ 'description': description,
'thumbnail': thumbnail,
- 'vcodec': vcodec,
+ 'ext': ext,
+ 'vcodec': 'none' if ext == 'mp3' else None,
'uploader': uploader,
- 'upload_date': upload_date,
+ 'timestamp': int_or_none(video_data.get('data-asset-created-date')),
+ 'duration': int_or_none(video_data.get('data-duration')),
}
diff --git a/youtube_dl/extractor/freshlive.py b/youtube_dl/extractor/freshlive.py
index a90f9156c..72a845945 100644
--- a/youtube_dl/extractor/freshlive.py
+++ b/youtube_dl/extractor/freshlive.py
@@ -56,9 +56,8 @@ class FreshLiveIE(InfoExtractor):
is_live = info.get('liveStreamUrl') is not None
formats = self._extract_m3u8_formats(
- stream_url, video_id, ext='mp4',
- entry_protocol='m3u8' if is_live else 'm3u8_native',
- m3u8_id='hls')
+ stream_url, video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls')
if is_live:
title = self._live_title(title)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 3fe0237b6..274f81738 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -84,6 +84,7 @@ from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
from .openload import OpenloadIE
from .videopress import VideoPressIE
+from .rutube import RutubeIE
class GenericIE(InfoExtractor):
@@ -448,6 +449,23 @@ class GenericIE(InfoExtractor):
},
}],
},
+ {
+ # Brightcove with UUID in videoPlayer
+ 'url': 'http://www8.hp.com/cn/zh/home.html',
+ 'info_dict': {
+ 'id': '5255815316001',
+ 'ext': 'mp4',
+ 'title': 'Sprocket Video - China',
+ 'description': 'Sprocket Video - China',
+ 'uploader': 'HP-Video Gallery',
+ 'timestamp': 1482263210,
+ 'upload_date': '20161220',
+ 'uploader_id': '1107601872001',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ },
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -884,12 +902,13 @@ class GenericIE(InfoExtractor):
},
# LazyYT
{
- 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
+ 'url': 'https://skiplagged.com/',
'info_dict': {
- 'id': '1986',
- 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
+ 'id': 'skiplagged',
+ 'title': 'Skiplagged: The smart way to find cheap flights',
},
- 'playlist_mincount': 2,
+ 'playlist_mincount': 1,
+ 'add_ie': ['Youtube'],
},
# Cinchcast embed
{
@@ -1517,10 +1536,38 @@ class GenericIE(InfoExtractor):
'add_ie': [VideoPressIE.ie_key()],
},
{
+ # Rutube embed
+ 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2',
+ 'info_dict': {
+ 'id': '9b3d5bee0a8740bf70dfd29d3ea43541',
+ 'ext': 'flv',
+ 'title': 'Магаззино: Казань 2',
+ 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a',
+ 'uploader': 'Магаззино',
+ 'upload_date': '20170228',
+ 'uploader_id': '996642',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [RutubeIE.ie_key()],
+ },
+ {
# ThePlatform embedded with whitespaces in URLs
'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
'only_matching': True,
},
+ {
+ # Senate ISVP iframe https
+ 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security',
+ 'md5': 'fb8c70b0b515e5037981a2492099aab8',
+ 'info_dict': {
+ 'id': 'govtaff020316',
+ 'ext': 'mp4',
+ 'title': 'Integrated Senate Video Player',
+ },
+ 'add_ie': [SenateISVPIE.ie_key()],
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -1820,14 +1867,6 @@ class GenericIE(InfoExtractor):
video_description = self._og_search_description(webpage, default=None)
video_thumbnail = self._og_search_thumbnail(webpage, default=None)
- # Helper method
- def _playlist_from_matches(matches, getter=None, ie=None):
- urlrs = orderedSet(
- self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
- for m in matches)
- return self.playlist_result(
- urlrs, playlist_id=video_id, playlist_title=video_title)
-
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
@@ -1848,28 +1887,28 @@ class GenericIE(InfoExtractor):
# Look for Brightcove New Studio embeds
bc_urls = BrightcoveNewIE._extract_urls(webpage)
if bc_urls:
- return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+ return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
# Look for ThePlatform embeds
tp_urls = ThePlatformIE._extract_urls(webpage)
if tp_urls:
- return _playlist_from_matches(tp_urls, ie='ThePlatform')
+ return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
# Look for Vessel embeds
vessel_urls = VesselIE._extract_urls(webpage)
if vessel_urls:
- return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
+ return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
webpage)
if matches:
- return _playlist_from_matches(matches, ie='RtlNl')
+ return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
vimeo_urls = VimeoIE._extract_urls(url, webpage)
if vimeo_urls:
- return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key())
+ return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
@@ -1891,25 +1930,25 @@ class GenericIE(InfoExtractor):
(?:embed|v|p)/.+?)
\1''', webpage)
if matches:
- return _playlist_from_matches(
- matches, lambda m: unescapeHTML(m[1]))
+ return self.playlist_from_matches(
+ matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
# Look for lazyYT YouTube embed
matches = re.findall(
r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
if matches:
- return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
+ return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
# Look for Wordpress "YouTube Video Importer" plugin
matches = re.findall(r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
if matches:
- return _playlist_from_matches(matches, lambda m: m[-1])
+ return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
matches = DailymotionIE._extract_urls(webpage)
if matches:
- return _playlist_from_matches(matches)
+ return self.playlist_from_matches(matches, video_id, video_title)
# Look for embedded Dailymotion playlist player (#3822)
m = re.search(
@@ -1918,8 +1957,8 @@ class GenericIE(InfoExtractor):
playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists:
- return _playlist_from_matches(
- playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
+ return self.playlist_from_matches(
+ playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
match = re.search(
@@ -2026,8 +2065,9 @@ class GenericIE(InfoExtractor):
if mobj is not None:
embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
if embeds:
- return _playlist_from_matches(
- embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
+ return self.playlist_from_matches(
+ embeds, video_id, video_title,
+ getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
# Look for Aparat videos
mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@@ -2089,13 +2129,13 @@ class GenericIE(InfoExtractor):
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
if matches:
- return _playlist_from_matches(
- matches, getter=unescapeHTML, ie='FunnyOrDie')
+ return self.playlist_from_matches(
+ matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:
- return _playlist_from_matches(matches, ie='BBCCoUk')
+ return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')
# Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage)
@@ -2110,32 +2150,32 @@ class GenericIE(InfoExtractor):
# Look for embedded SportBox player
sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
if sportbox_urls:
- return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+ return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
# Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
if xhamster_urls:
- return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+ return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')
# Look for embedded TNAFlixNetwork player
tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
if tnaflix_urls:
- return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+ return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())
# Look for embedded PornHub player
pornhub_urls = PornHubIE._extract_urls(webpage)
if pornhub_urls:
- return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key())
+ return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())
# Look for embedded DrTuber player
drtuber_urls = DrTuberIE._extract_urls(webpage)
if drtuber_urls:
- return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key())
+ return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())
# Look for embedded RedTube player
redtube_urls = RedTubeIE._extract_urls(webpage)
if redtube_urls:
- return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key())
+ return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
# Look for embedded Tvigle player
mobj = re.search(
@@ -2181,12 +2221,12 @@ class GenericIE(InfoExtractor):
# Look for embedded soundcloud player
soundcloud_urls = SoundcloudIE._extract_urls(webpage)
if soundcloud_urls:
- return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
+ return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
# Look for tunein player
tunein_urls = TuneInBaseIE._extract_urls(webpage)
if tunein_urls:
- return _playlist_from_matches(tunein_urls)
+ return self.playlist_from_matches(tunein_urls, video_id, video_title)
# Look for embedded mtvservices player
mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
@@ -2469,30 +2509,36 @@ class GenericIE(InfoExtractor):
# Look for DBTV embeds
dbtv_urls = DBTVIE._extract_urls(webpage)
if dbtv_urls:
- return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
+ return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())
# Look for Videa embeds
videa_urls = VideaIE._extract_urls(webpage)
if videa_urls:
- return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key())
+ return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())
# Look for 20 minuten embeds
twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
if twentymin_urls:
- return _playlist_from_matches(
- twentymin_urls, ie=TwentyMinutenIE.ie_key())
+ return self.playlist_from_matches(
+ twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
# Look for Openload embeds
openload_urls = OpenloadIE._extract_urls(webpage)
if openload_urls:
- return _playlist_from_matches(
- openload_urls, ie=OpenloadIE.ie_key())
+ return self.playlist_from_matches(
+ openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
# Look for VideoPress embeds
videopress_urls = VideoPressIE._extract_urls(webpage)
if videopress_urls:
- return _playlist_from_matches(
- videopress_urls, ie=VideoPressIE.ie_key())
+ return self.playlist_from_matches(
+ videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())
+
+ # Look for Rutube embeds (pass page id/title like every other embed branch)
+ rutube_urls = RutubeIE._extract_urls(webpage)
+ if rutube_urls:
+ return self.playlist_from_matches(
+ rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
@@ -2521,7 +2567,11 @@ class GenericIE(InfoExtractor):
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
- return self._parse_jwplayer_data(jwplayer_data, video_id)
+ info = self._parse_jwplayer_data(
+ jwplayer_data, video_id, require_title=False)
+ if not info.get('title'):
+ info['title'] = video_title
+ return info
def check_video(vurl):
if YoutubeIE.suitable(vurl):
@@ -2596,11 +2646,14 @@ class GenericIE(InfoExtractor):
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
- self.report_following_redirect(new_url)
- return {
- '_type': 'url',
- 'url': new_url,
- }
+ if new_url != url:
+ self.report_following_redirect(new_url)
+ return {
+ '_type': 'url',
+ 'url': new_url,
+ }
+ else:
+ found = None
if not found:
# twitter:player is a https URL to iframe player that may or may not
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
index 21ed846b2..4c9be47b4 100644
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@@ -36,7 +36,7 @@ class GoIE(AdobePassIE):
'requestor_id': 'DisneyXD',
}
}
- _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
+ _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_TESTS = [{
'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
'info_dict': {
@@ -52,6 +52,12 @@ class GoIE(AdobePassIE):
}, {
'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',
'only_matching': True,
+ }, {
+ 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py
index 8116ad9bd..931f71a5a 100644
--- a/youtube_dl/extractor/hbo.py
+++ b/youtube_dl/extractor/hbo.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
xpath_text,
xpath_element,
@@ -14,14 +15,26 @@ from ..utils import (
class HBOBaseIE(InfoExtractor):
_FORMATS_INFO = {
+ 'pro7': {
+ 'width': 1280,
+ 'height': 720,
+ },
'1920': {
'width': 1280,
'height': 720,
},
+ 'pro6': {
+ 'width': 768,
+ 'height': 432,
+ },
'640': {
'width': 768,
'height': 432,
},
+ 'pro5': {
+ 'width': 640,
+ 'height': 360,
+ },
'highwifi': {
'width': 640,
'height': 360,
@@ -78,6 +91,17 @@ class HBOBaseIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
video_url.replace('.tar', '/base_index_w8.m3u8'),
video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ elif source.tag == 'hls':
+ # #EXT-X-BYTERANGE is not supported by native hls downloader
+ # and ffmpeg (#10955)
+ # formats.extend(self._extract_m3u8_formats(
+ # video_url.replace('.tar', '/base_index.m3u8'),
+ # video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ continue
+ elif source.tag == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ video_url.replace('.tar', '/manifest.mpd'),
+ video_id, mpd_id='dash', fatal=False))
else:
format_info = self._FORMATS_INFO.get(source.tag, {})
formats.append({
@@ -112,10 +136,11 @@ class HBOBaseIE(InfoExtractor):
class HBOIE(HBOBaseIE):
+ IE_NAME = 'hbo'
_VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839',
- 'md5': '1c33253f0c7782142c993c0ba62a8753',
+ 'md5': '2c6a6bc1222c7e91cb3334dad1746e5a',
'info_dict': {
'id': '1437839',
'ext': 'mp4',
@@ -131,11 +156,12 @@ class HBOIE(HBOBaseIE):
class HBOEpisodeIE(HBOBaseIE):
- _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?!video)([^/]+/)+video/(?P<id>[0-9a-z-]+)\.html'
+ IE_NAME = 'hbo:episode'
+ _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P<path>(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P<id>[0-9a-z-]+))(?:\.html)?'
_TESTS = [{
'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true',
- 'md5': '689132b253cc0ab7434237fc3a293210',
+ 'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb',
'info_dict': {
'id': '1439518',
'display_id': 'ep-52-inside-the-episode',
@@ -147,16 +173,19 @@ class HBOEpisodeIE(HBOBaseIE):
}, {
'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true',
'only_matching': True,
+ }, {
+ 'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ path, display_id = re.match(self._VALID_URL, url).groups()
- webpage = self._download_webpage(url, display_id)
+ content = self._download_json(
+ 'http://www.hbo.com/api/content/' + path, display_id)['content']
- video_id = self._search_regex(
- r'(?P<q1>[\'"])videoId(?P=q1)\s*:\s*(?P<q2>[\'"])(?P<video_id>\d+)(?P=q2)',
- webpage, 'video ID', group='video_id')
+ video_id = compat_str((content.get('parsed', {}).get(
+ 'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId'])
info_dict = self._extract_from_id(video_id)
info_dict['display_id'] = display_id
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index c863413bf..7f946c6ed 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -119,7 +119,8 @@ class LivestreamIE(InfoExtractor):
m3u8_url = video_data.get('m3u8_url')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
f4m_url = video_data.get('f4m_url')
if f4m_url:
@@ -158,11 +159,11 @@ class LivestreamIE(InfoExtractor):
if smil_url:
formats.extend(self._extract_smil_formats(smil_url, broadcast_id))
- entry_protocol = 'm3u8' if is_live else 'm3u8_native'
m3u8_url = stream_info.get('m3u8_url')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
- m3u8_url, broadcast_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False))
+ m3u8_url, broadcast_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
rtsp_url = stream_info.get('rtsp_url')
if rtsp_url:
@@ -276,7 +277,7 @@ class LivestreamOriginalIE(InfoExtractor):
'view_count': view_count,
}
- def _extract_video_formats(self, video_data, video_id, entry_protocol):
+ def _extract_video_formats(self, video_data, video_id):
formats = []
progressive_url = video_data.get('progressiveUrl')
@@ -289,7 +290,8 @@ class LivestreamOriginalIE(InfoExtractor):
m3u8_url = video_data.get('httpUrl')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False))
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
rtsp_url = video_data.get('rtspUrl')
if rtsp_url:
@@ -340,11 +342,10 @@ class LivestreamOriginalIE(InfoExtractor):
}
video_data = self._download_json(stream_url, content_id)
is_live = video_data.get('isLive')
- entry_protocol = 'm3u8' if is_live else 'm3u8_native'
info.update({
'id': content_id,
'title': self._live_title(info['title']) if is_live else info['title'],
- 'formats': self._extract_video_formats(video_data, content_id, entry_protocol),
+ 'formats': self._extract_video_formats(video_data, content_id),
'is_live': is_live,
})
return info
diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py
new file mode 100644
index 000000000..6e067474b
--- /dev/null
+++ b/youtube_dl/extractor/medialaan.py
@@ -0,0 +1,259 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_duration,
+ try_get,
+ unified_timestamp,
+ urlencode_postdata,
+)
+
+
+class MedialaanIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?
+ (?:
+ (?P<site_id>vtm|q2|vtmkzoom)\.be/
+ (?:
+ video(?:/[^/]+/id/|/?\?.*?\baid=)|
+ (?:[^/]+/)*
+ )
+ )
+ (?P<id>[^/?#&]+)
+ '''
+ _NETRC_MACHINE = 'medialaan'
+ _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-'
+ _SITE_TO_APP_ID = {
+ 'vtm': 'vtm_watch',
+ 'q2': 'q2',
+ 'vtmkzoom': 'vtmkzoom',
+ }
+ _TESTS = [{
+ # vod
+ 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch',
+ 'info_dict': {
+ 'id': 'vtm_20170219_VM0678361_vtmwatch',
+ 'ext': 'mp4',
+ 'title': 'Allemaal Chris afl. 6',
+ 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2',
+ 'timestamp': 1487533280,
+ 'upload_date': '20170219',
+ 'duration': 2562,
+ 'series': 'Allemaal Chris',
+ 'season': 'Allemaal Chris',
+ 'season_number': 1,
+ 'season_id': '256936078124527',
+ 'episode': 'Allemaal Chris afl. 6',
+ 'episode_number': 6,
+ 'episode_id': '256936078591527',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Requires account credentials',
+ }, {
+ # clip
+ 'url': 'http://vtm.be/video?aid=168332',
+ 'info_dict': {
+ 'id': '168332',
+ 'ext': 'mp4',
+ 'title': '"Veronique liegt!"',
+ 'description': 'md5:1385e2b743923afe54ba4adc38476155',
+ 'timestamp': 1489002029,
+ 'upload_date': '20170308',
+ 'duration': 96,
+ },
+ }, {
+ # vod
+ 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000',
+ 'only_matching': True,
+ }, {
+ # vod
+ 'url': 'http://vtm.be/video?aid=163157',
+ 'only_matching': True,
+ }, {
+ # vod
+ 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2',
+ 'only_matching': True,
+ }, {
+ # clip
+ 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',
+ 'only_matching': True,
+ }]
+
+ def _real_initialize(self):
+ self._logged_in = False
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ self.raise_login_required()
+
+ auth_data = {
+ 'APIKey': self._APIKEY,
+ 'sdk': 'js_6.1',
+ 'format': 'json',
+ 'loginID': username,
+ 'password': password,
+ }
+
+ auth_info = self._download_json(
+ 'https://accounts.eu1.gigya.com/accounts.login', None,
+ note='Logging in', errnote='Unable to log in',
+ data=urlencode_postdata(auth_data))
+
+ error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage')
+ if error_message:
+ raise ExtractorError(
+ 'Unable to login: %s' % error_message, expected=True)
+
+ self._uid = auth_info['UID']
+ self._uid_signature = auth_info['UIDSignature']
+ self._signature_timestamp = auth_info['signatureTimestamp']
+
+ self._logged_in = True
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id, site_id = mobj.group('id', 'site_id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ config = self._parse_json(
+ self._search_regex(
+ r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);',
+ webpage, 'config', default='{}'), video_id,
+ transform_source=lambda s: s.replace(
+ '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'"))
+
+ vod_id = config.get('vodId') or self._search_regex(
+ (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"',
+ r'<[^>]+id=["\']vod-(\d+)'),
+ webpage, 'video_id', default=None)
+
+ # clip, no authentication required
+ if not vod_id:
+ player = self._parse_json(
+ self._search_regex(
+ r'vmmaplayer\(({.+?})\);', webpage, 'vmma player',
+ default=''),
+ video_id, transform_source=lambda s: '[%s]' % s, fatal=False)
+ if player:
+ video = player[-1]
+ info = {
+ 'id': video_id,
+ 'url': video['videoUrl'],
+ 'title': video['title'],
+ 'thumbnail': video.get('imageUrl'),
+ 'timestamp': int_or_none(video.get('createdDate')),
+ 'duration': int_or_none(video.get('duration')),
+ }
+ else:
+ info = self._parse_html5_media_entries(
+ url, webpage, video_id, m3u8_id='hls')[0]
+ info.update({
+ 'id': video_id,
+ 'title': self._html_search_meta('description', webpage),
+ 'duration': parse_duration(self._html_search_meta('duration', webpage)),
+ })
+ # vod, authentication required
+ else:
+ if not self._logged_in:
+ self._login()
+
+ settings = self._parse_json(
+ self._search_regex(
+ r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+ webpage, 'drupal settings', default='{}'),
+ video_id)
+
+ def get(container, item):
+ return try_get(
+ settings, lambda x: x[container][item],
+ compat_str) or self._search_regex(
+ r'"%s"\s*:\s*"([^"]+)' % item, webpage, item,
+ default=None)
+
+ app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch')
+ sso = get('vod', 'gigyaDatabase') or 'vtm-sso'
+
+ data = self._download_json(
+ 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id,
+ video_id, query={
+ 'app_id': app_id,
+ 'user_network': sso,
+ 'UID': self._uid,
+ 'UIDSignature': self._uid_signature,
+ 'signatureTimestamp': self._signature_timestamp,
+ })
+
+ formats = self._extract_m3u8_formats(
+ data['response']['uri'], video_id, entry_protocol='m3u8_native',
+ ext='mp4', m3u8_id='hls')
+
+ self._sort_formats(formats)
+
+ info = {
+ 'id': vod_id,
+ 'formats': formats,
+ }
+
+ api_key = get('vod', 'apiKey')
+ channel = get('medialaanGigya', 'channel')
+
+ if api_key:
+ videos = self._download_json(
+ 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False,
+ query={
+ 'channels': channel,
+ 'ids': vod_id,
+ 'limit': 1,
+ 'apikey': api_key,
+ })
+ if videos:
+ video = try_get(
+ videos, lambda x: x['response']['videos'][0], dict)
+ if video:
+ def get(container, item, expected_type=None):
+ return try_get(
+ video, lambda x: x[container][item], expected_type)
+
+ def get_string(container, item):
+ return get(container, item, compat_str)
+
+ info.update({
+ 'series': get_string('program', 'title'),
+ 'season': get_string('season', 'title'),
+ 'season_number': int_or_none(get('season', 'number')),
+ 'season_id': get_string('season', 'id'),
+ 'episode': get_string('episode', 'title'),
+ 'episode_number': int_or_none(get('episode', 'number')),
+ 'episode_id': get_string('episode', 'id'),
+ 'duration': int_or_none(
+ video.get('duration')) or int_or_none(
+ video.get('durationMillis'), scale=1000),
+ 'title': get_string('episode', 'title'),
+ 'description': get_string('episode', 'text'),
+ 'timestamp': unified_timestamp(get_string(
+ 'publication', 'begin')),
+ })
+
+ if not info.get('title'):
+ info['title'] = try_get(
+ config, lambda x: x['videoConfig']['title'],
+ compat_str) or self._html_search_regex(
+ r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title',
+ default=None) or self._og_search_title(webpage)
+
+ if not info.get('description'):
+ info['description'] = self._html_search_regex(
+ r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>',
+ webpage, 'description', default=None)
+
+ return info
diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py
index ec1b4c4fe..40f72d66f 100644
--- a/youtube_dl/extractor/miomio.py
+++ b/youtube_dl/extractor/miomio.py
@@ -51,6 +51,7 @@ class MioMioIE(InfoExtractor):
'ext': 'mp4',
'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31',
},
+ 'skip': 'Unable to load videos',
}]
def _extract_mioplayer(self, webpage, video_id, title, http_headers):
@@ -94,9 +95,18 @@ class MioMioIE(InfoExtractor):
return entries
+ def _download_chinese_webpage(self, *args, **kwargs):
+ # Requests with English locales return garbage
+ headers = {
+ 'Accept-Language': 'zh-TW,en-US;q=0.7,en;q=0.3',
+ }
+ kwargs.setdefault('headers', {}).update(headers)
+ return self._download_webpage(*args, **kwargs)
+
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_chinese_webpage(
+ url, video_id)
title = self._html_search_meta(
'description', webpage, 'title', fatal=True)
@@ -106,7 +116,7 @@ class MioMioIE(InfoExtractor):
if '_h5' in mioplayer_path:
player_url = compat_urlparse.urljoin(url, mioplayer_path)
- player_webpage = self._download_webpage(
+ player_webpage = self._download_chinese_webpage(
player_url, video_id,
note='Downloading player webpage', headers={'Referer': url})
entries = self._parse_html5_media_entries(player_url, player_webpage, video_id)
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 79e0b8ada..28b743cca 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import uuid
from .common import InfoExtractor
+from .ooyala import OoyalaIE
from ..compat import (
compat_str,
compat_urllib_parse_urlencode,
@@ -24,6 +25,9 @@ class MiTeleBaseIE(InfoExtractor):
r'(?s)(<ms-video-player.+?</ms-video-player>)',
webpage, 'ms video player'))
video_id = player_data['data-media-id']
+ if player_data.get('data-cms-id') == 'ooyala':
+ return self.url_result(
+ 'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id)
config_url = compat_urlparse.urljoin(url, player_data['data-config'])
config = self._download_json(
config_url, video_id, 'Downloading config JSON')
diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py
index d9943fc2c..8961309fd 100644
--- a/youtube_dl/extractor/ninecninemedia.py
+++ b/youtube_dl/extractor/ninecninemedia.py
@@ -34,12 +34,6 @@ class NineCNineMediaStackIE(NineCNineMediaBaseIE):
formats.extend(self._extract_f4m_formats(
stack_base_url + 'f4m', stack_id,
f4m_id='hds', fatal=False))
- mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False)
- if mp4_url:
- formats.append({
- 'url': mp4_url,
- 'format_id': 'mp4',
- })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 50473d777..38fefe492 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -3,41 +3,27 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
+ determine_ext,
+ ExtractorError,
fix_xml_ampersands,
orderedSet,
parse_duration,
qualities,
strip_jsonp,
unified_strdate,
- ExtractorError,
)
class NPOBaseIE(InfoExtractor):
def _get_token(self, video_id):
- token_page = self._download_webpage(
- 'http://ida.omroep.nl/npoplayer/i.js',
- video_id, note='Downloading token')
- token = self._search_regex(
- r'npoplayer\.token = "(.+?)"', token_page, 'token')
- # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js
- token_l = list(token)
- first = second = None
- for i in range(5, len(token_l) - 4):
- if token_l[i].isdigit():
- if first is None:
- first = i
- elif second is None:
- second = i
- if first is None or second is None:
- first = 12
- second = 13
-
- token_l[first], token_l[second] = token_l[second], token_l[first]
-
- return ''.join(token_l)
+ return self._download_json(
+ 'http://ida.omroep.nl/app.php/auth', video_id,
+ note='Downloading token')['token']
class NPOIE(NPOBaseIE):
@@ -58,103 +44,113 @@ class NPOIE(NPOBaseIE):
(?P<id>[^/?#]+)
'''
- _TESTS = [
- {
- 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719',
- 'md5': '4b3f9c429157ec4775f2c9cb7b911016',
- 'info_dict': {
- 'id': 'VPWON_1220719',
- 'ext': 'm4v',
- 'title': 'Nieuwsuur',
- 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
- 'upload_date': '20140622',
- },
+ _TESTS = [{
+ 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719',
+ 'md5': '4b3f9c429157ec4775f2c9cb7b911016',
+ 'info_dict': {
+ 'id': 'VPWON_1220719',
+ 'ext': 'm4v',
+ 'title': 'Nieuwsuur',
+ 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
+ 'upload_date': '20140622',
},
- {
- 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
- 'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
- 'info_dict': {
- 'id': 'VARA_101191800',
- 'ext': 'm4v',
- 'title': 'De Mega Mike & Mega Thomas show: The best of.',
- 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
- 'upload_date': '20090227',
- 'duration': 2400,
- },
+ }, {
+ 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
+ 'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
+ 'info_dict': {
+ 'id': 'VARA_101191800',
+ 'ext': 'm4v',
+ 'title': 'De Mega Mike & Mega Thomas show: The best of.',
+ 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
+ 'upload_date': '20090227',
+ 'duration': 2400,
},
- {
- 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289',
- 'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
- 'info_dict': {
- 'id': 'VPWON_1169289',
- 'ext': 'm4v',
- 'title': 'Tegenlicht: De toekomst komt uit Afrika',
- 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
- 'upload_date': '20130225',
- 'duration': 3000,
- },
+ }, {
+ 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289',
+ 'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
+ 'info_dict': {
+ 'id': 'VPWON_1169289',
+ 'ext': 'm4v',
+ 'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika',
+ 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
+ 'upload_date': '20130225',
+ 'duration': 3000,
},
- {
- 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
- 'info_dict': {
- 'id': 'WO_VPRO_043706',
- 'ext': 'wmv',
- 'title': 'De nieuwe mens - Deel 1',
- 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
- 'duration': 4680,
- },
- 'params': {
- # mplayer mms download
- 'skip_download': True,
- }
+ }, {
+ 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
+ 'info_dict': {
+ 'id': 'WO_VPRO_043706',
+ 'ext': 'm4v',
+ 'title': 'De nieuwe mens - Deel 1',
+ 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
+ 'duration': 4680,
},
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
# non asf in streams
- {
- 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
- 'md5': 'b3da13de374cbe2d5332a7e910bef97f',
- 'info_dict': {
- 'id': 'WO_NOS_762771',
- 'ext': 'mp4',
- 'title': 'Hoe gaat Europa verder na Parijs?',
- },
- },
- {
- 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
- 'md5': '01c6a2841675995da1f0cf776f03a9c3',
- 'info_dict': {
- 'id': 'VPWON_1233944',
- 'ext': 'm4v',
- 'title': 'Aap, poot, pies',
- 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
- 'upload_date': '20150508',
- 'duration': 599,
- },
+ 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
+ 'info_dict': {
+ 'id': 'WO_NOS_762771',
+ 'ext': 'mp4',
+ 'title': 'Hoe gaat Europa verder na Parijs?',
},
- {
- 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
- 'md5': 'd30cd8417b8b9bca1fdff27428860d08',
- 'info_dict': {
- 'id': 'POW_00996502',
- 'ext': 'm4v',
- 'title': '''"Dit is wel een 'landslide'..."''',
- 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
- 'upload_date': '20150508',
- 'duration': 462,
- },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
+ 'info_dict': {
+ 'id': 'VPWON_1233944',
+ 'ext': 'm4v',
+ 'title': 'Aap, poot, pies',
+ 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
+ 'upload_date': '20150508',
+ 'duration': 599,
},
- {
- 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
- 'only_matching': True,
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
+ 'info_dict': {
+ 'id': 'POW_00996502',
+ 'ext': 'm4v',
+ 'title': '''"Dit is wel een 'landslide'..."''',
+ 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
+ 'upload_date': '20150508',
+ 'duration': 462,
},
- {
- 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118',
- 'only_matching': True,
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # audio
+ 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437',
+ 'info_dict': {
+ 'id': 'RBX_FUNX_6683215',
+ 'ext': 'mp3',
+ 'title': 'Jouw Stad Rotterdam',
+ 'description': 'md5:db251505244f097717ec59fabc372d9f',
},
- {
- 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
- 'only_matching': True,
+ 'params': {
+ 'skip_download': True,
}
- ]
+ }, {
+ 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
+ 'only_matching': True,
+ }, {
+ # live stream
+ 'url': 'npo:LI_NL1_4188102',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -183,70 +179,115 @@ class NPOIE(NPOBaseIE):
token = self._get_token(video_id)
formats = []
+ urls = set()
+
+ quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
+ items = self._download_json(
+ 'http://ida.omroep.nl/app.php/%s' % video_id, video_id,
+ 'Downloading formats JSON', query={
+ 'adaptive': 'yes',
+ 'token': token,
+ })['items'][0]
+ for num, item in enumerate(items):
+ item_url = item.get('url')
+ if not item_url or item_url in urls:
+ continue
+ urls.add(item_url)
+ format_id = self._search_regex(
+ r'video/ida/([^/]+)', item_url, 'format id',
+ default=None)
+
+ def add_format_url(format_url):
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ })
+
+ # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
+ if item.get('contentType') in ('url', 'audio'):
+ add_format_url(item_url)
+ continue
- pubopties = metadata.get('pubopties')
- if pubopties:
- quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
- for format_id in pubopties:
- format_info = self._download_json(
- 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s'
- % (video_id, format_id, token),
- video_id, 'Downloading %s JSON' % format_id)
- if format_info.get('error_code', 0) or format_info.get('errorcode', 0):
+ try:
+ stream_info = self._download_json(
+ item_url + '&type=json', video_id,
+ 'Downloading %s stream JSON'
+ % (item.get('label') or item.get('format') or format_id or num))
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+ error = (self._parse_json(
+ ee.cause.read().decode(), video_id,
+ fatal=False) or {}).get('errorstring')
+ if error:
+ raise ExtractorError(error, expected=True)
+ raise
+ # Stream URL instead of JSON, example: npo:LI_NL1_4188102
+ if isinstance(stream_info, compat_str):
+ if not stream_info.startswith('http'):
continue
- streams = format_info.get('streams')
- if streams:
- try:
- video_info = self._download_json(
- streams[0] + '&type=json',
- video_id, 'Downloading %s stream JSON' % format_id)
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
- error = (self._parse_json(ee.cause.read().decode(), video_id, fatal=False) or {}).get('errorstring')
- if error:
- raise ExtractorError(error, expected=True)
- raise
- else:
- video_info = format_info
- video_url = video_info.get('url')
- if not video_url:
+ video_url = stream_info
+ # JSON
+ else:
+ video_url = stream_info.get('url')
+ if not video_url or video_url in urls:
+ continue
+ urls.add(video_url)
+ if determine_ext(video_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ add_format_url(video_url)
+
+ is_live = metadata.get('medium') == 'live'
+
+ if not is_live:
+ for num, stream in enumerate(metadata.get('streams', [])):
+ stream_url = stream.get('url')
+ if not stream_url or stream_url in urls:
continue
- if format_id == 'adaptive':
- formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4'))
- else:
+ urls.add(stream_url)
+ # smooth streaming is not supported
+ stream_type = stream.get('type', '').lower()
+ if stream_type in ['ss', 'ms']:
+ continue
+ if stream_type == 'hds':
+ f4m_formats = self._extract_f4m_formats(
+ stream_url, video_id, fatal=False)
+ # f4m downloader downloads only piece of live stream
+ for f4m_format in f4m_formats:
+ f4m_format['preference'] = -1
+ formats.extend(f4m_formats)
+ elif stream_type == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ stream_url, video_id, ext='mp4', fatal=False))
+ # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
+ elif '.asf' in stream_url:
+ asx = self._download_xml(
+ stream_url, video_id,
+ 'Downloading stream %d ASX playlist' % num,
+ transform_source=fix_xml_ampersands, fatal=False)
+ if not asx:
+ continue
+ ref = asx.find('./ENTRY/Ref')
+ if ref is None:
+ continue
+ video_url = ref.get('href')
+ if not video_url or video_url in urls:
+ continue
+ urls.add(video_url)
formats.append({
'url': video_url,
- 'format_id': format_id,
- 'quality': quality(format_id),
+ 'ext': stream.get('formaat', 'asf'),
+ 'quality': stream.get('kwaliteit'),
+ 'preference': -10,
})
-
- streams = metadata.get('streams')
- if streams:
- for i, stream in enumerate(streams):
- stream_url = stream.get('url')
- if not stream_url:
- continue
- if '.asf' not in stream_url:
+ else:
formats.append({
'url': stream_url,
'quality': stream.get('kwaliteit'),
})
- continue
- asx = self._download_xml(
- stream_url, video_id,
- 'Downloading stream %d ASX playlist' % i,
- transform_source=fix_xml_ampersands)
- ref = asx.find('./ENTRY/Ref')
- if ref is None:
- continue
- video_url = ref.get('href')
- if not video_url:
- continue
- formats.append({
- 'url': video_url,
- 'ext': stream.get('formaat', 'asf'),
- 'quality': stream.get('kwaliteit'),
- })
self._sort_formats(formats)
@@ -259,28 +300,28 @@ class NPOIE(NPOBaseIE):
return {
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if is_live else title,
'description': metadata.get('info'),
'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
'upload_date': unified_strdate(metadata.get('gidsdatum')),
'duration': parse_duration(metadata.get('tijdsduur')),
'formats': formats,
'subtitles': subtitles,
+ 'is_live': is_live,
}
class NPOLiveIE(NPOBaseIE):
IE_NAME = 'npo.nl:live'
- _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://www.npo.nl/live/npo-1',
'info_dict': {
- 'id': 'LI_NEDERLAND1_136692',
+ 'id': 'LI_NL1_4188102',
'display_id': 'npo-1',
'ext': 'mp4',
- 'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- 'description': 'Livestream',
+ 'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'is_live': True,
},
'params': {
@@ -296,58 +337,12 @@ class NPOLiveIE(NPOBaseIE):
live_id = self._search_regex(
r'data-prid="([^"]+)"', webpage, 'live id')
- metadata = self._download_json(
- 'http://e.omroep.nl/metadata/%s' % live_id,
- display_id, transform_source=strip_jsonp)
-
- token = self._get_token(display_id)
-
- formats = []
-
- streams = metadata.get('streams')
- if streams:
- for stream in streams:
- stream_type = stream.get('type').lower()
- # smooth streaming is not supported
- if stream_type in ['ss', 'ms']:
- continue
- stream_info = self._download_json(
- 'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp'
- % (stream.get('url'), token),
- display_id, 'Downloading %s JSON' % stream_type)
- if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0):
- continue
- stream_url = self._download_json(
- stream_info['stream'], display_id,
- 'Downloading %s URL' % stream_type,
- 'Unable to download %s URL' % stream_type,
- transform_source=strip_jsonp, fatal=False)
- if not stream_url:
- continue
- if stream_type == 'hds':
- f4m_formats = self._extract_f4m_formats(stream_url, display_id)
- # f4m downloader downloads only piece of live stream
- for f4m_format in f4m_formats:
- f4m_format['preference'] = -1
- formats.extend(f4m_formats)
- elif stream_type == 'hls':
- formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4'))
- else:
- formats.append({
- 'url': stream_url,
- 'preference': -10,
- })
-
- self._sort_formats(formats)
-
return {
+ '_type': 'url_transparent',
+ 'url': 'npo:%s' % live_id,
+ 'ie_key': NPOIE.ie_key(),
'id': live_id,
'display_id': display_id,
- 'title': self._live_title(metadata['titel']),
- 'description': metadata['info'],
- 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
- 'formats': formats,
- 'is_live': True,
}
diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py
index fc7ff43a6..58ffde541 100644
--- a/youtube_dl/extractor/openload.py
+++ b/youtube_dl/extractor/openload.py
@@ -75,22 +75,51 @@ class OpenloadIE(InfoExtractor):
'<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>',
webpage, 'openload ID')
- first_char = int(ol_id[0])
- urlcode = []
- num = 1
-
- while num < len(ol_id):
- i = ord(ol_id[num])
- key = 0
- if i <= 90:
- key = i - 65
- elif i >= 97:
- key = 25 + i - 97
- urlcode.append((key, compat_chr(int(ol_id[num + 2:num + 5]) // int(ol_id[num + 1]) - first_char)))
- num += 5
-
- video_url = 'https://openload.co/stream/' + ''.join(
- [value for _, value in sorted(urlcode, key=lambda x: x[0])])
+ video_url_chars = []
+
+ first_char = ord(ol_id[0])
+ key = first_char - 55
+ maxKey = max(2, key)
+ key = min(maxKey, len(ol_id) - 38)
+ t = ol_id[key:key + 36]
+
+ hashMap = {}
+ v = ol_id.replace(t, '')
+ h = 0
+
+ while h < len(t):
+ f = t[h:h + 3]
+ i = int(f, 8)
+ hashMap[h // 3] = i
+ h += 3
+
+ h = 0
+ H = 0
+ while h < len(v):
+ B = ''
+ C = ''
+ if len(v) >= h + 2:
+ B = v[h:h + 2]
+ if len(v) >= h + 3:
+ C = v[h:h + 3]
+ i = int(B, 16)
+ h += 2
+ if H % 3 == 0:
+ i = int(C, 8)
+ h += 1
+ elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60:
+ i = int(C, 10)
+ h += 1
+ index = H % 7
+
+ A = hashMap[index]
+ i ^= 213
+ i ^= A
+ video_url_chars.append(compat_chr(i))
+ H += 1
+
+ video_url = 'https://openload.co/stream/%s?mime=true'
+ video_url = video_url % (''.join(video_url_chars))
title = self._og_search_title(webpage, default=None) or self._search_regex(
r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
index e0cbd045e..e45d9fe55 100644
--- a/youtube_dl/extractor/pluralsight.py
+++ b/youtube_dl/extractor/pluralsight.py
@@ -40,7 +40,7 @@ class PluralsightIE(PluralsightBaseIE):
'info_dict': {
'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
'ext': 'mp4',
- 'title': 'Management of SQL Server - Demo Monitoring',
+ 'title': 'Demo Monitoring',
'duration': 338,
},
'skip': 'Requires pluralsight account credentials',
@@ -169,11 +169,10 @@ class PluralsightIE(PluralsightBaseIE):
collection = course['modules']
- module, clip = None, None
+ clip = None
for module_ in collection:
if name in (module_.get('moduleName'), module_.get('name')):
- module = module_
for clip_ in module_.get('clips', []):
clip_index = clip_.get('clipIndex')
if clip_index is None:
@@ -187,7 +186,7 @@ class PluralsightIE(PluralsightBaseIE):
if not clip:
raise ExtractorError('Unable to resolve clip')
- title = '%s - %s' % (module['title'], clip['title'])
+ title = clip['title']
QUALITIES = {
'low': {'width': 640, 'height': 480},
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 9b413590a..b25f1f193 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -1,7 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import itertools
+import operator
# import os
import re
@@ -18,6 +20,7 @@ from ..utils import (
js_to_json,
orderedSet,
# sanitized_Request,
+ remove_quotes,
str_to_int,
)
# from ..aes import (
@@ -129,9 +132,32 @@ class PornHubIE(InfoExtractor):
tv_webpage = dl_webpage('tv')
- video_url = self._search_regex(
- r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
- 'video url', group='url')
+ assignments = self._search_regex(
+ r'(var.+?mediastring.+?)</script>', tv_webpage,
+ 'encoded url').split(';')
+
+ js_vars = {}
+
+ def parse_js_value(inp):
+ inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
+ if '+' in inp:
+ inps = inp.split('+')
+ return functools.reduce(
+ operator.concat, map(parse_js_value, inps))
+ inp = inp.strip()
+ if inp in js_vars:
+ return js_vars[inp]
+ return remove_quotes(inp)
+
+ for assn in assignments:
+ assn = assn.strip()
+ if not assn:
+ continue
+ assn = re.sub(r'var\s+', '', assn)
+ vname, value = assn.split('=', 1)
+ js_vars[vname] = parse_js_value(value)
+
+ video_url = js_vars['mediastring']
title = self._search_regex(
r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 1245309a7..d8a4bd244 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -301,6 +301,21 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
},
},
{
+ # title in <h2 class="subtitle">
+ 'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip',
+ 'info_dict': {
+ 'id': '4895826',
+ 'ext': 'mp4',
+ 'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe',
+ 'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9',
+ 'upload_date': '20170302',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'geo restricted to Germany',
+ },
+ {
# geo restricted to Germany
'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge',
'only_matching': True,
@@ -338,6 +353,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',
r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>',
+ r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>',
]
_DESCRIPTION_REGEXES = [
r'<p itemprop="description">\s*(.+?)</p>',
@@ -369,7 +385,9 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
def _extract_clip(self, url, webpage):
clip_id = self._html_search_regex(
self._CLIPID_REGEXES, webpage, 'clip id')
- title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
+ title = self._html_search_regex(
+ self._TITLE_REGEXES, webpage, 'title',
+ default=None) or self._og_search_title(webpage)
info = self._extract_video_info(url, clip_id)
description = self._html_search_regex(
self._DESCRIPTION_REGEXES, webpage, 'description', default=None)
diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py
new file mode 100644
index 000000000..afab62426
--- /dev/null
+++ b/youtube_dl/extractor/redbulltv.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    try_get,
+    # unified_timestamp,
+    ExtractorError,
+)
+
+
+class RedBullTVIE(InfoExtractor):
+    """Extractor for redbull.tv video and film pages."""
+
+    _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)'
+    _TESTS = [{
+        # film
+        'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc',
+        'md5': 'fb0445b98aa4394e504b413d98031d1f',
+        'info_dict': {
+            'id': 'AP-1Q756YYX51W11',
+            'ext': 'mp4',
+            'title': 'ABC of...WRC',
+            'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31',
+            'duration': 1582.04,
+            # 'timestamp': 1488405786,
+            # 'upload_date': '20170301',
+        },
+    }, {
+        # episode
+        'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web',
+        'info_dict': {
+            'id': 'AP-1PMT5JCWH1W11',
+            'ext': 'mp4',
+            'title': 'Grime - Hashtags S2 E4',
+            'description': 'md5:334b741c8c1ce65be057eab6773c1cf5',
+            'duration': 904.6,
+            # 'timestamp': 1487290093,
+            # 'upload_date': '20170217',
+            'series': 'Hashtags',
+            'season_number': 2,
+            'episode_number': 4,
+        },
+    }, {
+        'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Build the info dict for the video whose id is embedded in *url*.
+
+        Raises ExtractorError if the session endpoint reports an error, or
+        (flagged as expected) if the content endpoint answers with HTTP 404.
+        """
+        video_id = self._match_id(url)
+
+        # The token returned by the session endpoint is sent as the
+        # Authorization header of the content request below.
+        session = self._download_json(
+            'https://api-v2.redbull.tv/session', video_id,
+            note='Downloading access token', query={
+                'build': '4.370.0',
+                'category': 'personal_computer',
+                'os_version': '1.0',
+                'os_family': 'http',
+            })
+        if session.get('code') == 'error':
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, session['message']))
+        auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token'])
+
+        try:
+            info = self._download_json(
+                'https://api-v2.redbull.tv/content/%s' % video_id,
+                video_id, note='Downloading video information',
+                headers={'Authorization': auth}
+            )
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+                # Decode explicitly as UTF-8: a bare decode() falls back to
+                # the interpreter default codec, which is ASCII on Python 2
+                # and would fail on a non-ASCII error message.
+                error_message = self._parse_json(
+                    e.cause.read().decode('utf-8'), video_id)['message']
+                raise ExtractorError('%s said: %s' % (
+                    self.IE_NAME, error_message), expected=True)
+            raise
+
+        video = info['video_product']
+
+        title = info['title'].strip()
+
+        formats = self._extract_m3u8_formats(
+            video['url'], video_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+
+        # Collect every caption entry that has a URL, keyed by its 'lang'
+        # value ('en' when the entry does not provide one).
+        subtitles = {}
+        for captions in (try_get(
+                video, lambda x: x['attachments']['captions'],
+                dict) or {}).values():
+            if not captions or not isinstance(captions, list):
+                continue
+            for caption in captions:
+                caption_url = caption.get('url')
+                if not caption_url:
+                    continue
+                ext = caption.get('format')
+                if ext == 'xml':
+                    # captions declared as 'xml' are exposed as 'ttml'
+                    ext = 'ttml'
+                subtitles.setdefault(caption.get('lang') or 'en', []).append({
+                    'url': caption_url,
+                    'ext': ext,
+                })
+
+        subheading = info.get('subheading')
+        if subheading:
+            title += ' - %s' % subheading
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': info.get('long_description') or info.get(
+                'short_description'),
+            # scaled down by 1000 (API value is presumably in milliseconds)
+            'duration': float_or_none(video.get('duration'), scale=1000),
+            # 'timestamp': unified_timestamp(info.get('published')),
+            'series': info.get('show_title'),
+            'season_number': int_or_none(info.get('season_number')),
+            'episode_number': int_or_none(info.get('episode_number')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index fd1df925b..889fa7628 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -17,7 +17,7 @@ from ..utils import (
class RutubeIE(InfoExtractor):
IE_NAME = 'rutube'
IE_DESC = 'Rutube videos'
- _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})'
+ _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
_TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
@@ -39,8 +39,17 @@ class RutubeIE(InfoExtractor):
}, {
'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
+ }, {
+ 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
+ 'only_matching': True,
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [mobj.group('url') for mobj in re.finditer(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1',
+ webpage)]
+
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
index 20d01754a..6c09df25a 100644
--- a/youtube_dl/extractor/ruutu.py
+++ b/youtube_dl/extractor/ruutu.py
@@ -82,6 +82,9 @@ class RuutuIE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id='hds', fatal=False))
elif ext == 'mpd':
+ # video-only and audio-only streams are of different
+ # duration resulting in out of sync issue
+ continue
formats.extend(self._extract_mpd_formats(
video_url, video_id, mpd_id='dash', fatal=False))
else:
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index 387a4f7f6..db5ef8b57 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -89,7 +89,7 @@ class SenateISVPIE(InfoExtractor):
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
- r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
+ r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index b3aa4ce26..0ee4a8ff8 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -121,7 +121,7 @@ class SoundcloudIE(InfoExtractor):
},
]
- _CLIENT_ID = 'fDoItMDbsbZz8dY16ZzARCZmzgHBPotA'
+ _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z'
_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
@staticmethod
diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py
index e973c867c..9f5c237ef 100644
--- a/youtube_dl/extractor/streamable.py
+++ b/youtube_dl/extractor/streamable.py
@@ -65,7 +65,7 @@ class StreamableIE(InfoExtractor):
# to return video info like the title properly sometimes, and doesn't
# include info like the video duration
video = self._download_json(
- 'https://streamable.com/ajax/videos/%s' % video_id, video_id)
+ 'https://ajax.streamable.com/videos/%s' % video_id, video_id)
# Format IDs:
# 0 The video is being uploaded
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index d5abfc9e4..fdcc7d573 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -44,6 +44,10 @@ class TelecincoIE(MiTeleBaseIE):
}, {
'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',
'only_matching': True,
+ }, {
+ # ooyala video
+ 'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py
index 82d73c31d..fafaa826f 100644
--- a/youtube_dl/extractor/telequebec.py
+++ b/youtube_dl/extractor/telequebec.py
@@ -2,15 +2,17 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
int_or_none,
smuggle_url,
+ try_get,
)
class TeleQuebecIE(InfoExtractor):
_VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york',
'md5': 'fe95a0957e5707b1b01f5013e725c90f',
'info_dict': {
@@ -18,10 +20,14 @@ class TeleQuebecIE(InfoExtractor):
'ext': 'mp4',
'title': 'Le couronnement de New York',
'description': 'md5:f5b3d27a689ec6c1486132b2d687d432',
- 'upload_date': '20160220',
- 'timestamp': 1455965438,
+ 'upload_date': '20170201',
+ 'timestamp': 1485972222,
}
- }
+ }, {
+ # no description
+ 'url': 'http://zonevideo.telequebec.tv/media/30261',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
media_id = self._match_id(url)
@@ -31,9 +37,13 @@ class TeleQuebecIE(InfoExtractor):
return {
'_type': 'url_transparent',
'id': media_id,
- 'url': smuggle_url('limelight:media:' + media_data['streamInfo']['sourceId'], {'geo_countries': ['CA']}),
+ 'url': smuggle_url(
+ 'limelight:media:' + media_data['streamInfo']['sourceId'],
+ {'geo_countries': ['CA']}),
'title': media_data['title'],
- 'description': media_data.get('descriptions', [{'text': None}])[0].get('text'),
- 'duration': int_or_none(media_data.get('durationInMilliseconds'), 1000),
+ 'description': try_get(
+ media_data, lambda x: x['descriptions'][0]['text'], compat_str),
+ 'duration': int_or_none(
+ media_data.get('durationInMilliseconds'), 1000),
'ie_key': 'LimelightMedia',
}
diff --git a/youtube_dl/extractor/toongoggles.py b/youtube_dl/extractor/toongoggles.py
new file mode 100644
index 000000000..b5ba1c01d
--- /dev/null
+++ b/youtube_dl/extractor/toongoggles.py
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+)
+
+
+class ToonGogglesIE(InfoExtractor):
+    """Extractor for toongoggles.com show pages (playlists) and episodes."""
+
+    _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?'
+    _TESTS = [{
+        'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football',
+        'md5': '18289fc2b951eff6b953a9d8f01e6831',
+        'info_dict': {
+            'id': '217147',
+            'ext': 'mp4',
+            'title': 'Football',
+            'uploader_id': '1',
+            'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.',
+            'upload_date': '20160718',
+            'timestamp': 1468879330,
+        }
+    }, {
+        'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world',
+        'info_dict': {
+            'id': '227759',
+            'title': 'Om Nom Stories Around The World',
+        },
+        'playlist_mincount': 11,
+    }]
+
+    def _call_api(self, action, page_id, query):
+        """Download JSON from api.toongoggles.com/<action>.
+
+        The fixed API parameters are merged over a copy of *query*, so the
+        caller's dict is left untouched.
+        """
+        api_query = dict(query)
+        api_query.update({
+            'for_ng': 1,
+            'for_web': 1,
+            'show_meta': 1,
+            'version': 7.0,
+        })
+        return self._download_json(
+            'http://api.toongoggles.com/' + action, page_id, query=api_query)
+
+    def _parse_episode_data(self, episode_data):
+        """Map one API episode object to a url_transparent Kaltura result."""
+        title = episode_data['episode_name']
+
+        return {
+            '_type': 'url_transparent',
+            'id': episode_data['episode_id'],
+            'title': title,
+            'url': 'kaltura:513551:' + episode_data['entry_id'],
+            'thumbnail': episode_data.get('thumbnail_url'),
+            'description': episode_data.get('description'),
+            'duration': parse_duration(episode_data.get('hms')),
+            'series': episode_data.get('show_name'),
+            'season_number': int_or_none(episode_data.get('season_num')),
+            'episode_id': episode_data.get('episode_id'),
+            'episode': title,
+            'episode_number': int_or_none(episode_data.get('episode_num')),
+            'categories': episode_data.get('categories'),
+            'ie_key': 'Kaltura',
+        }
+
+    def _real_extract(self, url):
+        """Return one episode result, or a playlist of a show's episodes."""
+        show_id, episode_id = re.match(self._VALID_URL, url).groups()
+        if episode_id:
+            episode_data = self._call_api('search', episode_id, {
+                'filter': 'episode',
+                'id': episode_id,
+            })['objects'][0]
+            return self._parse_episode_data(episode_data)
+
+        show_data = self._call_api('getepisodesbyshow', show_id, {
+            'max': 1000000000,
+            'showid': show_id,
+        })
+        entries = [
+            self._parse_episode_data(episode_data)
+            for episode_data in show_data.get('objects', [])]
+        return self.playlist_result(entries, show_id, show_data.get('show_name'))
diff --git a/youtube_dl/extractor/tunepk.py b/youtube_dl/extractor/tunepk.py
new file mode 100644
index 000000000..9d42651ce
--- /dev/null
+++ b/youtube_dl/extractor/tunepk.py
@@ -0,0 +1,98 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class TunePkIE(InfoExtractor):
+    """Extractor for tune.pk video pages and their embedded player URLs."""
+
+    # The literal '.' and '?' of "embed_player.php?" are escaped so that they
+    # are matched verbatim rather than acting as regex metacharacters.
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?tune\.pk/(?:video/|player/embed_player\.php\?.*?\bvid=)|
+                            embed\.tune\.pk/play/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins',
+        'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55',
+        'info_dict': {
+            'id': '6919541',
+            'ext': 'mp4',
+            'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins',
+            'description': 'md5:eb5a04114fafef5cec90799a93a2d09c',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1487327564,
+            'upload_date': '20170217',
+            'uploader': 'Movie Trailers',
+            'duration': 107,
+            'view_count': int,
+        }
+    }, {
+        'url': 'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no',
+        'only_matching': True,
+    }, {
+        'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the tune.pk video referenced by *url*."""
+        video_id = self._match_id(url)
+
+        # Every accepted URL form is resolved through the /video/<id> page.
+        webpage = self._download_webpage(
+            'https://tune.pk/video/%s' % video_id, video_id)
+
+        # The page hands its configuration to the TunePlayer constructor as
+        # an object literal; that literal is parsed as JSON here.
+        details = self._parse_json(
+            self._search_regex(
+                r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'),
+            video_id)['details']
+
+        video = details['video']
+        title = video.get('title') or self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, 'title', fatal=True)
+
+        formats = self._parse_jwplayer_formats(
+            details['player']['sources'], video_id)
+        self._sort_formats(formats)
+
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'description', webpage, 'description')
+
+        thumbnail = video.get('thumb') or self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+
+        timestamp = unified_timestamp(video.get('date_added'))
+        uploader = try_get(
+            video, lambda x: x['uploader']['name'],
+            compat_str) or self._html_search_meta('author', webpage, 'author')
+
+        duration = int_or_none(video.get('duration'))
+        view_count = int_or_none(video.get('views'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
index f3541b654..7af11659f 100644
--- a/youtube_dl/extractor/twentyfourvideo.py
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
parse_iso8601,
@@ -12,7 +14,7 @@ from ..utils import (
class TwentyFourVideoIE(InfoExtractor):
IE_NAME = '24video'
- _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx|sex|tube)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'
+ _VALID_URL = r'https?://(?P<host>(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.24video.net/video/view/1044982',
@@ -43,10 +45,12 @@ class TwentyFourVideoIE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ host = mobj.group('host')
webpage = self._download_webpage(
- 'http://www.24video.sex/video/view/%s' % video_id, video_id)
+ 'http://%s/video/view/%s' % (host, video_id), video_id)
title = self._og_search_title(webpage)
description = self._html_search_regex(
@@ -72,11 +76,11 @@ class TwentyFourVideoIE(InfoExtractor):
# Sets some cookies
self._download_xml(
- r'http://www.24video.sex/video/xml/%s?mode=init' % video_id,
+ r'http://%s/video/xml/%s?mode=init' % (host, video_id),
video_id, 'Downloading init XML')
video_xml = self._download_xml(
- 'http://www.24video.sex/video/xml/%s?mode=play' % video_id,
+ 'http://%s/video/xml/%s?mode=play' % (host, video_id),
video_id, 'Downloading video XML')
video = xpath_element(video_xml, './/video', 'video', fatal=True)
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index bbba394b0..2daf9dfac 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -12,7 +12,6 @@ from ..compat import (
compat_str,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
- compat_urlparse,
)
from ..utils import (
clean_html,
@@ -24,6 +23,7 @@ from ..utils import (
parse_iso8601,
update_url_query,
urlencode_postdata,
+ urljoin,
)
@@ -32,7 +32,7 @@ class TwitchBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitch.tv'
_USHER_BASE = 'https://usher.ttvnw.net'
- _LOGIN_URL = 'http://www.twitch.tv/login'
+ _LOGIN_URL = 'https://www.twitch.tv/login'
_CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6'
_NETRC_MACHINE = 'twitch'
@@ -64,6 +64,35 @@ class TwitchBaseIE(InfoExtractor):
raise ExtractorError(
'Unable to login. Twitch said: %s' % message, expected=True)
+ def login_step(page, urlh, note, data):
+ form = self._hidden_inputs(page)
+ form.update(data)
+
+ page_url = urlh.geturl()
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
+ 'post url', default=page_url, group='url')
+ post_url = urljoin(page_url, post_url)
+
+ headers = {'Referer': page_url}
+
+ try:
+ response = self._download_json(
+ post_url, None, note,
+ data=urlencode_postdata(form),
+ headers=headers)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ response = self._parse_json(
+ e.cause.read().decode('utf-8'), None)
+ fail(response['message'])
+ raise
+
+ redirect_url = urljoin(post_url, response['redirect'])
+ return self._download_webpage_handle(
+ redirect_url, None, 'Downloading login redirect page',
+ headers=headers)
+
login_page, handle = self._download_webpage_handle(
self._LOGIN_URL, None, 'Downloading login page')
@@ -71,40 +100,19 @@ class TwitchBaseIE(InfoExtractor):
if 'blacklist_message' in login_page:
fail(clean_html(login_page))
- login_form = self._hidden_inputs(login_page)
-
- login_form.update({
- 'username': username,
- 'password': password,
- })
-
- redirect_url = handle.geturl()
-
- post_url = self._search_regex(
- r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
- 'post url', default=redirect_url, group='url')
-
- if not post_url.startswith('http'):
- post_url = compat_urlparse.urljoin(redirect_url, post_url)
-
- headers = {'Referer': redirect_url}
+ redirect_page, handle = login_step(
+ login_page, handle, 'Logging in as %s' % username, {
+ 'username': username,
+ 'password': password,
+ })
- try:
- response = self._download_json(
- post_url, None, 'Logging in as %s' % username,
- data=urlencode_postdata(login_form),
- headers=headers)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
- response = self._parse_json(
- e.cause.read().decode('utf-8'), None)
- fail(response['message'])
- raise
-
- if response.get('redirect'):
- self._download_webpage(
- response['redirect'], None, 'Downloading login redirect page',
- headers=headers)
+ if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None:
+ # TODO: Add mechanism to request an SMS or phone call
+ tfa_token = self._get_tfa_info('two-factor authentication token')
+ login_step(redirect_page, handle, 'Submitting TFA token', {
+ 'authy_token': tfa_token,
+ 'remember_2fa': 'true',
+ })
def _prefer_source(self, formats):
try:
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index d26fb49b3..5086f591e 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -9,7 +9,7 @@ from .common import InfoExtractor
class VierIE(InfoExtractor):
IE_NAME = 'vier'
- _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
_TESTS = [{
'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
'info_dict': {
@@ -24,6 +24,19 @@ class VierIE(InfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
+ 'info_dict': {
+ 'id': '2561614',
+ 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
+ 'ext': 'mp4',
+ 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
+ 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
'only_matching': True,
}, {
@@ -35,6 +48,7 @@ class VierIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
embed_id = mobj.group('embed_id')
display_id = mobj.group('display_id') or embed_id
+ site = mobj.group('site')
webpage = self._download_webpage(url, display_id)
@@ -43,7 +57,7 @@ class VierIE(InfoExtractor):
webpage, 'video id')
application = self._search_regex(
[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
- webpage, 'application', default='vier_vod')
+ webpage, 'application', default=site + '_vod')
filename = self._search_regex(
[r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
webpage, 'filename')
@@ -68,7 +82,7 @@ class VierIE(InfoExtractor):
class VierVideosIE(InfoExtractor):
IE_NAME = 'vier:videos'
- _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
_TESTS = [{
'url': 'http://www.vier.be/demoestuin/videos',
'info_dict': {
@@ -76,6 +90,12 @@ class VierVideosIE(InfoExtractor):
},
'playlist_mincount': 153,
}, {
+ 'url': 'http://www.vijf.be/temptationisland/videos',
+ 'info_dict': {
+ 'id': 'temptationisland',
+ },
+ 'playlist_mincount': 159,
+ }, {
'url': 'http://www.vier.be/demoestuin/videos?page=6',
'info_dict': {
'id': 'demoestuin-page6',
@@ -92,6 +112,7 @@ class VierVideosIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
program = mobj.group('program')
+ site = mobj.group('site')
page_id = mobj.group('page')
if page_id:
@@ -105,13 +126,13 @@ class VierVideosIE(InfoExtractor):
entries = []
for current_page_id in itertools.count(start_page):
current_page = self._download_webpage(
- 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id),
+ 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),
program,
'Downloading page %d' % (current_page_id + 1))
page_entries = [
- self.url_result('http://www.vier.be' + video_url, 'Vier')
+ self.url_result('http://www.' + site + '.be' + video_url, 'Vier')
for video_url in re.findall(
- r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
+ r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
entries.extend(page_entries)
if page_id or '>Meer<' not in current_page:
break
diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py
index 3fd889c8e..db6a65d2e 100644
--- a/youtube_dl/extractor/viu.py
+++ b/youtube_dl/extractor/viu.py
@@ -44,7 +44,7 @@ class ViuBaseIE(InfoExtractor):
class ViuIE(ViuBaseIE):
- _VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'
+ _VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059',
'info_dict': {
@@ -69,6 +69,9 @@ class ViuIE(ViuBaseIE):
'skip_download': 'm3u8 download',
},
'skip': 'Geo-restricted to Indonesia',
+ }, {
+ 'url': 'https://india.viu.com/en/media/1126286865',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 7c42a4f54..dc2719cf9 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -432,8 +432,7 @@ class VKIE(VKBaseIE):
})
elif format_id == 'hls':
formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4',
- entry_protocol='m3u8' if is_live else 'm3u8_native',
+ format_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False, live=is_live))
elif format_id == 'rtmp':
formats.append({
diff --git a/youtube_dl/extractor/vrak.py b/youtube_dl/extractor/vrak.py
new file mode 100644
index 000000000..daa247cce
--- /dev/null
+++ b/youtube_dl/extractor/vrak.py
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveNewIE
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    smuggle_url,
+    unescapeHTML,
+)
+
+
+class VrakIE(InfoExtractor):
+    """Extractor for vrak.tv video pages; playback is delegated to Brightcove."""
+
+    _VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P<id>[\d.]+)'
+    _TEST = {
+        'url': 'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721',
+        'info_dict': {
+            'id': '5345661243001',
+            'ext': 'mp4',
+            'title': 'Obésité, film de hockey et Roseline Filion',
+            'timestamp': 1488492126,
+            'upload_date': '20170302',
+            'uploader_id': '2890187628001',
+            'creator': 'VRAK.TV',
+            'age_limit': 8,
+            'series': 'ALT (Actualité Légèrement Tordue)',
+            'episode': 'Obésité, film de hockey et Roseline Filion',
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        """Return a url_transparent result pointing at the Brightcove player."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # Prefer the on-page videoTitle heading; fall back to the og title.
+        episode_title = self._html_search_regex(
+            r'<h\d\b[^>]+\bclass=["\']videoTitle["\'][^>]*>([^<]+)',
+            webpage, 'title', default=None)
+        if not episode_title:
+            episode_title = self._og_search_title(webpage)
+
+        # Player options are stored HTML-escaped in a data attribute.
+        raw_options = self._search_regex(
+            r'data-player-options-content=(["\'])(?P<content>{.+?})\1',
+            webpage, 'content', default='{}', group='content')
+        options = self._parse_json(
+            raw_options, video_id, transform_source=unescapeHTML)
+
+        ref_id = options.get('refId')
+        if not ref_id:
+            ref_id = self._search_regex(
+                r'refId&quot;:&quot;([^&]+)&quot;', webpage, 'ref id')
+
+        brightcove_id_re = r'''(?x)
+                java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s
+                [^>]*
+                java\.lang\.String\s+value\s*=\s*["'](\d+)
+            ''' % re.escape(ref_id)
+        brightcove_id = self._search_regex(
+            brightcove_id_re, webpage, 'brightcove id')
+
+        # falling back to episodeName for the series is intentional
+        series = options.get('showName') or options.get('episodeName')
+        target_url = smuggle_url(
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+            {'geo_countries': ['CA']})
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': BrightcoveNewIE.ie_key(),
+            'url': target_url,
+            'id': brightcove_id,
+            'description': options.get('description'),
+            'creator': options.get('brand'),
+            'age_limit': parse_age_limit(options.get('rating')),
+            'series': series,
+            'season_number': int_or_none(options.get('seasonNumber')),
+            'episode': episode_title,
+            'episode_number': int_or_none(options.get('episodeNumber')),
+            'tags': options.get('tags', []),
+        }
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index f7e6360a3..8bb7362bb 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -19,9 +19,10 @@ class WDRBaseIE(InfoExtractor):
def _extract_wdr_video(self, webpage, display_id):
# for wdr.de the data-extension is in a tag with the class "mediaLink"
# for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
- # for wdrmaus its in a link to the page in a multiline "videoLink"-tag
+ # for wdrmaus, in a tag with the class "videoButton" (previously a link
+ # to the page in a multiline "videoLink"-tag)
json_metadata = self._html_search_regex(
- r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
+ r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
webpage, 'media link', default=None, flags=re.MULTILINE)
if not json_metadata:
@@ -32,7 +33,7 @@ class WDRBaseIE(InfoExtractor):
jsonp_url = media_link_obj['mediaObj']['url']
metadata = self._download_json(
- jsonp_url, 'metadata', transform_source=strip_jsonp)
+ jsonp_url, display_id, transform_source=strip_jsonp)
metadata_tracker_data = metadata['trackerData']
metadata_media_resource = metadata['mediaResource']
@@ -161,23 +162,23 @@ class WDRIE(WDRBaseIE):
{
'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
'info_dict': {
- 'id': 'mdb-1096487',
- 'ext': 'flv',
+ 'id': 'mdb-1323501',
+ 'ext': 'mp4',
'upload_date': 're:^[0-9]{8}$',
'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
- 'description': '- Die Sendung mit der Maus -',
+ 'description': 'Die Seite mit der Maus -',
},
'skip': 'The id changes from week to week because of the new episode'
},
{
- 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
+ 'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5',
'md5': '803138901f6368ee497b4d195bb164f2',
'info_dict': {
'id': 'mdb-186083',
'ext': 'mp4',
'upload_date': '20130919',
'title': 'Sachgeschichte - Achterbahn ',
- 'description': '- Die Sendung mit der Maus -',
+ 'description': 'Die Seite mit der Maus -',
},
},
{
@@ -186,7 +187,7 @@ class WDRIE(WDRBaseIE):
'info_dict': {
'id': 'mdb-869971',
'ext': 'flv',
- 'title': 'Funkhaus Europa Livestream',
+ 'title': 'COSMO Livestream',
'description': 'md5:2309992a6716c347891c045be50992e4',
'upload_date': '20160101',
},
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 81c793921..ca40de522 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -59,6 +59,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
+ _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}'
+
def _set_language(self):
self._set_cookie(
'.youtube.com', 'PREF', 'f1=50000000&hl=en',
@@ -265,9 +267,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
- (?!.*?\blist=) # combined list/video URLs are handled by the playlist IE
+ (?!.*?\blist=
+ (?:
+ %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
+ WL # WL are handled by the watch later IE
+ )
+ )
(?(1).+)? # if we found the ID, everything can follow
- $"""
+ $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
_formats = {
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
@@ -924,6 +931,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'sJL6WA-aGkQ',
'only_matching': True,
},
+ {
+ 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
+ 'only_matching': True,
+ },
]
def __init__(self, *args, **kwargs):
@@ -1454,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
- raise ExtractorError('"rental" videos not supported')
+ raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
# Start extracting information
self.report_information_extraction(video_id)
@@ -1864,8 +1875,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
)
.*
|
- ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,})
- )"""
+ (%(playlist_id)s)
+ )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
IE_NAME = 'youtube:playlist'