aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r--youtube_dl/extractor/generic.py168
1 files changed, 132 insertions, 36 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 95d233259..b4138381d 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -51,7 +51,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE
from .vimeo import VimeoIE
from .dailymotion import DailymotionCloudIE
from .onionstudios import OnionStudiosIE
-from .snagfilms import SnagFilmsEmbedIE
+from .viewlift import ViewLiftEmbedIE
from .screenwavemedia import ScreenwaveMediaIE
from .mtv import MTVServicesEmbeddedIE
from .pladform import PladformIE
@@ -61,6 +61,8 @@ from .jwplatform import JWPlatformIE
from .digiteka import DigitekaIE
from .instagram import InstagramIE
from .liveleak import LiveLeakIE
+from .threeqsdn import ThreeQSDNIE
+from .theplatform import ThePlatformIE
class GenericIE(InfoExtractor):
@@ -237,6 +239,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'title': 'car-20120827-manifest',
'formats': 'mincount:9',
+ 'upload_date': '20130904',
},
'params': {
'format': 'bestvideo',
@@ -596,7 +599,11 @@ class GenericIE(InfoExtractor):
'id': 'k2mm4bCdJ6CQ2i7c8o2',
'ext': 'mp4',
'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
+ 'description': 'md5:faf028e48a461b8b7fad38f1e104b119',
'uploader': 'Spi0n',
+ 'uploader_id': 'xgditw',
+ 'upload_date': '20140425',
+ 'timestamp': 1398441542,
},
'add_ie': ['Dailymotion'],
},
@@ -711,15 +718,18 @@ class GenericIE(InfoExtractor):
},
# Wistia embed
{
- 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
- 'md5': '8788b683c777a5cf25621eaf286d0c23',
+ 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+ 'md5': '1953f3a698ab51cfc948ed3992a0b7ff',
'info_dict': {
- 'id': '1cfaf6b7ea',
+ 'id': '6e2wtrbdaf',
'ext': 'mov',
- 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
- 'duration': 643.0,
- 'filesize': 182808282,
- 'uploader': 'education-portal.com',
+ 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
+ 'description': 'a Paywall Videos video from Remilon',
+ 'duration': 644.072,
+ 'uploader': 'study.com',
+ 'timestamp': 1459678540,
+ 'upload_date': '20160403',
+ 'filesize': 24687186,
},
},
{
@@ -728,11 +738,30 @@ class GenericIE(InfoExtractor):
'info_dict': {
'id': 'uxjb0lwrcz',
'ext': 'mp4',
- 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+ 'title': 'Conversation about Hexagonal Rails Part 1',
+ 'description': 'a Martin Fowler video from ThoughtWorks',
'duration': 1715.0,
'uploader': 'thoughtworks.wistia.com',
+ 'timestamp': 1401832161,
+ 'upload_date': '20140603',
},
},
+ # Wistia standard embed (async)
+ {
+ 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
+ 'info_dict': {
+ 'id': '807fafadvk',
+ 'ext': 'mp4',
+ 'title': 'Drip Brennan Dunn Workshop',
+ 'description': 'a JV Webinars video from getdrip-1',
+ 'duration': 4986.95,
+ 'timestamp': 1463607249,
+ 'upload_date': '20160518',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ },
# Soundcloud embed
{
'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -755,6 +784,19 @@ class GenericIE(InfoExtractor):
'title': 'Rosetta #CometLanding webcast HL 10',
}
},
+ # Another Livestream embed, without 'new.' in URL
+ {
+ 'url': 'https://www.freespeech.org/',
+ 'info_dict': {
+ 'id': '123537347',
+ 'ext': 'mp4',
+ 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ },
+ 'params': {
+ # Live stream
+ 'skip_download': True,
+ },
+ },
# LazyYT
{
'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
@@ -839,18 +881,6 @@ class GenericIE(InfoExtractor):
'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
}
},
- # Kaltura embed
- {
- 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
- 'info_dict': {
- 'id': '1_eergr3h1',
- 'ext': 'mp4',
- 'upload_date': '20150226',
- 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
- 'timestamp': int,
- 'title': 'John Carlson Postgame 2/25/15',
- },
- },
# Kaltura embed (different embed code)
{
'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
@@ -876,9 +906,23 @@ class GenericIE(InfoExtractor):
'uploader_id': 'echojecka',
},
},
+ # Kaltura embed with single quotes
+ {
+ 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY',
+ 'info_dict': {
+ 'id': '0_izeg5utt',
+ 'ext': 'mp4',
+ 'title': '35871',
+ 'timestamp': 1355743100,
+ 'upload_date': '20121217',
+ 'uploader_id': 'batchUser',
+ },
+ 'add_ie': ['Kaltura'],
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+ # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
'info_dict': {
'id': '227304',
'ext': 'mp4',
@@ -893,6 +937,7 @@ class GenericIE(InfoExtractor):
# ClipYou (Eagle.Platform) embed (custom URL)
{
'url': 'http://muz-tv.ru/play/7129/',
+ # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
'info_dict': {
'id': '12820',
'ext': 'mp4',
@@ -981,18 +1026,25 @@ class GenericIE(InfoExtractor):
'ext': 'flv',
'title': "PFT Live: New leader in the 'new-look' defense",
'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+ 'uploader': 'NBCU-SPORTS',
+ 'upload_date': '20140107',
+ 'timestamp': 1389118457,
},
},
# UDN embed
{
- 'url': 'http://www.udn.com/news/story/7314/822787',
+ 'url': 'https://video.udn.com/news/300346',
'md5': 'fd2060e988c326991037b9aff9df21a6',
'info_dict': {
'id': '300346',
'ext': 'mp4',
'title': '中一中男師變性 全校師生力挺',
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
# Ooyala embed
{
@@ -1033,6 +1085,9 @@ class GenericIE(InfoExtractor):
'title': 'SN Presents: Russell Martin, World Citizen',
'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
'uploader': 'Rogers Sportsnet',
+ 'uploader_id': '1704050871',
+ 'upload_date': '20150525',
+ 'timestamp': 1432570283,
},
},
# Dailymotion Cloud video
@@ -1124,6 +1179,9 @@ class GenericIE(InfoExtractor):
'title': 'The Cardinal Pell Interview',
'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
'uploader': 'GlobeCast Australia - GlobeStream',
+ 'uploader_id': '2733773828001',
+ 'upload_date': '20160304',
+ 'timestamp': 1457083087,
},
'params': {
# m3u8 downloads
@@ -1154,6 +1212,16 @@ class GenericIE(InfoExtractor):
'uploader': 'Lake8737',
}
},
+ # Duplicated embedded video URLs
+ {
+ 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
+ 'info_dict': {
+ 'id': '149298443_480_16c25b74_2',
+ 'ext': 'mp4',
+ 'title': 'vs. Blue Orange Spring Game',
+ 'uploader': 'www.hudl.com',
+ },
+ },
]
def report_following_redirect(self, new_url):
@@ -1408,7 +1476,8 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- video_title = self._html_search_regex(
+ video_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
r'(?s)<title>(.*?)</title>', webpage, 'video title',
default='video')
@@ -1426,6 +1495,9 @@ class GenericIE(InfoExtractor):
video_uploader = self._search_regex(
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+ video_description = self._og_search_description(webpage, default=None)
+ video_thumbnail = self._og_search_thumbnail(webpage, default=None)
+
# Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
@@ -1456,6 +1528,11 @@ class GenericIE(InfoExtractor):
if bc_urls:
return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+ # Look for ThePlatform embeds
+ tp_urls = ThePlatformIE._extract_urls(webpage)
+ if tp_urls:
+ return _playlist_from_matches(tp_urls, ie='ThePlatform')
+
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
@@ -1524,21 +1601,26 @@ class GenericIE(InfoExtractor):
'url': embed_url,
'ie_key': 'Wistia',
'uploader': video_uploader,
- 'title': video_title,
- 'id': video_id,
}
match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
'_type': 'url_transparent',
- 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
+ 'url': 'wistia:%s' % match.group('id'),
'ie_key': 'Wistia',
'uploader': video_uploader,
- 'title': video_title,
- 'id': match.group('id')
}
+ match = re.search(
+ r'''(?sx)
+ <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+ ''', webpage)
+ if match:
+ return self.url_result(self._proto_relative_url(
+ 'wistia:%s' % match.group('id')), 'Wistia')
+
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
@@ -1814,7 +1896,7 @@ class GenericIE(InfoExtractor):
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
mobj = re.search(
- r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
+ r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Livestream')
@@ -1826,7 +1908,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or
re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
if mobj is not None:
return self.url_result(smuggle_url(
@@ -1905,10 +1987,10 @@ class GenericIE(InfoExtractor):
if onionstudios_url:
return self.url_result(onionstudios_url)
- # Look for SnagFilms embeds
- snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
- if snagfilms_url:
- return self.url_result(snagfilms_url)
+ # Look for ViewLift embeds
+ viewlift_url = ViewLiftEmbedIE._extract_url(webpage)
+ if viewlift_url:
+ return self.url_result(viewlift_url)
# Look for JWPlatform embeds
jwplatform_url = JWPlatformIE._extract_url(webpage)
@@ -1964,6 +2046,19 @@ class GenericIE(InfoExtractor):
if liveleak_url:
return self.url_result(liveleak_url, 'LiveLeak')
+ # Look for 3Q SDN embeds
+ threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
+ if threeqsdn_url:
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': ThreeQSDNIE.ie_key(),
+ 'url': self._proto_relative_url(threeqsdn_url),
+ 'title': video_title,
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'uploader': video_uploader,
+ }
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
@@ -2044,7 +2139,8 @@ class GenericIE(InfoExtractor):
raise UnsupportedError(url)
entries = []
- for video_url in found:
+ for video_url in orderedSet(found):
+ video_url = unescapeHTML(video_url)
video_url = video_url.replace('\\/', '/')
video_url = compat_urlparse.urljoin(url, video_url)
video_id = compat_urllib_parse_unquote(os.path.basename(video_url))