aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r--youtube_dl/extractor/generic.py140
1 files changed, 77 insertions, 63 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 67184bc5d..c108d4a8a 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -86,6 +86,10 @@ from .openload import OpenloadIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
+from .wistia import WistiaIE
+from .mediaset import MediasetIE
class GenericIE(InfoExtractor):
@@ -1427,6 +1431,22 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ # Brightcove embed with whitespace around attribute names
+ 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+ 'info_dict': {
+ 'id': '3167554373001',
+ 'ext': 'mp4',
+ 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+ 'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+ 'uploader_id': '1079349493',
+ 'upload_date': '20140207',
+ 'timestamp': 1391810548,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Another form of arte.tv embed
{
'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
@@ -1677,6 +1697,42 @@ class GenericIE(InfoExtractor):
},
'playlist_mincount': 5,
},
+ {
+ 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+ 'info_dict': {
+ 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+ 'title': 'Standoff with Walnut Creek murder suspect ends',
+ 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+ },
+ 'playlist_mincount': 4,
+ },
+ {
+ # WashingtonPost embed
+ 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+ 'info_dict': {
+ 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+ 'ext': 'mp4',
+ 'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+ 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+ 'timestamp': 1455216756,
+ 'uploader': 'The Washington Post',
+ 'upload_date': '20160211',
+ },
+ 'add_ie': [WashingtonPostIE.ie_key()],
+ },
+ {
+ # Mediaset embed
+ 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+ 'info_dict': {
+ 'id': '720642',
+ 'ext': 'mp4',
+ 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [MediasetIE.ie_key()],
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -2070,57 +2126,20 @@ class GenericIE(InfoExtractor):
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
- match = re.search(
- r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
- if match:
- embed_url = self._proto_relative_url(
- unescapeHTML(match.group('url')))
- return {
- '_type': 'url_transparent',
- 'url': embed_url,
- 'ie_key': 'Wistia',
- 'uploader': video_uploader,
- }
-
- match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
- if match:
+ wistia_url = WistiaIE._extract_url(webpage)
+ if wistia_url:
return {
'_type': 'url_transparent',
- 'url': 'wistia:%s' % match.group('id'),
- 'ie_key': 'Wistia',
+ 'url': self._proto_relative_url(wistia_url),
+ 'ie_key': WistiaIE.ie_key(),
'uploader': video_uploader,
}
- match = re.search(
- r'''(?sx)
- <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
- ''', webpage)
- if match:
- return self.url_result(self._proto_relative_url(
- 'wistia:%s' % match.group('id')), 'Wistia')
-
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(svt_url, 'SVT')
- # Look for embedded condenast player
- matches = re.findall(
- r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
- webpage)
- if matches:
- return {
- '_type': 'playlist',
- 'entries': [{
- '_type': 'url',
- 'ie_key': 'CondeNast',
- 'url': ma,
- } for ma in matches],
- 'title': video_title,
- 'id': video_id,
- }
-
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@@ -2514,28 +2533,11 @@ class GenericIE(InfoExtractor):
return self.playlist_result(
limelight_urls, video_id, video_title, video_description)
- mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
- if mobj:
- lm = {
- 'Media': 'media',
- 'Channel': 'channel',
- 'ChannelList': 'channel_list',
- }
- return self.url_result(smuggle_url('limelight:%s:%s' % (
- lm[mobj.group(1)], mobj.group(2)), {'source_url': url}),
- 'Limelight%s' % mobj.group(1), mobj.group(2))
-
- mobj = re.search(
- r'''(?sx)
- <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*?
- <param[^>]+
- name=(["\'])flashVars\2[^>]+
- value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
- ''', webpage)
- if mobj:
- return self.url_result(smuggle_url(
- 'limelight:media:%s' % mobj.group('id'),
- {'source_url': url}), 'LimelightMedia', mobj.group('id'))
+ # Look for Anvato embeds
+ anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+ if anvato_urls:
+ return self.playlist_result(
+ anvato_urls, video_id, video_title, video_description)
# Look for AdobeTVVideo embeds
mobj = re.search(
@@ -2654,6 +2656,18 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key())
+ # Look for WashingtonPost embeds
+ wapo_urls = WashingtonPostIE._extract_urls(webpage)
+ if wapo_urls:
+ return self.playlist_from_matches(
+ wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
+ # Look for Mediaset embeds
+ mediaset_urls = MediasetIE._extract_urls(webpage)
+ if mediaset_urls:
+ return self.playlist_from_matches(
+ mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')